diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2711810543175663, + "eval_steps": 500, + "global_step": 200000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006355905271587832, + "grad_norm": 3.609375, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.3296, + "step": 100 + }, + { + "epoch": 0.0012711810543175664, + "grad_norm": 3.484375, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.3266, + "step": 200 + }, + { + "epoch": 0.0019067715814763497, + "grad_norm": 3.625, + "learning_rate": 6.000000000000001e-07, + "loss": 2.3437, + "step": 300 + }, + { + "epoch": 0.002542362108635133, + "grad_norm": 3.640625, + "learning_rate": 8.000000000000001e-07, + "loss": 2.3306, + "step": 400 + }, + { + "epoch": 0.003177952635793916, + "grad_norm": 3.25, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.3232, + "step": 500 + }, + { + "epoch": 0.0038135431629526995, + "grad_norm": 4.15625, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.3369, + "step": 600 + }, + { + "epoch": 0.004449133690111483, + "grad_norm": 4.125, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.3436, + "step": 700 + }, + { + "epoch": 0.005084724217270266, + "grad_norm": 2.6875, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.3334, + "step": 800 + }, + { + "epoch": 0.0057203147444290494, + "grad_norm": 3.484375, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.3337, + "step": 900 + }, + { + "epoch": 0.006355905271587832, + "grad_norm": 4.65625, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.3052, + "step": 1000 + }, + { + "epoch": 0.006991495798746615, + "grad_norm": 3.453125, + "learning_rate": 2.2e-06, + "loss": 2.3237, + "step": 1100 + }, + { + "epoch": 0.007627086325905399, + "grad_norm": 3.390625, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.3015, + "step": 1200 + }, + { + "epoch": 0.008262676853064182, + "grad_norm": 2.8125, + "learning_rate": 2.6e-06, + "loss": 2.3166, + "step": 1300 + }, + { + "epoch": 0.008898267380222966, + "grad_norm": 3.484375, + "learning_rate": 2.8000000000000003e-06, + "loss": 2.2848, + "step": 1400 + }, + { + "epoch": 0.009533857907381748, + "grad_norm": 2.453125, + "learning_rate": 3e-06, + "loss": 2.2821, + "step": 1500 + }, + { + "epoch": 0.010169448434540531, + "grad_norm": 2.96875, + "learning_rate": 3.2000000000000003e-06, + "loss": 2.2781, + "step": 1600 + }, + { + "epoch": 0.010805038961699315, + "grad_norm": 3.0, + "learning_rate": 3.4000000000000005e-06, + "loss": 2.2595, + "step": 1700 + }, + { + "epoch": 0.011440629488858099, + "grad_norm": 2.28125, + "learning_rate": 3.6000000000000003e-06, + "loss": 2.2409, + "step": 1800 + }, + { + "epoch": 0.012076220016016881, + "grad_norm": 2.9375, + "learning_rate": 3.8000000000000005e-06, + "loss": 2.2393, + "step": 1900 + }, + { + "epoch": 0.012711810543175665, + "grad_norm": 2.34375, + "learning_rate": 4.000000000000001e-06, + "loss": 2.2361, + "step": 2000 + }, + { + "epoch": 0.013347401070334448, + "grad_norm": 2.234375, + "learning_rate": 4.2000000000000004e-06, + "loss": 2.2238, + "step": 2100 + }, + { + "epoch": 0.01398299159749323, + "grad_norm": 2.5625, + "learning_rate": 4.4e-06, + "loss": 2.2031, + "step": 2200 + }, + { + "epoch": 0.014618582124652014, + "grad_norm": 2.65625, + "learning_rate": 4.600000000000001e-06, + "loss": 2.1762, + "step": 2300 + }, + { + "epoch": 0.015254172651810798, + "grad_norm": 2.484375, + "learning_rate": 4.800000000000001e-06, + "loss": 2.1772, + "step": 2400 + }, + { + "epoch": 0.01588976317896958, + "grad_norm": 2.265625, + "learning_rate": 5e-06, + "loss": 2.147, + "step": 2500 + }, + { + "epoch": 0.016525353706128364, + "grad_norm": 2.390625, + "learning_rate": 5.2e-06, + "loss": 2.1295, + "step": 2600 + }, + { + "epoch": 0.017160944233287147, + "grad_norm": 2.28125, + "learning_rate": 5.400000000000001e-06, + "loss": 2.0829, + "step": 2700 + }, + { + "epoch": 0.01779653476044593, + "grad_norm": 2.734375, + "learning_rate": 5.600000000000001e-06, + "loss": 2.082, + "step": 2800 + }, + { + "epoch": 0.018432125287604715, + "grad_norm": 2.78125, + "learning_rate": 5.8e-06, + "loss": 2.02, + "step": 2900 + }, + { + "epoch": 0.019067715814763495, + "grad_norm": 3.53125, + "learning_rate": 6e-06, + "loss": 2.003, + "step": 3000 + }, + { + "epoch": 0.01970330634192228, + "grad_norm": 2.203125, + "learning_rate": 6.200000000000001e-06, + "loss": 1.9711, + "step": 3100 + }, + { + "epoch": 0.020338896869081063, + "grad_norm": 2.328125, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.9499, + "step": 3200 + }, + { + "epoch": 0.020974487396239846, + "grad_norm": 1.703125, + "learning_rate": 6.600000000000001e-06, + "loss": 1.9166, + "step": 3300 + }, + { + "epoch": 0.02161007792339863, + "grad_norm": 1.5703125, + "learning_rate": 6.800000000000001e-06, + "loss": 1.9366, + "step": 3400 + }, + { + "epoch": 0.022245668450557414, + "grad_norm": 1.015625, + "learning_rate": 7e-06, + "loss": 1.9282, + "step": 3500 + }, + { + "epoch": 0.022881258977716198, + "grad_norm": 1.265625, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.9043, + "step": 3600 + }, + { + "epoch": 0.023516849504874978, + "grad_norm": 1.421875, + "learning_rate": 7.4e-06, + "loss": 1.8715, + "step": 3700 + }, + { + "epoch": 0.024152440032033762, + "grad_norm": 1.0234375, + "learning_rate": 7.600000000000001e-06, + "loss": 1.8862, + "step": 3800 + }, + { + "epoch": 0.024788030559192546, + "grad_norm": 1.078125, + "learning_rate": 7.800000000000002e-06, + "loss": 1.8584, + "step": 3900 + }, + { + "epoch": 0.02542362108635133, + "grad_norm": 0.83203125, + "learning_rate": 8.000000000000001e-06, + "loss": 1.8515, + "step": 4000 + }, + { + "epoch": 0.026059211613510113, + "grad_norm": 0.8671875, + "learning_rate": 8.2e-06, + "loss": 1.8317, + "step": 4100 + }, + { + "epoch": 0.026694802140668897, + "grad_norm": 1.0390625, + "learning_rate": 8.400000000000001e-06, + "loss": 1.8488, + "step": 4200 + }, + { + "epoch": 0.027330392667827677, + "grad_norm": 0.84765625, + "learning_rate": 8.6e-06, + "loss": 1.8256, + "step": 4300 + }, + { + "epoch": 0.02796598319498646, + "grad_norm": 0.81640625, + "learning_rate": 8.8e-06, + "loss": 1.8079, + "step": 4400 + }, + { + "epoch": 0.028601573722145245, + "grad_norm": 0.7109375, + "learning_rate": 9e-06, + "loss": 1.8301, + "step": 4500 + }, + { + "epoch": 0.02923716424930403, + "grad_norm": 0.80078125, + "learning_rate": 9.200000000000002e-06, + "loss": 1.8207, + "step": 4600 + }, + { + "epoch": 0.029872754776462812, + "grad_norm": 0.73046875, + "learning_rate": 9.4e-06, + "loss": 1.8153, + "step": 4700 + }, + { + "epoch": 0.030508345303621596, + "grad_norm": 1.234375, + "learning_rate": 9.600000000000001e-06, + "loss": 1.7974, + "step": 4800 + }, + { + "epoch": 0.03114393583078038, + "grad_norm": 1.03125, + "learning_rate": 9.800000000000001e-06, + "loss": 1.7669, + "step": 4900 + }, + { + "epoch": 0.03177952635793916, + "grad_norm": 0.94921875, + "learning_rate": 1e-05, + "loss": 1.7712, + "step": 5000 + }, + { + "epoch": 0.03241511688509795, + "grad_norm": 0.5546875, + "learning_rate": 9.994871794871795e-06, + "loss": 1.7679, + "step": 5100 + }, + { + "epoch": 0.03305070741225673, + "grad_norm": 0.68359375, + "learning_rate": 9.98974358974359e-06, + "loss": 1.7683, + "step": 5200 + }, + { + "epoch": 0.03368629793941551, + "grad_norm": 1.734375, + "learning_rate": 9.984615384615386e-06, + "loss": 1.777, + "step": 5300 + }, + { + "epoch": 0.034321888466574295, + "grad_norm": 0.89453125, + "learning_rate": 9.97948717948718e-06, + "loss": 1.7545, + "step": 5400 + }, + { + "epoch": 0.034957478993733075, + "grad_norm": 0.4765625, + "learning_rate": 9.974358974358974e-06, + "loss": 1.7718, + "step": 5500 + }, + { + "epoch": 0.03559306952089186, + "grad_norm": 0.76953125, + "learning_rate": 9.96923076923077e-06, + "loss": 1.7687, + "step": 5600 + }, + { + "epoch": 0.03622866004805064, + "grad_norm": 0.77734375, + "learning_rate": 9.964102564102564e-06, + "loss": 1.7664, + "step": 5700 + }, + { + "epoch": 0.03686425057520943, + "grad_norm": 0.7265625, + "learning_rate": 9.95897435897436e-06, + "loss": 1.768, + "step": 5800 + }, + { + "epoch": 0.03749984110236821, + "grad_norm": 1.359375, + "learning_rate": 9.953846153846156e-06, + "loss": 1.7759, + "step": 5900 + }, + { + "epoch": 0.03813543162952699, + "grad_norm": 0.83984375, + "learning_rate": 9.94871794871795e-06, + "loss": 1.7379, + "step": 6000 + }, + { + "epoch": 0.03877102215668578, + "grad_norm": 0.62109375, + "learning_rate": 9.943589743589744e-06, + "loss": 1.7611, + "step": 6100 + }, + { + "epoch": 0.03940661268384456, + "grad_norm": 0.53515625, + "learning_rate": 9.93846153846154e-06, + "loss": 1.7453, + "step": 6200 + }, + { + "epoch": 0.040042203211003345, + "grad_norm": 0.51953125, + "learning_rate": 9.933333333333334e-06, + "loss": 1.7493, + "step": 6300 + }, + { + "epoch": 0.040677793738162125, + "grad_norm": 0.7265625, + "learning_rate": 9.92820512820513e-06, + "loss": 1.7308, + "step": 6400 + }, + { + "epoch": 0.04131338426532091, + "grad_norm": 0.5078125, + "learning_rate": 9.923076923076923e-06, + "loss": 1.7557, + "step": 6500 + }, + { + "epoch": 0.04194897479247969, + "grad_norm": 0.76171875, + "learning_rate": 9.91794871794872e-06, + "loss": 1.7244, + "step": 6600 + }, + { + "epoch": 0.04258456531963847, + "grad_norm": 0.93359375, + "learning_rate": 9.912820512820513e-06, + "loss": 1.7365, + "step": 6700 + }, + { + "epoch": 0.04322015584679726, + "grad_norm": 0.6796875, + "learning_rate": 9.907692307692309e-06, + "loss": 1.7416, + "step": 6800 + }, + { + "epoch": 0.04385574637395604, + "grad_norm": 0.8671875, + "learning_rate": 9.902564102564103e-06, + "loss": 1.744, + "step": 6900 + }, + { + "epoch": 0.04449133690111483, + "grad_norm": 0.40234375, + "learning_rate": 9.897435897435899e-06, + "loss": 1.7343, + "step": 7000 + }, + { + "epoch": 0.04512692742827361, + "grad_norm": 0.640625, + "learning_rate": 9.892307692307693e-06, + "loss": 1.7476, + "step": 7100 + }, + { + "epoch": 0.045762517955432395, + "grad_norm": 0.78515625, + "learning_rate": 9.887179487179489e-06, + "loss": 1.7382, + "step": 7200 + }, + { + "epoch": 0.046398108482591176, + "grad_norm": 0.64453125, + "learning_rate": 9.882051282051283e-06, + "loss": 1.7376, + "step": 7300 + }, + { + "epoch": 0.047033699009749956, + "grad_norm": 0.546875, + "learning_rate": 9.876923076923077e-06, + "loss": 1.73, + "step": 7400 + }, + { + "epoch": 0.04766928953690874, + "grad_norm": 0.875, + "learning_rate": 9.871794871794872e-06, + "loss": 1.7139, + "step": 7500 + }, + { + "epoch": 0.048304880064067524, + "grad_norm": 0.8125, + "learning_rate": 9.866666666666668e-06, + "loss": 1.7271, + "step": 7600 + }, + { + "epoch": 0.04894047059122631, + "grad_norm": 0.8125, + "learning_rate": 9.861538461538462e-06, + "loss": 1.728, + "step": 7700 + }, + { + "epoch": 0.04957606111838509, + "grad_norm": 1.078125, + "learning_rate": 9.856410256410256e-06, + "loss": 1.7238, + "step": 7800 + }, + { + "epoch": 0.05021165164554388, + "grad_norm": 0.703125, + "learning_rate": 9.851282051282052e-06, + "loss": 1.7245, + "step": 7900 + }, + { + "epoch": 0.05084724217270266, + "grad_norm": 0.6484375, + "learning_rate": 9.846153846153848e-06, + "loss": 1.7213, + "step": 8000 + }, + { + "epoch": 0.05148283269986144, + "grad_norm": 1.046875, + "learning_rate": 9.841025641025642e-06, + "loss": 1.7318, + "step": 8100 + }, + { + "epoch": 0.052118423227020226, + "grad_norm": 0.4375, + "learning_rate": 9.835897435897438e-06, + "loss": 1.7245, + "step": 8200 + }, + { + "epoch": 0.052754013754179006, + "grad_norm": 0.6015625, + "learning_rate": 9.830769230769232e-06, + "loss": 1.7085, + "step": 8300 + }, + { + "epoch": 0.053389604281337794, + "grad_norm": 0.5625, + "learning_rate": 9.825641025641026e-06, + "loss": 1.714, + "step": 8400 + }, + { + "epoch": 0.054025194808496574, + "grad_norm": 0.56640625, + "learning_rate": 9.820512820512821e-06, + "loss": 1.7158, + "step": 8500 + }, + { + "epoch": 0.054660785335655354, + "grad_norm": 0.6796875, + "learning_rate": 9.815384615384617e-06, + "loss": 1.7205, + "step": 8600 + }, + { + "epoch": 0.05529637586281414, + "grad_norm": 0.85546875, + "learning_rate": 9.810256410256411e-06, + "loss": 1.72, + "step": 8700 + }, + { + "epoch": 0.05593196638997292, + "grad_norm": 1.0546875, + "learning_rate": 9.805128205128205e-06, + "loss": 1.7039, + "step": 8800 + }, + { + "epoch": 0.05656755691713171, + "grad_norm": 0.8671875, + "learning_rate": 9.800000000000001e-06, + "loss": 1.7238, + "step": 8900 + }, + { + "epoch": 0.05720314744429049, + "grad_norm": 0.78125, + "learning_rate": 9.794871794871795e-06, + "loss": 1.7165, + "step": 9000 + }, + { + "epoch": 0.057838737971449276, + "grad_norm": 0.65625, + "learning_rate": 9.78974358974359e-06, + "loss": 1.7248, + "step": 9100 + }, + { + "epoch": 0.05847432849860806, + "grad_norm": 0.6328125, + "learning_rate": 9.784615384615387e-06, + "loss": 1.7327, + "step": 9200 + }, + { + "epoch": 0.05910991902576684, + "grad_norm": 0.5546875, + "learning_rate": 9.77948717948718e-06, + "loss": 1.726, + "step": 9300 + }, + { + "epoch": 0.059745509552925624, + "grad_norm": 0.78125, + "learning_rate": 9.774358974358975e-06, + "loss": 1.7145, + "step": 9400 + }, + { + "epoch": 0.060381100080084404, + "grad_norm": 0.65234375, + "learning_rate": 9.76923076923077e-06, + "loss": 1.7061, + "step": 9500 + }, + { + "epoch": 0.06101669060724319, + "grad_norm": 0.703125, + "learning_rate": 9.764102564102564e-06, + "loss": 1.7186, + "step": 9600 + }, + { + "epoch": 0.06165228113440197, + "grad_norm": 0.8203125, + "learning_rate": 9.75897435897436e-06, + "loss": 1.7153, + "step": 9700 + }, + { + "epoch": 0.06228787166156076, + "grad_norm": 1.0859375, + "learning_rate": 9.753846153846154e-06, + "loss": 1.7038, + "step": 9800 + }, + { + "epoch": 0.06292346218871954, + "grad_norm": 0.8984375, + "learning_rate": 9.74871794871795e-06, + "loss": 1.7135, + "step": 9900 + }, + { + "epoch": 0.06355905271587832, + "grad_norm": 0.6328125, + "learning_rate": 9.743589743589744e-06, + "loss": 1.6987, + "step": 10000 + }, + { + "epoch": 0.0641946432430371, + "grad_norm": 0.6875, + "learning_rate": 9.738461538461538e-06, + "loss": 1.6867, + "step": 10100 + }, + { + "epoch": 0.0648302337701959, + "grad_norm": 0.75390625, + "learning_rate": 9.733333333333334e-06, + "loss": 1.7, + "step": 10200 + }, + { + "epoch": 0.06546582429735467, + "grad_norm": 0.74609375, + "learning_rate": 9.72820512820513e-06, + "loss": 1.7057, + "step": 10300 + }, + { + "epoch": 0.06610141482451345, + "grad_norm": 0.65234375, + "learning_rate": 9.723076923076924e-06, + "loss": 1.7206, + "step": 10400 + }, + { + "epoch": 0.06673700535167224, + "grad_norm": 0.7734375, + "learning_rate": 9.71794871794872e-06, + "loss": 1.694, + "step": 10500 + }, + { + "epoch": 0.06737259587883102, + "grad_norm": 0.49609375, + "learning_rate": 9.712820512820513e-06, + "loss": 1.7058, + "step": 10600 + }, + { + "epoch": 0.06800818640598981, + "grad_norm": 0.66015625, + "learning_rate": 9.707692307692308e-06, + "loss": 1.7093, + "step": 10700 + }, + { + "epoch": 0.06864377693314859, + "grad_norm": 1.046875, + "learning_rate": 9.702564102564103e-06, + "loss": 1.6968, + "step": 10800 + }, + { + "epoch": 0.06927936746030737, + "grad_norm": 0.5390625, + "learning_rate": 9.697435897435899e-06, + "loss": 1.6991, + "step": 10900 + }, + { + "epoch": 0.06991495798746615, + "grad_norm": 0.92578125, + "learning_rate": 9.692307692307693e-06, + "loss": 1.7002, + "step": 11000 + }, + { + "epoch": 0.07055054851462494, + "grad_norm": 0.7421875, + "learning_rate": 9.687179487179487e-06, + "loss": 1.7096, + "step": 11100 + }, + { + "epoch": 0.07118613904178372, + "grad_norm": 0.703125, + "learning_rate": 9.682051282051283e-06, + "loss": 1.6926, + "step": 11200 + }, + { + "epoch": 0.0718217295689425, + "grad_norm": 0.59765625, + "learning_rate": 9.676923076923079e-06, + "loss": 1.6882, + "step": 11300 + }, + { + "epoch": 0.07245732009610129, + "grad_norm": 0.62109375, + "learning_rate": 9.671794871794873e-06, + "loss": 1.6937, + "step": 11400 + }, + { + "epoch": 0.07309291062326007, + "grad_norm": 1.2578125, + "learning_rate": 9.666666666666667e-06, + "loss": 1.7164, + "step": 11500 + }, + { + "epoch": 0.07372850115041886, + "grad_norm": 0.70703125, + "learning_rate": 9.661538461538462e-06, + "loss": 1.6886, + "step": 11600 + }, + { + "epoch": 0.07436409167757764, + "grad_norm": 0.5546875, + "learning_rate": 9.656410256410257e-06, + "loss": 1.7014, + "step": 11700 + }, + { + "epoch": 0.07499968220473642, + "grad_norm": 0.66015625, + "learning_rate": 9.651282051282052e-06, + "loss": 1.7044, + "step": 11800 + }, + { + "epoch": 0.0756352727318952, + "grad_norm": 0.8359375, + "learning_rate": 9.646153846153848e-06, + "loss": 1.7096, + "step": 11900 + }, + { + "epoch": 0.07627086325905398, + "grad_norm": 0.72265625, + "learning_rate": 9.641025641025642e-06, + "loss": 1.7007, + "step": 12000 + }, + { + "epoch": 0.07690645378621278, + "grad_norm": 0.93359375, + "learning_rate": 9.635897435897436e-06, + "loss": 1.6985, + "step": 12100 + }, + { + "epoch": 0.07754204431337156, + "grad_norm": 1.0703125, + "learning_rate": 9.630769230769232e-06, + "loss": 1.6963, + "step": 12200 + }, + { + "epoch": 0.07817763484053034, + "grad_norm": 0.6015625, + "learning_rate": 9.625641025641026e-06, + "loss": 1.6804, + "step": 12300 + }, + { + "epoch": 0.07881322536768912, + "grad_norm": 0.9609375, + "learning_rate": 9.620512820512822e-06, + "loss": 1.6936, + "step": 12400 + }, + { + "epoch": 0.07944881589484791, + "grad_norm": 0.90625, + "learning_rate": 9.615384615384616e-06, + "loss": 1.697, + "step": 12500 + }, + { + "epoch": 0.08008440642200669, + "grad_norm": 0.6796875, + "learning_rate": 9.610256410256411e-06, + "loss": 1.6945, + "step": 12600 + }, + { + "epoch": 0.08071999694916547, + "grad_norm": 0.6015625, + "learning_rate": 9.605128205128206e-06, + "loss": 1.6923, + "step": 12700 + }, + { + "epoch": 0.08135558747632425, + "grad_norm": 0.5546875, + "learning_rate": 9.600000000000001e-06, + "loss": 1.6909, + "step": 12800 + }, + { + "epoch": 0.08199117800348303, + "grad_norm": 0.64453125, + "learning_rate": 9.594871794871797e-06, + "loss": 1.688, + "step": 12900 + }, + { + "epoch": 0.08262676853064183, + "grad_norm": 0.66796875, + "learning_rate": 9.589743589743591e-06, + "loss": 1.6987, + "step": 13000 + }, + { + "epoch": 0.0832623590578006, + "grad_norm": 0.6015625, + "learning_rate": 9.584615384615385e-06, + "loss": 1.6974, + "step": 13100 + }, + { + "epoch": 0.08389794958495939, + "grad_norm": 0.6640625, + "learning_rate": 9.579487179487181e-06, + "loss": 1.6831, + "step": 13200 + }, + { + "epoch": 0.08453354011211817, + "grad_norm": 0.40625, + "learning_rate": 9.574358974358975e-06, + "loss": 1.6862, + "step": 13300 + }, + { + "epoch": 0.08516913063927695, + "grad_norm": 0.53515625, + "learning_rate": 9.569230769230769e-06, + "loss": 1.6805, + "step": 13400 + }, + { + "epoch": 0.08580472116643574, + "grad_norm": 0.73046875, + "learning_rate": 9.564102564102565e-06, + "loss": 1.6963, + "step": 13500 + }, + { + "epoch": 0.08644031169359452, + "grad_norm": 0.78515625, + "learning_rate": 9.55897435897436e-06, + "loss": 1.6835, + "step": 13600 + }, + { + "epoch": 0.0870759022207533, + "grad_norm": 0.55078125, + "learning_rate": 9.553846153846155e-06, + "loss": 1.6947, + "step": 13700 + }, + { + "epoch": 0.08771149274791208, + "grad_norm": 0.890625, + "learning_rate": 9.548717948717949e-06, + "loss": 1.6866, + "step": 13800 + }, + { + "epoch": 0.08834708327507086, + "grad_norm": 0.625, + "learning_rate": 9.543589743589744e-06, + "loss": 1.6842, + "step": 13900 + }, + { + "epoch": 0.08898267380222966, + "grad_norm": 0.5078125, + "learning_rate": 9.53846153846154e-06, + "loss": 1.6714, + "step": 14000 + }, + { + "epoch": 0.08961826432938844, + "grad_norm": 0.5859375, + "learning_rate": 9.533333333333334e-06, + "loss": 1.679, + "step": 14100 + }, + { + "epoch": 0.09025385485654722, + "grad_norm": 0.78125, + "learning_rate": 9.52820512820513e-06, + "loss": 1.6923, + "step": 14200 + }, + { + "epoch": 0.090889445383706, + "grad_norm": 0.51953125, + "learning_rate": 9.523076923076924e-06, + "loss": 1.6745, + "step": 14300 + }, + { + "epoch": 0.09152503591086479, + "grad_norm": 0.76953125, + "learning_rate": 9.517948717948718e-06, + "loss": 1.6871, + "step": 14400 + }, + { + "epoch": 0.09216062643802357, + "grad_norm": 0.71875, + "learning_rate": 9.512820512820514e-06, + "loss": 1.6795, + "step": 14500 + }, + { + "epoch": 0.09279621696518235, + "grad_norm": 1.140625, + "learning_rate": 9.50769230769231e-06, + "loss": 1.6779, + "step": 14600 + }, + { + "epoch": 0.09343180749234113, + "grad_norm": 1.0546875, + "learning_rate": 9.502564102564103e-06, + "loss": 1.6846, + "step": 14700 + }, + { + "epoch": 0.09406739801949991, + "grad_norm": 1.046875, + "learning_rate": 9.497435897435898e-06, + "loss": 1.6762, + "step": 14800 + }, + { + "epoch": 0.0947029885466587, + "grad_norm": 0.7421875, + "learning_rate": 9.492307692307693e-06, + "loss": 1.6929, + "step": 14900 + }, + { + "epoch": 0.09533857907381749, + "grad_norm": 0.734375, + "learning_rate": 9.487179487179487e-06, + "loss": 1.6791, + "step": 15000 + }, + { + "epoch": 0.09597416960097627, + "grad_norm": 0.78515625, + "learning_rate": 9.482051282051283e-06, + "loss": 1.677, + "step": 15100 + }, + { + "epoch": 0.09660976012813505, + "grad_norm": 0.83984375, + "learning_rate": 9.476923076923079e-06, + "loss": 1.6817, + "step": 15200 + }, + { + "epoch": 0.09724535065529383, + "grad_norm": 0.7265625, + "learning_rate": 9.471794871794873e-06, + "loss": 1.6632, + "step": 15300 + }, + { + "epoch": 0.09788094118245262, + "grad_norm": 0.55859375, + "learning_rate": 9.466666666666667e-06, + "loss": 1.681, + "step": 15400 + }, + { + "epoch": 0.0985165317096114, + "grad_norm": 0.6484375, + "learning_rate": 9.461538461538463e-06, + "loss": 1.681, + "step": 15500 + }, + { + "epoch": 0.09915212223677018, + "grad_norm": 0.60546875, + "learning_rate": 9.456410256410257e-06, + "loss": 1.6915, + "step": 15600 + }, + { + "epoch": 0.09978771276392896, + "grad_norm": 0.76953125, + "learning_rate": 9.451282051282052e-06, + "loss": 1.6797, + "step": 15700 + }, + { + "epoch": 0.10042330329108776, + "grad_norm": 0.7578125, + "learning_rate": 9.446153846153847e-06, + "loss": 1.664, + "step": 15800 + }, + { + "epoch": 0.10105889381824654, + "grad_norm": 0.77734375, + "learning_rate": 9.441025641025642e-06, + "loss": 1.6738, + "step": 15900 + }, + { + "epoch": 0.10169448434540532, + "grad_norm": 0.671875, + "learning_rate": 9.435897435897436e-06, + "loss": 1.6822, + "step": 16000 + }, + { + "epoch": 0.1023300748725641, + "grad_norm": 0.73828125, + "learning_rate": 9.43076923076923e-06, + "loss": 1.6828, + "step": 16100 + }, + { + "epoch": 0.10296566539972288, + "grad_norm": 0.498046875, + "learning_rate": 9.425641025641026e-06, + "loss": 1.6714, + "step": 16200 + }, + { + "epoch": 0.10360125592688167, + "grad_norm": 0.46484375, + "learning_rate": 9.420512820512822e-06, + "loss": 1.692, + "step": 16300 + }, + { + "epoch": 0.10423684645404045, + "grad_norm": 0.921875, + "learning_rate": 9.415384615384616e-06, + "loss": 1.6833, + "step": 16400 + }, + { + "epoch": 0.10487243698119923, + "grad_norm": 0.7578125, + "learning_rate": 9.410256410256412e-06, + "loss": 1.6835, + "step": 16500 + }, + { + "epoch": 0.10550802750835801, + "grad_norm": 0.8359375, + "learning_rate": 9.405128205128206e-06, + "loss": 1.6738, + "step": 16600 + }, + { + "epoch": 0.10614361803551679, + "grad_norm": 0.6796875, + "learning_rate": 9.4e-06, + "loss": 1.6769, + "step": 16700 + }, + { + "epoch": 0.10677920856267559, + "grad_norm": 1.078125, + "learning_rate": 9.394871794871796e-06, + "loss": 1.677, + "step": 16800 + }, + { + "epoch": 0.10741479908983437, + "grad_norm": 0.578125, + "learning_rate": 9.389743589743591e-06, + "loss": 1.6887, + "step": 16900 + }, + { + "epoch": 0.10805038961699315, + "grad_norm": 0.62109375, + "learning_rate": 9.384615384615385e-06, + "loss": 1.6889, + "step": 17000 + }, + { + "epoch": 0.10868598014415193, + "grad_norm": 0.828125, + "learning_rate": 9.37948717948718e-06, + "loss": 1.6689, + "step": 17100 + }, + { + "epoch": 0.10932157067131071, + "grad_norm": 0.84375, + "learning_rate": 9.374358974358975e-06, + "loss": 1.6851, + "step": 17200 + }, + { + "epoch": 0.1099571611984695, + "grad_norm": 0.71484375, + "learning_rate": 9.369230769230771e-06, + "loss": 1.6878, + "step": 17300 + }, + { + "epoch": 0.11059275172562828, + "grad_norm": 0.6640625, + "learning_rate": 9.364102564102565e-06, + "loss": 1.6609, + "step": 17400 + }, + { + "epoch": 0.11122834225278706, + "grad_norm": 0.7890625, + "learning_rate": 9.358974358974359e-06, + "loss": 1.6778, + "step": 17500 + }, + { + "epoch": 0.11186393277994584, + "grad_norm": 0.7265625, + "learning_rate": 9.353846153846155e-06, + "loss": 1.674, + "step": 17600 + }, + { + "epoch": 0.11249952330710464, + "grad_norm": 0.65625, + "learning_rate": 9.348717948717949e-06, + "loss": 1.6686, + "step": 17700 + }, + { + "epoch": 0.11313511383426342, + "grad_norm": 0.474609375, + "learning_rate": 9.343589743589745e-06, + "loss": 1.6728, + "step": 17800 + }, + { + "epoch": 0.1137707043614222, + "grad_norm": 0.482421875, + "learning_rate": 9.33846153846154e-06, + "loss": 1.6664, + "step": 17900 + }, + { + "epoch": 0.11440629488858098, + "grad_norm": 0.7890625, + "learning_rate": 9.333333333333334e-06, + "loss": 1.6746, + "step": 18000 + }, + { + "epoch": 0.11504188541573976, + "grad_norm": 1.1015625, + "learning_rate": 9.328205128205128e-06, + "loss": 1.6738, + "step": 18100 + }, + { + "epoch": 0.11567747594289855, + "grad_norm": 0.76171875, + "learning_rate": 9.323076923076924e-06, + "loss": 1.6628, + "step": 18200 + }, + { + "epoch": 0.11631306647005733, + "grad_norm": 0.48828125, + "learning_rate": 9.317948717948718e-06, + "loss": 1.6718, + "step": 18300 + }, + { + "epoch": 0.11694865699721611, + "grad_norm": 0.7890625, + "learning_rate": 9.312820512820514e-06, + "loss": 1.6789, + "step": 18400 + }, + { + "epoch": 0.1175842475243749, + "grad_norm": 0.94140625, + "learning_rate": 9.307692307692308e-06, + "loss": 1.6618, + "step": 18500 + }, + { + "epoch": 0.11821983805153367, + "grad_norm": 1.1640625, + "learning_rate": 9.302564102564104e-06, + "loss": 1.6648, + "step": 18600 + }, + { + "epoch": 0.11885542857869247, + "grad_norm": 0.69140625, + "learning_rate": 9.297435897435898e-06, + "loss": 1.683, + "step": 18700 + }, + { + "epoch": 0.11949101910585125, + "grad_norm": 0.6640625, + "learning_rate": 9.292307692307694e-06, + "loss": 1.6457, + "step": 18800 + }, + { + "epoch": 0.12012660963301003, + "grad_norm": 0.84375, + "learning_rate": 9.28717948717949e-06, + "loss": 1.6728, + "step": 18900 + }, + { + "epoch": 0.12076220016016881, + "grad_norm": 0.65234375, + "learning_rate": 9.282051282051283e-06, + "loss": 1.683, + "step": 19000 + }, + { + "epoch": 0.12139779068732759, + "grad_norm": 0.90625, + "learning_rate": 9.276923076923077e-06, + "loss": 1.6626, + "step": 19100 + }, + { + "epoch": 0.12203338121448638, + "grad_norm": 0.765625, + "learning_rate": 9.271794871794873e-06, + "loss": 1.6704, + "step": 19200 + }, + { + "epoch": 0.12266897174164516, + "grad_norm": 0.79296875, + "learning_rate": 9.266666666666667e-06, + "loss": 1.6608, + "step": 19300 + }, + { + "epoch": 0.12330456226880394, + "grad_norm": 0.62890625, + "learning_rate": 9.261538461538461e-06, + "loss": 1.6761, + "step": 19400 + }, + { + "epoch": 0.12394015279596272, + "grad_norm": 1.015625, + "learning_rate": 9.256410256410257e-06, + "loss": 1.6537, + "step": 19500 + }, + { + "epoch": 0.12457574332312152, + "grad_norm": 0.478515625, + "learning_rate": 9.251282051282053e-06, + "loss": 1.6732, + "step": 19600 + }, + { + "epoch": 0.1252113338502803, + "grad_norm": 0.90625, + "learning_rate": 9.246153846153847e-06, + "loss": 1.6708, + "step": 19700 + }, + { + "epoch": 0.12584692437743908, + "grad_norm": 0.5859375, + "learning_rate": 9.24102564102564e-06, + "loss": 1.6619, + "step": 19800 + }, + { + "epoch": 0.12648251490459786, + "grad_norm": 0.68359375, + "learning_rate": 9.235897435897437e-06, + "loss": 1.6666, + "step": 19900 + }, + { + "epoch": 0.12711810543175664, + "grad_norm": 0.58984375, + "learning_rate": 9.230769230769232e-06, + "loss": 1.6725, + "step": 20000 + }, + { + "epoch": 0.12775369595891542, + "grad_norm": 0.65234375, + "learning_rate": 9.225641025641026e-06, + "loss": 1.6689, + "step": 20100 + }, + { + "epoch": 0.1283892864860742, + "grad_norm": 0.83984375, + "learning_rate": 9.220512820512822e-06, + "loss": 1.6735, + "step": 20200 + }, + { + "epoch": 0.129024877013233, + "grad_norm": 0.69921875, + "learning_rate": 9.215384615384616e-06, + "loss": 1.6652, + "step": 20300 + }, + { + "epoch": 0.1296604675403918, + "grad_norm": 0.7578125, + "learning_rate": 9.21025641025641e-06, + "loss": 1.6662, + "step": 20400 + }, + { + "epoch": 0.13029605806755057, + "grad_norm": 0.6328125, + "learning_rate": 9.205128205128206e-06, + "loss": 1.6553, + "step": 20500 + }, + { + "epoch": 0.13093164859470935, + "grad_norm": 0.51171875, + "learning_rate": 9.200000000000002e-06, + "loss": 1.6571, + "step": 20600 + }, + { + "epoch": 0.13156723912186813, + "grad_norm": 0.478515625, + "learning_rate": 9.194871794871796e-06, + "loss": 1.6626, + "step": 20700 + }, + { + "epoch": 0.1322028296490269, + "grad_norm": 0.94921875, + "learning_rate": 9.18974358974359e-06, + "loss": 1.6703, + "step": 20800 + }, + { + "epoch": 0.1328384201761857, + "grad_norm": 0.6015625, + "learning_rate": 9.184615384615386e-06, + "loss": 1.6563, + "step": 20900 + }, + { + "epoch": 0.13347401070334447, + "grad_norm": 0.9609375, + "learning_rate": 9.17948717948718e-06, + "loss": 1.6549, + "step": 21000 + }, + { + "epoch": 0.13410960123050325, + "grad_norm": 0.62890625, + "learning_rate": 9.174358974358975e-06, + "loss": 1.6777, + "step": 21100 + }, + { + "epoch": 0.13474519175766203, + "grad_norm": 0.765625, + "learning_rate": 9.169230769230771e-06, + "loss": 1.6478, + "step": 21200 + }, + { + "epoch": 0.13538078228482084, + "grad_norm": 0.98828125, + "learning_rate": 9.164102564102565e-06, + "loss": 1.6671, + "step": 21300 + }, + { + "epoch": 0.13601637281197962, + "grad_norm": 0.78515625, + "learning_rate": 9.15897435897436e-06, + "loss": 1.6846, + "step": 21400 + }, + { + "epoch": 0.1366519633391384, + "grad_norm": 0.97265625, + "learning_rate": 9.153846153846155e-06, + "loss": 1.6869, + "step": 21500 + }, + { + "epoch": 0.13728755386629718, + "grad_norm": 0.69921875, + "learning_rate": 9.148717948717949e-06, + "loss": 1.6531, + "step": 21600 + }, + { + "epoch": 0.13792314439345596, + "grad_norm": 0.984375, + "learning_rate": 9.143589743589745e-06, + "loss": 1.669, + "step": 21700 + }, + { + "epoch": 0.13855873492061474, + "grad_norm": 0.65234375, + "learning_rate": 9.138461538461539e-06, + "loss": 1.6553, + "step": 21800 + }, + { + "epoch": 0.13919432544777352, + "grad_norm": 0.61328125, + "learning_rate": 9.133333333333335e-06, + "loss": 1.6653, + "step": 21900 + }, + { + "epoch": 0.1398299159749323, + "grad_norm": 0.671875, + "learning_rate": 9.128205128205129e-06, + "loss": 1.6617, + "step": 22000 + }, + { + "epoch": 0.14046550650209108, + "grad_norm": 0.69921875, + "learning_rate": 9.123076923076923e-06, + "loss": 1.6819, + "step": 22100 + }, + { + "epoch": 0.1411010970292499, + "grad_norm": 0.796875, + "learning_rate": 9.117948717948718e-06, + "loss": 1.6546, + "step": 22200 + }, + { + "epoch": 0.14173668755640867, + "grad_norm": 0.83203125, + "learning_rate": 9.112820512820514e-06, + "loss": 1.6635, + "step": 22300 + }, + { + "epoch": 0.14237227808356745, + "grad_norm": 0.5859375, + "learning_rate": 9.107692307692308e-06, + "loss": 1.6673, + "step": 22400 + }, + { + "epoch": 0.14300786861072623, + "grad_norm": 0.77734375, + "learning_rate": 9.102564102564104e-06, + "loss": 1.6683, + "step": 22500 + }, + { + "epoch": 0.143643459137885, + "grad_norm": 0.796875, + "learning_rate": 9.097435897435898e-06, + "loss": 1.666, + "step": 22600 + }, + { + "epoch": 0.1442790496650438, + "grad_norm": 0.6484375, + "learning_rate": 9.092307692307692e-06, + "loss": 1.6557, + "step": 22700 + }, + { + "epoch": 0.14491464019220257, + "grad_norm": 0.9453125, + "learning_rate": 9.087179487179488e-06, + "loss": 1.6566, + "step": 22800 + }, + { + "epoch": 0.14555023071936135, + "grad_norm": 0.65234375, + "learning_rate": 9.082051282051284e-06, + "loss": 1.6515, + "step": 22900 + }, + { + "epoch": 0.14618582124652013, + "grad_norm": 0.80859375, + "learning_rate": 9.076923076923078e-06, + "loss": 1.6468, + "step": 23000 + }, + { + "epoch": 0.1468214117736789, + "grad_norm": 0.66796875, + "learning_rate": 9.071794871794872e-06, + "loss": 1.6619, + "step": 23100 + }, + { + "epoch": 0.14745700230083772, + "grad_norm": 0.85546875, + "learning_rate": 9.066666666666667e-06, + "loss": 1.6479, + "step": 23200 + }, + { + "epoch": 0.1480925928279965, + "grad_norm": 0.625, + "learning_rate": 9.061538461538463e-06, + "loss": 1.6581, + "step": 23300 + }, + { + "epoch": 0.14872818335515528, + "grad_norm": 0.59765625, + "learning_rate": 9.056410256410257e-06, + "loss": 1.6668, + "step": 23400 + }, + { + "epoch": 0.14936377388231406, + "grad_norm": 0.6953125, + "learning_rate": 9.051282051282051e-06, + "loss": 1.664, + "step": 23500 + }, + { + "epoch": 0.14999936440947284, + "grad_norm": 0.87890625, + "learning_rate": 9.046153846153847e-06, + "loss": 1.6472, + "step": 23600 + }, + { + "epoch": 0.15063495493663162, + "grad_norm": 1.0078125, + "learning_rate": 9.041025641025641e-06, + "loss": 1.6529, + "step": 23700 + }, + { + "epoch": 0.1512705454637904, + "grad_norm": 0.5703125, + "learning_rate": 9.035897435897437e-06, + "loss": 1.6544, + "step": 23800 + }, + { + "epoch": 0.15190613599094918, + "grad_norm": 0.72265625, + "learning_rate": 9.030769230769233e-06, + "loss": 1.6549, + "step": 23900 + }, + { + "epoch": 0.15254172651810796, + "grad_norm": 0.65625, + "learning_rate": 9.025641025641027e-06, + "loss": 1.6627, + "step": 24000 + }, + { + "epoch": 0.15317731704526677, + "grad_norm": 0.7421875, + "learning_rate": 9.02051282051282e-06, + "loss": 1.646, + "step": 24100 + }, + { + "epoch": 0.15381290757242555, + "grad_norm": 0.7421875, + "learning_rate": 9.015384615384616e-06, + "loss": 1.6602, + "step": 24200 + }, + { + "epoch": 0.15444849809958433, + "grad_norm": 0.515625, + "learning_rate": 9.01025641025641e-06, + "loss": 1.6532, + "step": 24300 + }, + { + "epoch": 0.1550840886267431, + "grad_norm": 1.0625, + "learning_rate": 9.005128205128206e-06, + "loss": 1.6411, + "step": 24400 + }, + { + "epoch": 0.1557196791539019, + "grad_norm": 0.9765625, + "learning_rate": 9e-06, + "loss": 1.6684, + "step": 24500 + }, + { + "epoch": 0.15635526968106067, + "grad_norm": 0.8828125, + "learning_rate": 8.994871794871796e-06, + "loss": 1.6517, + "step": 24600 + }, + { + "epoch": 0.15699086020821945, + "grad_norm": 0.79296875, + "learning_rate": 8.98974358974359e-06, + "loss": 1.6509, + "step": 24700 + }, + { + "epoch": 0.15762645073537823, + "grad_norm": 0.458984375, + "learning_rate": 8.984615384615386e-06, + "loss": 1.6603, + "step": 24800 + }, + { + "epoch": 0.158262041262537, + "grad_norm": 0.7421875, + "learning_rate": 8.979487179487182e-06, + "loss": 1.661, + "step": 24900 + }, + { + "epoch": 0.15889763178969582, + "grad_norm": 0.84375, + "learning_rate": 8.974358974358976e-06, + "loss": 1.6679, + "step": 25000 + }, + { + "epoch": 0.1595332223168546, + "grad_norm": 0.609375, + "learning_rate": 8.96923076923077e-06, + "loss": 1.6555, + "step": 25100 + }, + { + "epoch": 0.16016881284401338, + "grad_norm": 0.515625, + "learning_rate": 8.964102564102565e-06, + "loss": 1.6652, + "step": 25200 + }, + { + "epoch": 0.16080440337117216, + "grad_norm": 0.7578125, + "learning_rate": 8.95897435897436e-06, + "loss": 1.6393, + "step": 25300 + }, + { + "epoch": 0.16143999389833094, + "grad_norm": 0.86328125, + "learning_rate": 8.953846153846153e-06, + "loss": 1.6479, + "step": 25400 + }, + { + "epoch": 0.16207558442548972, + "grad_norm": 0.9296875, + "learning_rate": 8.94871794871795e-06, + "loss": 1.6509, + "step": 25500 + }, + { + "epoch": 0.1627111749526485, + "grad_norm": 0.5390625, + "learning_rate": 8.943589743589745e-06, + "loss": 1.66, + "step": 25600 + }, + { + "epoch": 0.16334676547980728, + "grad_norm": 0.83203125, + "learning_rate": 8.938461538461539e-06, + "loss": 1.6697, + "step": 25700 + }, + { + "epoch": 0.16398235600696606, + "grad_norm": 0.84765625, + "learning_rate": 8.933333333333333e-06, + "loss": 1.6493, + "step": 25800 + }, + { + "epoch": 0.16461794653412484, + "grad_norm": 0.5, + "learning_rate": 8.928205128205129e-06, + "loss": 1.6503, + "step": 25900 + }, + { + "epoch": 0.16525353706128365, + "grad_norm": 0.609375, + "learning_rate": 8.923076923076925e-06, + "loss": 1.65, + "step": 26000 + }, + { + "epoch": 0.16588912758844243, + "grad_norm": 0.91015625, + "learning_rate": 8.917948717948719e-06, + "loss": 1.6532, + "step": 26100 + }, + { + "epoch": 0.1665247181156012, + "grad_norm": 0.76953125, + "learning_rate": 8.912820512820514e-06, + "loss": 1.6583, + "step": 26200 + }, + { + "epoch": 0.16716030864276, + "grad_norm": 0.70703125, + "learning_rate": 8.907692307692308e-06, + "loss": 1.6423, + "step": 26300 + }, + { + "epoch": 0.16779589916991877, + "grad_norm": 0.85546875, + "learning_rate": 8.902564102564102e-06, + "loss": 1.6699, + "step": 26400 + }, + { + "epoch": 0.16843148969707755, + "grad_norm": 0.66015625, + "learning_rate": 8.897435897435898e-06, + "loss": 1.6522, + "step": 26500 + }, + { + "epoch": 0.16906708022423633, + "grad_norm": 0.609375, + "learning_rate": 8.892307692307694e-06, + "loss": 1.6531, + "step": 26600 + }, + { + "epoch": 0.1697026707513951, + "grad_norm": 0.71875, + "learning_rate": 8.887179487179488e-06, + "loss": 1.6512, + "step": 26700 + }, + { + "epoch": 0.1703382612785539, + "grad_norm": 0.4765625, + "learning_rate": 8.882051282051282e-06, + "loss": 1.6158, + "step": 26800 + }, + { + "epoch": 0.1709738518057127, + "grad_norm": 1.015625, + "learning_rate": 8.876923076923078e-06, + "loss": 1.6516, + "step": 26900 + }, + { + "epoch": 0.17160944233287148, + "grad_norm": 0.85546875, + "learning_rate": 8.871794871794872e-06, + "loss": 1.6409, + "step": 27000 + }, + { + "epoch": 0.17224503286003026, + "grad_norm": 0.40234375, + "learning_rate": 8.866666666666668e-06, + "loss": 1.6526, + "step": 27100 + }, + { + "epoch": 0.17288062338718904, + "grad_norm": 0.6015625, + "learning_rate": 8.861538461538463e-06, + "loss": 1.6387, + "step": 27200 + }, + { + "epoch": 0.17351621391434782, + "grad_norm": 0.625, + "learning_rate": 8.856410256410257e-06, + "loss": 1.6583, + "step": 27300 + }, + { + "epoch": 0.1741518044415066, + "grad_norm": 0.79296875, + "learning_rate": 8.851282051282051e-06, + "loss": 1.64, + "step": 27400 + }, + { + "epoch": 0.17478739496866538, + "grad_norm": 1.2578125, + "learning_rate": 8.846153846153847e-06, + "loss": 1.6365, + "step": 27500 + }, + { + "epoch": 0.17542298549582416, + "grad_norm": 0.5, + "learning_rate": 8.841025641025641e-06, + "loss": 1.6424, + "step": 27600 + }, + { + "epoch": 0.17605857602298294, + "grad_norm": 0.58203125, + "learning_rate": 8.835897435897437e-06, + "loss": 1.646, + "step": 27700 + }, + { + "epoch": 0.17669416655014172, + "grad_norm": 0.9375, + "learning_rate": 8.830769230769231e-06, + "loss": 1.6514, + "step": 27800 + }, + { + "epoch": 0.17732975707730053, + "grad_norm": 0.470703125, + "learning_rate": 8.825641025641027e-06, + "loss": 1.6687, + "step": 27900 + }, + { + "epoch": 0.1779653476044593, + "grad_norm": 1.0625, + "learning_rate": 8.820512820512821e-06, + "loss": 1.6496, + "step": 28000 + }, + { + "epoch": 0.1786009381316181, + "grad_norm": 0.87109375, + "learning_rate": 8.815384615384615e-06, + "loss": 1.6357, + "step": 28100 + }, + { + "epoch": 0.17923652865877687, + "grad_norm": 0.68359375, + "learning_rate": 8.81025641025641e-06, + "loss": 1.6453, + "step": 28200 + }, + { + "epoch": 0.17987211918593565, + "grad_norm": 0.75390625, + "learning_rate": 8.805128205128206e-06, + "loss": 1.659, + "step": 28300 + }, + { + "epoch": 0.18050770971309443, + "grad_norm": 0.76171875, + "learning_rate": 8.8e-06, + "loss": 1.6527, + "step": 28400 + }, + { + "epoch": 0.1811433002402532, + "grad_norm": 0.63671875, + "learning_rate": 8.794871794871796e-06, + "loss": 1.6669, + "step": 28500 + }, + { + "epoch": 0.181778890767412, + "grad_norm": 0.76953125, + "learning_rate": 8.78974358974359e-06, + "loss": 1.6514, + "step": 28600 + }, + { + "epoch": 0.18241448129457077, + "grad_norm": 0.6171875, + "learning_rate": 8.784615384615386e-06, + "loss": 1.6655, + "step": 28700 + }, + { + "epoch": 0.18305007182172958, + "grad_norm": 0.71484375, + "learning_rate": 8.77948717948718e-06, + "loss": 1.6524, + "step": 28800 + }, + { + "epoch": 0.18368566234888836, + "grad_norm": 0.8828125, + "learning_rate": 8.774358974358976e-06, + "loss": 1.6286, + "step": 28900 + }, + { + "epoch": 0.18432125287604714, + "grad_norm": 0.73828125, + "learning_rate": 8.76923076923077e-06, + "loss": 1.639, + "step": 29000 + }, + { + "epoch": 0.18495684340320592, + "grad_norm": 0.7265625, + "learning_rate": 8.764102564102564e-06, + "loss": 1.6487, + "step": 29100 + }, + { + "epoch": 0.1855924339303647, + "grad_norm": 0.91796875, + "learning_rate": 8.75897435897436e-06, + "loss": 1.6576, + "step": 29200 + }, + { + "epoch": 0.18622802445752348, + "grad_norm": 0.66796875, + "learning_rate": 8.753846153846155e-06, + "loss": 1.6483, + "step": 29300 + }, + { + "epoch": 0.18686361498468226, + "grad_norm": 0.62109375, + "learning_rate": 8.74871794871795e-06, + "loss": 1.6399, + "step": 29400 + }, + { + "epoch": 0.18749920551184104, + "grad_norm": 0.7890625, + "learning_rate": 8.743589743589743e-06, + "loss": 1.6383, + "step": 29500 + }, + { + "epoch": 0.18813479603899982, + "grad_norm": 0.5546875, + "learning_rate": 8.73846153846154e-06, + "loss": 1.6486, + "step": 29600 + }, + { + "epoch": 0.1887703865661586, + "grad_norm": 0.67578125, + "learning_rate": 8.733333333333333e-06, + "loss": 1.6503, + "step": 29700 + }, + { + "epoch": 0.1894059770933174, + "grad_norm": 0.9609375, + "learning_rate": 8.728205128205129e-06, + "loss": 1.6361, + "step": 29800 + }, + { + "epoch": 0.1900415676204762, + "grad_norm": 0.9609375, + "learning_rate": 8.723076923076925e-06, + "loss": 1.6402, + "step": 29900 + }, + { + "epoch": 0.19067715814763497, + "grad_norm": 0.443359375, + "learning_rate": 8.717948717948719e-06, + "loss": 1.6462, + "step": 30000 + }, + { + "epoch": 0.19131274867479375, + "grad_norm": 0.482421875, + "learning_rate": 8.712820512820513e-06, + "loss": 1.6481, + "step": 30100 + }, + { + "epoch": 0.19194833920195253, + "grad_norm": 0.85546875, + "learning_rate": 8.707692307692309e-06, + "loss": 1.6497, + "step": 30200 + }, + { + "epoch": 0.19258392972911131, + "grad_norm": 0.73046875, + "learning_rate": 8.702564102564103e-06, + "loss": 1.6486, + "step": 30300 + }, + { + "epoch": 0.1932195202562701, + "grad_norm": 0.71484375, + "learning_rate": 8.697435897435898e-06, + "loss": 1.6494, + "step": 30400 + }, + { + "epoch": 0.19385511078342887, + "grad_norm": 0.57421875, + "learning_rate": 8.692307692307692e-06, + "loss": 1.6529, + "step": 30500 + }, + { + "epoch": 0.19449070131058765, + "grad_norm": 0.66796875, + "learning_rate": 8.687179487179488e-06, + "loss": 1.642, + "step": 30600 + }, + { + "epoch": 0.19512629183774646, + "grad_norm": 0.82421875, + "learning_rate": 8.682051282051282e-06, + "loss": 1.6206, + "step": 30700 + }, + { + "epoch": 0.19576188236490524, + "grad_norm": 0.82421875, + "learning_rate": 8.676923076923078e-06, + "loss": 1.6438, + "step": 30800 + }, + { + "epoch": 0.19639747289206402, + "grad_norm": 0.734375, + "learning_rate": 8.671794871794874e-06, + "loss": 1.6431, + "step": 30900 + }, + { + "epoch": 0.1970330634192228, + "grad_norm": 0.77734375, + "learning_rate": 8.666666666666668e-06, + "loss": 1.6529, + "step": 31000 + }, + { + "epoch": 0.19766865394638158, + "grad_norm": 0.671875, + "learning_rate": 8.661538461538462e-06, + "loss": 1.6476, + "step": 31100 + }, + { + "epoch": 0.19830424447354036, + "grad_norm": 0.640625, + "learning_rate": 8.656410256410258e-06, + "loss": 1.6645, + "step": 31200 + }, + { + "epoch": 0.19893983500069914, + "grad_norm": 0.8203125, + "learning_rate": 8.651282051282052e-06, + "loss": 1.6336, + "step": 31300 + }, + { + "epoch": 0.19957542552785792, + "grad_norm": 0.5390625, + "learning_rate": 8.646153846153846e-06, + "loss": 1.6391, + "step": 31400 + }, + { + "epoch": 0.2002110160550167, + "grad_norm": 0.6796875, + "learning_rate": 8.641025641025641e-06, + "loss": 1.6344, + "step": 31500 + }, + { + "epoch": 0.2008466065821755, + "grad_norm": 0.82421875, + "learning_rate": 8.635897435897437e-06, + "loss": 1.6469, + "step": 31600 + }, + { + "epoch": 0.2014821971093343, + "grad_norm": 0.8359375, + "learning_rate": 8.630769230769231e-06, + "loss": 1.6422, + "step": 31700 + }, + { + "epoch": 0.20211778763649307, + "grad_norm": 0.9140625, + "learning_rate": 8.625641025641025e-06, + "loss": 1.6404, + "step": 31800 + }, + { + "epoch": 0.20275337816365185, + "grad_norm": 0.75, + "learning_rate": 8.620512820512821e-06, + "loss": 1.6453, + "step": 31900 + }, + { + "epoch": 0.20338896869081063, + "grad_norm": 0.6484375, + "learning_rate": 8.615384615384617e-06, + "loss": 1.6445, + "step": 32000 + }, + { + "epoch": 0.20402455921796941, + "grad_norm": 0.44140625, + "learning_rate": 8.610256410256411e-06, + "loss": 1.6565, + "step": 32100 + }, + { + "epoch": 0.2046601497451282, + "grad_norm": 0.57421875, + "learning_rate": 8.605128205128207e-06, + "loss": 1.6426, + "step": 32200 + }, + { + "epoch": 0.20529574027228698, + "grad_norm": 0.59375, + "learning_rate": 8.6e-06, + "loss": 1.6288, + "step": 32300 + }, + { + "epoch": 0.20593133079944576, + "grad_norm": 0.54296875, + "learning_rate": 8.594871794871795e-06, + "loss": 1.6402, + "step": 32400 + }, + { + "epoch": 0.20656692132660454, + "grad_norm": 0.482421875, + "learning_rate": 8.58974358974359e-06, + "loss": 1.6308, + "step": 32500 + }, + { + "epoch": 0.20720251185376334, + "grad_norm": 0.671875, + "learning_rate": 8.584615384615386e-06, + "loss": 1.6371, + "step": 32600 + }, + { + "epoch": 0.20783810238092212, + "grad_norm": 0.8125, + "learning_rate": 8.57948717948718e-06, + "loss": 1.6449, + "step": 32700 + }, + { + "epoch": 0.2084736929080809, + "grad_norm": 0.71875, + "learning_rate": 8.574358974358974e-06, + "loss": 1.6521, + "step": 32800 + }, + { + "epoch": 0.20910928343523968, + "grad_norm": 1.09375, + "learning_rate": 8.56923076923077e-06, + "loss": 1.6418, + "step": 32900 + }, + { + "epoch": 0.20974487396239846, + "grad_norm": 0.66015625, + "learning_rate": 8.564102564102564e-06, + "loss": 1.6342, + "step": 33000 + }, + { + "epoch": 0.21038046448955725, + "grad_norm": 0.80078125, + "learning_rate": 8.55897435897436e-06, + "loss": 1.6413, + "step": 33100 + }, + { + "epoch": 0.21101605501671603, + "grad_norm": 0.7578125, + "learning_rate": 8.553846153846156e-06, + "loss": 1.6432, + "step": 33200 + }, + { + "epoch": 0.2116516455438748, + "grad_norm": 0.75390625, + "learning_rate": 8.54871794871795e-06, + "loss": 1.6419, + "step": 33300 + }, + { + "epoch": 0.21228723607103359, + "grad_norm": 0.7578125, + "learning_rate": 8.543589743589744e-06, + "loss": 1.6508, + "step": 33400 + }, + { + "epoch": 0.2129228265981924, + "grad_norm": 0.9921875, + "learning_rate": 8.53846153846154e-06, + "loss": 1.6431, + "step": 33500 + }, + { + "epoch": 0.21355841712535117, + "grad_norm": 0.58984375, + "learning_rate": 8.533333333333335e-06, + "loss": 1.6333, + "step": 33600 + }, + { + "epoch": 0.21419400765250995, + "grad_norm": 1.578125, + "learning_rate": 8.52820512820513e-06, + "loss": 1.659, + "step": 33700 + }, + { + "epoch": 0.21482959817966873, + "grad_norm": 0.73828125, + "learning_rate": 8.523076923076923e-06, + "loss": 1.6344, + "step": 33800 + }, + { + "epoch": 0.21546518870682752, + "grad_norm": 0.73046875, + "learning_rate": 8.517948717948719e-06, + "loss": 1.6409, + "step": 33900 + }, + { + "epoch": 0.2161007792339863, + "grad_norm": 0.9453125, + "learning_rate": 8.512820512820513e-06, + "loss": 1.6304, + "step": 34000 + }, + { + "epoch": 0.21673636976114508, + "grad_norm": 0.9140625, + "learning_rate": 8.507692307692307e-06, + "loss": 1.6352, + "step": 34100 + }, + { + "epoch": 0.21737196028830386, + "grad_norm": 0.6953125, + "learning_rate": 8.502564102564103e-06, + "loss": 1.6341, + "step": 34200 + }, + { + "epoch": 0.21800755081546264, + "grad_norm": 0.93359375, + "learning_rate": 8.497435897435899e-06, + "loss": 1.6448, + "step": 34300 + }, + { + "epoch": 0.21864314134262142, + "grad_norm": 0.53125, + "learning_rate": 8.492307692307693e-06, + "loss": 1.6371, + "step": 34400 + }, + { + "epoch": 0.21927873186978022, + "grad_norm": 0.78515625, + "learning_rate": 8.487179487179488e-06, + "loss": 1.6548, + "step": 34500 + }, + { + "epoch": 0.219914322396939, + "grad_norm": 0.765625, + "learning_rate": 8.482051282051283e-06, + "loss": 1.6575, + "step": 34600 + }, + { + "epoch": 0.22054991292409779, + "grad_norm": 1.09375, + "learning_rate": 8.476923076923078e-06, + "loss": 1.652, + "step": 34700 + }, + { + "epoch": 0.22118550345125657, + "grad_norm": 0.470703125, + "learning_rate": 8.471794871794872e-06, + "loss": 1.6636, + "step": 34800 + }, + { + "epoch": 0.22182109397841535, + "grad_norm": 0.7265625, + "learning_rate": 8.466666666666668e-06, + "loss": 1.6483, + "step": 34900 + }, + { + "epoch": 0.22245668450557413, + "grad_norm": 0.69140625, + "learning_rate": 8.461538461538462e-06, + "loss": 1.6532, + "step": 35000 + }, + { + "epoch": 0.2230922750327329, + "grad_norm": 0.90234375, + "learning_rate": 8.456410256410256e-06, + "loss": 1.6306, + "step": 35100 + }, + { + "epoch": 0.2237278655598917, + "grad_norm": 0.51953125, + "learning_rate": 8.451282051282052e-06, + "loss": 1.6137, + "step": 35200 + }, + { + "epoch": 0.22436345608705047, + "grad_norm": 0.78125, + "learning_rate": 8.446153846153848e-06, + "loss": 1.6424, + "step": 35300 + }, + { + "epoch": 0.22499904661420927, + "grad_norm": 0.65625, + "learning_rate": 8.441025641025642e-06, + "loss": 1.6641, + "step": 35400 + }, + { + "epoch": 0.22563463714136806, + "grad_norm": 0.6796875, + "learning_rate": 8.435897435897436e-06, + "loss": 1.6388, + "step": 35500 + }, + { + "epoch": 0.22627022766852684, + "grad_norm": 0.85546875, + "learning_rate": 8.430769230769231e-06, + "loss": 1.6259, + "step": 35600 + }, + { + "epoch": 0.22690581819568562, + "grad_norm": 0.703125, + "learning_rate": 8.425641025641026e-06, + "loss": 1.639, + "step": 35700 + }, + { + "epoch": 0.2275414087228444, + "grad_norm": 1.0078125, + "learning_rate": 8.420512820512821e-06, + "loss": 1.6384, + "step": 35800 + }, + { + "epoch": 0.22817699925000318, + "grad_norm": 0.5, + "learning_rate": 8.415384615384617e-06, + "loss": 1.6446, + "step": 35900 + }, + { + "epoch": 0.22881258977716196, + "grad_norm": 0.72265625, + "learning_rate": 8.410256410256411e-06, + "loss": 1.6237, + "step": 36000 + }, + { + "epoch": 0.22944818030432074, + "grad_norm": 0.76171875, + "learning_rate": 8.405128205128205e-06, + "loss": 1.6413, + "step": 36100 + }, + { + "epoch": 0.23008377083147952, + "grad_norm": 0.76171875, + "learning_rate": 8.400000000000001e-06, + "loss": 1.6305, + "step": 36200 + }, + { + "epoch": 0.2307193613586383, + "grad_norm": 0.69140625, + "learning_rate": 8.394871794871795e-06, + "loss": 1.6586, + "step": 36300 + }, + { + "epoch": 0.2313549518857971, + "grad_norm": 0.62109375, + "learning_rate": 8.38974358974359e-06, + "loss": 1.6388, + "step": 36400 + }, + { + "epoch": 0.23199054241295589, + "grad_norm": 0.83203125, + "learning_rate": 8.384615384615385e-06, + "loss": 1.6322, + "step": 36500 + }, + { + "epoch": 0.23262613294011467, + "grad_norm": 0.66015625, + "learning_rate": 8.37948717948718e-06, + "loss": 1.6377, + "step": 36600 + }, + { + "epoch": 0.23326172346727345, + "grad_norm": 0.78515625, + "learning_rate": 8.374358974358975e-06, + "loss": 1.6323, + "step": 36700 + }, + { + "epoch": 0.23389731399443223, + "grad_norm": 0.96875, + "learning_rate": 8.36923076923077e-06, + "loss": 1.6332, + "step": 36800 + }, + { + "epoch": 0.234532904521591, + "grad_norm": 0.466796875, + "learning_rate": 8.364102564102566e-06, + "loss": 1.6308, + "step": 36900 + }, + { + "epoch": 0.2351684950487498, + "grad_norm": 1.1640625, + "learning_rate": 8.35897435897436e-06, + "loss": 1.6235, + "step": 37000 + }, + { + "epoch": 0.23580408557590857, + "grad_norm": 0.69140625, + "learning_rate": 8.353846153846154e-06, + "loss": 1.6393, + "step": 37100 + }, + { + "epoch": 0.23643967610306735, + "grad_norm": 0.8359375, + "learning_rate": 8.34871794871795e-06, + "loss": 1.6371, + "step": 37200 + }, + { + "epoch": 0.23707526663022616, + "grad_norm": 0.984375, + "learning_rate": 8.343589743589744e-06, + "loss": 1.635, + "step": 37300 + }, + { + "epoch": 0.23771085715738494, + "grad_norm": 0.8359375, + "learning_rate": 8.338461538461538e-06, + "loss": 1.6295, + "step": 37400 + }, + { + "epoch": 0.23834644768454372, + "grad_norm": 0.74609375, + "learning_rate": 8.333333333333334e-06, + "loss": 1.6413, + "step": 37500 + }, + { + "epoch": 0.2389820382117025, + "grad_norm": 0.515625, + "learning_rate": 8.32820512820513e-06, + "loss": 1.6512, + "step": 37600 + }, + { + "epoch": 0.23961762873886128, + "grad_norm": 0.76953125, + "learning_rate": 8.323076923076924e-06, + "loss": 1.6558, + "step": 37700 + }, + { + "epoch": 0.24025321926602006, + "grad_norm": 0.62109375, + "learning_rate": 8.317948717948718e-06, + "loss": 1.6632, + "step": 37800 + }, + { + "epoch": 0.24088880979317884, + "grad_norm": 0.5390625, + "learning_rate": 8.312820512820513e-06, + "loss": 1.6344, + "step": 37900 + }, + { + "epoch": 0.24152440032033762, + "grad_norm": 0.9375, + "learning_rate": 8.307692307692309e-06, + "loss": 1.6351, + "step": 38000 + }, + { + "epoch": 0.2421599908474964, + "grad_norm": 0.89453125, + "learning_rate": 8.302564102564103e-06, + "loss": 1.6342, + "step": 38100 + }, + { + "epoch": 0.24279558137465518, + "grad_norm": 0.65625, + "learning_rate": 8.297435897435899e-06, + "loss": 1.632, + "step": 38200 + }, + { + "epoch": 0.243431171901814, + "grad_norm": 0.70703125, + "learning_rate": 8.292307692307693e-06, + "loss": 1.6476, + "step": 38300 + }, + { + "epoch": 0.24406676242897277, + "grad_norm": 0.890625, + "learning_rate": 8.287179487179487e-06, + "loss": 1.6275, + "step": 38400 + }, + { + "epoch": 0.24470235295613155, + "grad_norm": 0.921875, + "learning_rate": 8.282051282051283e-06, + "loss": 1.6432, + "step": 38500 + }, + { + "epoch": 0.24533794348329033, + "grad_norm": 0.7265625, + "learning_rate": 8.276923076923078e-06, + "loss": 1.6506, + "step": 38600 + }, + { + "epoch": 0.2459735340104491, + "grad_norm": 0.81640625, + "learning_rate": 8.271794871794873e-06, + "loss": 1.6399, + "step": 38700 + }, + { + "epoch": 0.2466091245376079, + "grad_norm": 0.67578125, + "learning_rate": 8.266666666666667e-06, + "loss": 1.6384, + "step": 38800 + }, + { + "epoch": 0.24724471506476667, + "grad_norm": 0.6875, + "learning_rate": 8.261538461538462e-06, + "loss": 1.6286, + "step": 38900 + }, + { + "epoch": 0.24788030559192545, + "grad_norm": 1.09375, + "learning_rate": 8.256410256410256e-06, + "loss": 1.6442, + "step": 39000 + }, + { + "epoch": 0.24851589611908423, + "grad_norm": 0.76953125, + "learning_rate": 8.251282051282052e-06, + "loss": 1.6467, + "step": 39100 + }, + { + "epoch": 0.24915148664624304, + "grad_norm": 0.7734375, + "learning_rate": 8.246153846153848e-06, + "loss": 1.6507, + "step": 39200 + }, + { + "epoch": 0.24978707717340182, + "grad_norm": 0.7578125, + "learning_rate": 8.241025641025642e-06, + "loss": 1.63, + "step": 39300 + }, + { + "epoch": 0.2504226677005606, + "grad_norm": 0.66015625, + "learning_rate": 8.235897435897436e-06, + "loss": 1.6294, + "step": 39400 + }, + { + "epoch": 0.2510582582277194, + "grad_norm": 0.78515625, + "learning_rate": 8.230769230769232e-06, + "loss": 1.6233, + "step": 39500 + }, + { + "epoch": 0.25169384875487816, + "grad_norm": 0.6484375, + "learning_rate": 8.225641025641027e-06, + "loss": 1.6528, + "step": 39600 + }, + { + "epoch": 0.25232943928203694, + "grad_norm": 0.8125, + "learning_rate": 8.220512820512822e-06, + "loss": 1.6369, + "step": 39700 + }, + { + "epoch": 0.2529650298091957, + "grad_norm": 0.51953125, + "learning_rate": 8.215384615384616e-06, + "loss": 1.6193, + "step": 39800 + }, + { + "epoch": 0.2536006203363545, + "grad_norm": 1.0078125, + "learning_rate": 8.210256410256411e-06, + "loss": 1.646, + "step": 39900 + }, + { + "epoch": 0.2542362108635133, + "grad_norm": 0.88671875, + "learning_rate": 8.205128205128205e-06, + "loss": 1.6195, + "step": 40000 + }, + { + "epoch": 0.25487180139067206, + "grad_norm": 0.56640625, + "learning_rate": 8.2e-06, + "loss": 1.6344, + "step": 40100 + }, + { + "epoch": 0.25550739191783084, + "grad_norm": 0.9609375, + "learning_rate": 8.194871794871795e-06, + "loss": 1.647, + "step": 40200 + }, + { + "epoch": 0.2561429824449896, + "grad_norm": 0.8984375, + "learning_rate": 8.189743589743591e-06, + "loss": 1.6453, + "step": 40300 + }, + { + "epoch": 0.2567785729721484, + "grad_norm": 0.6484375, + "learning_rate": 8.184615384615385e-06, + "loss": 1.6263, + "step": 40400 + }, + { + "epoch": 0.2574141634993072, + "grad_norm": 0.79296875, + "learning_rate": 8.17948717948718e-06, + "loss": 1.6456, + "step": 40500 + }, + { + "epoch": 0.258049754026466, + "grad_norm": 0.625, + "learning_rate": 8.174358974358975e-06, + "loss": 1.6372, + "step": 40600 + }, + { + "epoch": 0.2586853445536248, + "grad_norm": 0.83984375, + "learning_rate": 8.16923076923077e-06, + "loss": 1.6321, + "step": 40700 + }, + { + "epoch": 0.2593209350807836, + "grad_norm": 0.98046875, + "learning_rate": 8.164102564102565e-06, + "loss": 1.63, + "step": 40800 + }, + { + "epoch": 0.25995652560794236, + "grad_norm": 0.75390625, + "learning_rate": 8.15897435897436e-06, + "loss": 1.6338, + "step": 40900 + }, + { + "epoch": 0.26059211613510114, + "grad_norm": 0.69140625, + "learning_rate": 8.153846153846154e-06, + "loss": 1.6366, + "step": 41000 + }, + { + "epoch": 0.2612277066622599, + "grad_norm": 0.7109375, + "learning_rate": 8.148717948717948e-06, + "loss": 1.639, + "step": 41100 + }, + { + "epoch": 0.2618632971894187, + "grad_norm": 0.59375, + "learning_rate": 8.143589743589744e-06, + "loss": 1.6464, + "step": 41200 + }, + { + "epoch": 0.2624988877165775, + "grad_norm": 0.56640625, + "learning_rate": 8.13846153846154e-06, + "loss": 1.6243, + "step": 41300 + }, + { + "epoch": 0.26313447824373626, + "grad_norm": 0.91796875, + "learning_rate": 8.133333333333334e-06, + "loss": 1.6245, + "step": 41400 + }, + { + "epoch": 0.26377006877089504, + "grad_norm": 0.5546875, + "learning_rate": 8.12820512820513e-06, + "loss": 1.6267, + "step": 41500 + }, + { + "epoch": 0.2644056592980538, + "grad_norm": 0.82421875, + "learning_rate": 8.123076923076924e-06, + "loss": 1.6351, + "step": 41600 + }, + { + "epoch": 0.2650412498252126, + "grad_norm": 0.70703125, + "learning_rate": 8.117948717948718e-06, + "loss": 1.6419, + "step": 41700 + }, + { + "epoch": 0.2656768403523714, + "grad_norm": 0.66796875, + "learning_rate": 8.112820512820514e-06, + "loss": 1.6504, + "step": 41800 + }, + { + "epoch": 0.26631243087953016, + "grad_norm": 1.0859375, + "learning_rate": 8.10769230769231e-06, + "loss": 1.6221, + "step": 41900 + }, + { + "epoch": 0.26694802140668894, + "grad_norm": 0.8671875, + "learning_rate": 8.102564102564103e-06, + "loss": 1.6303, + "step": 42000 + }, + { + "epoch": 0.2675836119338477, + "grad_norm": 0.486328125, + "learning_rate": 8.097435897435897e-06, + "loss": 1.6425, + "step": 42100 + }, + { + "epoch": 0.2682192024610065, + "grad_norm": 1.015625, + "learning_rate": 8.092307692307693e-06, + "loss": 1.6439, + "step": 42200 + }, + { + "epoch": 0.2688547929881653, + "grad_norm": 0.76171875, + "learning_rate": 8.087179487179487e-06, + "loss": 1.6236, + "step": 42300 + }, + { + "epoch": 0.26949038351532406, + "grad_norm": 0.83984375, + "learning_rate": 8.082051282051283e-06, + "loss": 1.6187, + "step": 42400 + }, + { + "epoch": 0.2701259740424829, + "grad_norm": 0.86328125, + "learning_rate": 8.076923076923077e-06, + "loss": 1.6479, + "step": 42500 + }, + { + "epoch": 0.2707615645696417, + "grad_norm": 0.953125, + "learning_rate": 8.071794871794873e-06, + "loss": 1.6291, + "step": 42600 + }, + { + "epoch": 0.27139715509680046, + "grad_norm": 1.0234375, + "learning_rate": 8.066666666666667e-06, + "loss": 1.6303, + "step": 42700 + }, + { + "epoch": 0.27203274562395924, + "grad_norm": 0.83984375, + "learning_rate": 8.061538461538463e-06, + "loss": 1.644, + "step": 42800 + }, + { + "epoch": 0.272668336151118, + "grad_norm": 0.80859375, + "learning_rate": 8.056410256410258e-06, + "loss": 1.618, + "step": 42900 + }, + { + "epoch": 0.2733039266782768, + "grad_norm": 0.74609375, + "learning_rate": 8.051282051282052e-06, + "loss": 1.6366, + "step": 43000 + }, + { + "epoch": 0.2739395172054356, + "grad_norm": 0.80078125, + "learning_rate": 8.046153846153846e-06, + "loss": 1.639, + "step": 43100 + }, + { + "epoch": 0.27457510773259436, + "grad_norm": 0.80078125, + "learning_rate": 8.041025641025642e-06, + "loss": 1.6251, + "step": 43200 + }, + { + "epoch": 0.27521069825975314, + "grad_norm": 0.64453125, + "learning_rate": 8.035897435897436e-06, + "loss": 1.636, + "step": 43300 + }, + { + "epoch": 0.2758462887869119, + "grad_norm": 0.7734375, + "learning_rate": 8.03076923076923e-06, + "loss": 1.6392, + "step": 43400 + }, + { + "epoch": 0.2764818793140707, + "grad_norm": 0.62890625, + "learning_rate": 8.025641025641026e-06, + "loss": 1.6438, + "step": 43500 + }, + { + "epoch": 0.2771174698412295, + "grad_norm": 0.69921875, + "learning_rate": 8.020512820512822e-06, + "loss": 1.6318, + "step": 43600 + }, + { + "epoch": 0.27775306036838826, + "grad_norm": 0.96484375, + "learning_rate": 8.015384615384616e-06, + "loss": 1.6582, + "step": 43700 + }, + { + "epoch": 0.27838865089554704, + "grad_norm": 0.765625, + "learning_rate": 8.01025641025641e-06, + "loss": 1.6218, + "step": 43800 + }, + { + "epoch": 0.2790242414227058, + "grad_norm": 1.421875, + "learning_rate": 8.005128205128206e-06, + "loss": 1.6256, + "step": 43900 + }, + { + "epoch": 0.2796598319498646, + "grad_norm": 1.625, + "learning_rate": 8.000000000000001e-06, + "loss": 1.6213, + "step": 44000 + }, + { + "epoch": 0.2802954224770234, + "grad_norm": 0.859375, + "learning_rate": 7.994871794871795e-06, + "loss": 1.6373, + "step": 44100 + }, + { + "epoch": 0.28093101300418216, + "grad_norm": 0.7734375, + "learning_rate": 7.989743589743591e-06, + "loss": 1.6333, + "step": 44200 + }, + { + "epoch": 0.28156660353134094, + "grad_norm": 0.546875, + "learning_rate": 7.984615384615385e-06, + "loss": 1.6335, + "step": 44300 + }, + { + "epoch": 0.2822021940584998, + "grad_norm": 0.84375, + "learning_rate": 7.97948717948718e-06, + "loss": 1.6411, + "step": 44400 + }, + { + "epoch": 0.28283778458565856, + "grad_norm": 0.6875, + "learning_rate": 7.974358974358975e-06, + "loss": 1.635, + "step": 44500 + }, + { + "epoch": 0.28347337511281734, + "grad_norm": 0.63671875, + "learning_rate": 7.96923076923077e-06, + "loss": 1.6425, + "step": 44600 + }, + { + "epoch": 0.2841089656399761, + "grad_norm": 0.796875, + "learning_rate": 7.964102564102565e-06, + "loss": 1.623, + "step": 44700 + }, + { + "epoch": 0.2847445561671349, + "grad_norm": 0.73828125, + "learning_rate": 7.958974358974359e-06, + "loss": 1.6395, + "step": 44800 + }, + { + "epoch": 0.2853801466942937, + "grad_norm": 0.76953125, + "learning_rate": 7.953846153846155e-06, + "loss": 1.6331, + "step": 44900 + }, + { + "epoch": 0.28601573722145246, + "grad_norm": 0.87890625, + "learning_rate": 7.948717948717949e-06, + "loss": 1.6358, + "step": 45000 + }, + { + "epoch": 0.28665132774861124, + "grad_norm": 0.8359375, + "learning_rate": 7.943589743589744e-06, + "loss": 1.6369, + "step": 45100 + }, + { + "epoch": 0.28728691827577, + "grad_norm": 0.81640625, + "learning_rate": 7.93846153846154e-06, + "loss": 1.6279, + "step": 45200 + }, + { + "epoch": 0.2879225088029288, + "grad_norm": 0.8046875, + "learning_rate": 7.933333333333334e-06, + "loss": 1.6299, + "step": 45300 + }, + { + "epoch": 0.2885580993300876, + "grad_norm": 0.82421875, + "learning_rate": 7.928205128205128e-06, + "loss": 1.6322, + "step": 45400 + }, + { + "epoch": 0.28919368985724636, + "grad_norm": 0.875, + "learning_rate": 7.923076923076924e-06, + "loss": 1.6241, + "step": 45500 + }, + { + "epoch": 0.28982928038440514, + "grad_norm": 0.7734375, + "learning_rate": 7.91794871794872e-06, + "loss": 1.6197, + "step": 45600 + }, + { + "epoch": 0.2904648709115639, + "grad_norm": 0.9140625, + "learning_rate": 7.912820512820514e-06, + "loss": 1.6213, + "step": 45700 + }, + { + "epoch": 0.2911004614387227, + "grad_norm": 0.6875, + "learning_rate": 7.907692307692308e-06, + "loss": 1.6283, + "step": 45800 + }, + { + "epoch": 0.2917360519658815, + "grad_norm": 0.75390625, + "learning_rate": 7.902564102564104e-06, + "loss": 1.6331, + "step": 45900 + }, + { + "epoch": 0.29237164249304026, + "grad_norm": 1.0234375, + "learning_rate": 7.897435897435898e-06, + "loss": 1.6354, + "step": 46000 + }, + { + "epoch": 0.29300723302019904, + "grad_norm": 0.6015625, + "learning_rate": 7.892307692307692e-06, + "loss": 1.6243, + "step": 46100 + }, + { + "epoch": 0.2936428235473578, + "grad_norm": 0.486328125, + "learning_rate": 7.887179487179487e-06, + "loss": 1.6225, + "step": 46200 + }, + { + "epoch": 0.29427841407451666, + "grad_norm": 1.0078125, + "learning_rate": 7.882051282051283e-06, + "loss": 1.626, + "step": 46300 + }, + { + "epoch": 0.29491400460167544, + "grad_norm": 0.6015625, + "learning_rate": 7.876923076923077e-06, + "loss": 1.6093, + "step": 46400 + }, + { + "epoch": 0.2955495951288342, + "grad_norm": 0.84375, + "learning_rate": 7.871794871794873e-06, + "loss": 1.632, + "step": 46500 + }, + { + "epoch": 0.296185185655993, + "grad_norm": 0.7734375, + "learning_rate": 7.866666666666667e-06, + "loss": 1.6322, + "step": 46600 + }, + { + "epoch": 0.2968207761831518, + "grad_norm": 0.7578125, + "learning_rate": 7.861538461538463e-06, + "loss": 1.6305, + "step": 46700 + }, + { + "epoch": 0.29745636671031056, + "grad_norm": 0.79296875, + "learning_rate": 7.856410256410257e-06, + "loss": 1.6245, + "step": 46800 + }, + { + "epoch": 0.29809195723746934, + "grad_norm": 0.98828125, + "learning_rate": 7.851282051282053e-06, + "loss": 1.6194, + "step": 46900 + }, + { + "epoch": 0.2987275477646281, + "grad_norm": 0.6015625, + "learning_rate": 7.846153846153847e-06, + "loss": 1.6272, + "step": 47000 + }, + { + "epoch": 0.2993631382917869, + "grad_norm": 0.82421875, + "learning_rate": 7.84102564102564e-06, + "loss": 1.6377, + "step": 47100 + }, + { + "epoch": 0.2999987288189457, + "grad_norm": 0.91015625, + "learning_rate": 7.835897435897436e-06, + "loss": 1.6276, + "step": 47200 + }, + { + "epoch": 0.30063431934610446, + "grad_norm": 0.796875, + "learning_rate": 7.830769230769232e-06, + "loss": 1.6217, + "step": 47300 + }, + { + "epoch": 0.30126990987326324, + "grad_norm": 0.796875, + "learning_rate": 7.825641025641026e-06, + "loss": 1.6226, + "step": 47400 + }, + { + "epoch": 0.301905500400422, + "grad_norm": 0.78515625, + "learning_rate": 7.820512820512822e-06, + "loss": 1.6194, + "step": 47500 + }, + { + "epoch": 0.3025410909275808, + "grad_norm": 0.83203125, + "learning_rate": 7.815384615384616e-06, + "loss": 1.6229, + "step": 47600 + }, + { + "epoch": 0.3031766814547396, + "grad_norm": 0.63671875, + "learning_rate": 7.81025641025641e-06, + "loss": 1.6346, + "step": 47700 + }, + { + "epoch": 0.30381227198189836, + "grad_norm": 0.61328125, + "learning_rate": 7.805128205128206e-06, + "loss": 1.6313, + "step": 47800 + }, + { + "epoch": 0.30444786250905714, + "grad_norm": 0.65625, + "learning_rate": 7.800000000000002e-06, + "loss": 1.6418, + "step": 47900 + }, + { + "epoch": 0.3050834530362159, + "grad_norm": 0.83984375, + "learning_rate": 7.794871794871796e-06, + "loss": 1.616, + "step": 48000 + }, + { + "epoch": 0.3057190435633747, + "grad_norm": 0.72265625, + "learning_rate": 7.78974358974359e-06, + "loss": 1.6425, + "step": 48100 + }, + { + "epoch": 0.30635463409053354, + "grad_norm": 0.765625, + "learning_rate": 7.784615384615385e-06, + "loss": 1.6228, + "step": 48200 + }, + { + "epoch": 0.3069902246176923, + "grad_norm": 0.66015625, + "learning_rate": 7.77948717948718e-06, + "loss": 1.6098, + "step": 48300 + }, + { + "epoch": 0.3076258151448511, + "grad_norm": 0.7421875, + "learning_rate": 7.774358974358975e-06, + "loss": 1.6483, + "step": 48400 + }, + { + "epoch": 0.3082614056720099, + "grad_norm": 0.859375, + "learning_rate": 7.76923076923077e-06, + "loss": 1.6251, + "step": 48500 + }, + { + "epoch": 0.30889699619916866, + "grad_norm": 0.70703125, + "learning_rate": 7.764102564102565e-06, + "loss": 1.6371, + "step": 48600 + }, + { + "epoch": 0.30953258672632744, + "grad_norm": 0.72265625, + "learning_rate": 7.758974358974359e-06, + "loss": 1.6277, + "step": 48700 + }, + { + "epoch": 0.3101681772534862, + "grad_norm": 0.76171875, + "learning_rate": 7.753846153846155e-06, + "loss": 1.6302, + "step": 48800 + }, + { + "epoch": 0.310803767780645, + "grad_norm": 0.46484375, + "learning_rate": 7.74871794871795e-06, + "loss": 1.6416, + "step": 48900 + }, + { + "epoch": 0.3114393583078038, + "grad_norm": 0.73046875, + "learning_rate": 7.743589743589745e-06, + "loss": 1.6144, + "step": 49000 + }, + { + "epoch": 0.31207494883496256, + "grad_norm": 0.79296875, + "learning_rate": 7.738461538461539e-06, + "loss": 1.6316, + "step": 49100 + }, + { + "epoch": 0.31271053936212134, + "grad_norm": 0.8359375, + "learning_rate": 7.733333333333334e-06, + "loss": 1.6118, + "step": 49200 + }, + { + "epoch": 0.3133461298892801, + "grad_norm": 0.90234375, + "learning_rate": 7.728205128205128e-06, + "loss": 1.6303, + "step": 49300 + }, + { + "epoch": 0.3139817204164389, + "grad_norm": 0.7421875, + "learning_rate": 7.723076923076924e-06, + "loss": 1.6304, + "step": 49400 + }, + { + "epoch": 0.3146173109435977, + "grad_norm": 0.546875, + "learning_rate": 7.717948717948718e-06, + "loss": 1.6298, + "step": 49500 + }, + { + "epoch": 0.31525290147075646, + "grad_norm": 0.474609375, + "learning_rate": 7.712820512820514e-06, + "loss": 1.6223, + "step": 49600 + }, + { + "epoch": 0.31588849199791524, + "grad_norm": 0.76953125, + "learning_rate": 7.707692307692308e-06, + "loss": 1.6282, + "step": 49700 + }, + { + "epoch": 0.316524082525074, + "grad_norm": 0.76171875, + "learning_rate": 7.702564102564102e-06, + "loss": 1.6254, + "step": 49800 + }, + { + "epoch": 0.3171596730522328, + "grad_norm": 0.91796875, + "learning_rate": 7.697435897435898e-06, + "loss": 1.6377, + "step": 49900 + }, + { + "epoch": 0.31779526357939164, + "grad_norm": 0.87109375, + "learning_rate": 7.692307692307694e-06, + "loss": 1.6068, + "step": 50000 + }, + { + "epoch": 0.3184308541065504, + "grad_norm": 0.83203125, + "learning_rate": 7.687179487179488e-06, + "loss": 1.6144, + "step": 50100 + }, + { + "epoch": 0.3190664446337092, + "grad_norm": 0.98828125, + "learning_rate": 7.682051282051283e-06, + "loss": 1.6314, + "step": 50200 + }, + { + "epoch": 0.319702035160868, + "grad_norm": 1.125, + "learning_rate": 7.676923076923077e-06, + "loss": 1.6279, + "step": 50300 + }, + { + "epoch": 0.32033762568802676, + "grad_norm": 0.7421875, + "learning_rate": 7.671794871794871e-06, + "loss": 1.649, + "step": 50400 + }, + { + "epoch": 0.32097321621518554, + "grad_norm": 0.89453125, + "learning_rate": 7.666666666666667e-06, + "loss": 1.6387, + "step": 50500 + }, + { + "epoch": 0.3216088067423443, + "grad_norm": 0.72265625, + "learning_rate": 7.661538461538463e-06, + "loss": 1.627, + "step": 50600 + }, + { + "epoch": 0.3222443972695031, + "grad_norm": 0.6484375, + "learning_rate": 7.656410256410257e-06, + "loss": 1.6261, + "step": 50700 + }, + { + "epoch": 0.3228799877966619, + "grad_norm": 0.60546875, + "learning_rate": 7.651282051282051e-06, + "loss": 1.627, + "step": 50800 + }, + { + "epoch": 0.32351557832382066, + "grad_norm": 0.84765625, + "learning_rate": 7.646153846153847e-06, + "loss": 1.6219, + "step": 50900 + }, + { + "epoch": 0.32415116885097944, + "grad_norm": 0.81640625, + "learning_rate": 7.641025641025641e-06, + "loss": 1.6318, + "step": 51000 + }, + { + "epoch": 0.3247867593781382, + "grad_norm": 0.83203125, + "learning_rate": 7.635897435897437e-06, + "loss": 1.6109, + "step": 51100 + }, + { + "epoch": 0.325422349905297, + "grad_norm": 0.71484375, + "learning_rate": 7.630769230769232e-06, + "loss": 1.6273, + "step": 51200 + }, + { + "epoch": 0.3260579404324558, + "grad_norm": 0.703125, + "learning_rate": 7.6256410256410264e-06, + "loss": 1.6264, + "step": 51300 + }, + { + "epoch": 0.32669353095961456, + "grad_norm": 1.015625, + "learning_rate": 7.620512820512821e-06, + "loss": 1.6306, + "step": 51400 + }, + { + "epoch": 0.32732912148677334, + "grad_norm": 1.015625, + "learning_rate": 7.615384615384615e-06, + "loss": 1.6276, + "step": 51500 + }, + { + "epoch": 0.3279647120139321, + "grad_norm": 1.1640625, + "learning_rate": 7.610256410256411e-06, + "loss": 1.623, + "step": 51600 + }, + { + "epoch": 0.3286003025410909, + "grad_norm": 0.64453125, + "learning_rate": 7.605128205128206e-06, + "loss": 1.641, + "step": 51700 + }, + { + "epoch": 0.3292358930682497, + "grad_norm": 0.98046875, + "learning_rate": 7.600000000000001e-06, + "loss": 1.6255, + "step": 51800 + }, + { + "epoch": 0.3298714835954085, + "grad_norm": 0.9765625, + "learning_rate": 7.594871794871795e-06, + "loss": 1.6192, + "step": 51900 + }, + { + "epoch": 0.3305070741225673, + "grad_norm": 0.75390625, + "learning_rate": 7.58974358974359e-06, + "loss": 1.6209, + "step": 52000 + }, + { + "epoch": 0.3311426646497261, + "grad_norm": 0.95703125, + "learning_rate": 7.584615384615385e-06, + "loss": 1.6305, + "step": 52100 + }, + { + "epoch": 0.33177825517688486, + "grad_norm": 0.91796875, + "learning_rate": 7.5794871794871805e-06, + "loss": 1.6112, + "step": 52200 + }, + { + "epoch": 0.33241384570404364, + "grad_norm": 0.72265625, + "learning_rate": 7.574358974358975e-06, + "loss": 1.6185, + "step": 52300 + }, + { + "epoch": 0.3330494362312024, + "grad_norm": 1.046875, + "learning_rate": 7.5692307692307695e-06, + "loss": 1.6255, + "step": 52400 + }, + { + "epoch": 0.3336850267583612, + "grad_norm": 0.80859375, + "learning_rate": 7.564102564102564e-06, + "loss": 1.6124, + "step": 52500 + }, + { + "epoch": 0.33432061728552, + "grad_norm": 0.56640625, + "learning_rate": 7.558974358974359e-06, + "loss": 1.648, + "step": 52600 + }, + { + "epoch": 0.33495620781267876, + "grad_norm": 1.046875, + "learning_rate": 7.553846153846155e-06, + "loss": 1.6192, + "step": 52700 + }, + { + "epoch": 0.33559179833983754, + "grad_norm": 0.8359375, + "learning_rate": 7.54871794871795e-06, + "loss": 1.6265, + "step": 52800 + }, + { + "epoch": 0.3362273888669963, + "grad_norm": 0.8515625, + "learning_rate": 7.543589743589744e-06, + "loss": 1.6227, + "step": 52900 + }, + { + "epoch": 0.3368629793941551, + "grad_norm": 0.609375, + "learning_rate": 7.538461538461539e-06, + "loss": 1.6165, + "step": 53000 + }, + { + "epoch": 0.3374985699213139, + "grad_norm": 0.7421875, + "learning_rate": 7.533333333333334e-06, + "loss": 1.6203, + "step": 53100 + }, + { + "epoch": 0.33813416044847266, + "grad_norm": 0.72265625, + "learning_rate": 7.528205128205129e-06, + "loss": 1.6253, + "step": 53200 + }, + { + "epoch": 0.33876975097563145, + "grad_norm": 0.53125, + "learning_rate": 7.523076923076924e-06, + "loss": 1.6359, + "step": 53300 + }, + { + "epoch": 0.3394053415027902, + "grad_norm": 0.7265625, + "learning_rate": 7.5179487179487185e-06, + "loss": 1.6145, + "step": 53400 + }, + { + "epoch": 0.340040932029949, + "grad_norm": 0.9296875, + "learning_rate": 7.512820512820513e-06, + "loss": 1.6269, + "step": 53500 + }, + { + "epoch": 0.3406765225571078, + "grad_norm": 0.66015625, + "learning_rate": 7.507692307692308e-06, + "loss": 1.6113, + "step": 53600 + }, + { + "epoch": 0.34131211308426657, + "grad_norm": 0.7578125, + "learning_rate": 7.502564102564102e-06, + "loss": 1.6376, + "step": 53700 + }, + { + "epoch": 0.3419477036114254, + "grad_norm": 0.81640625, + "learning_rate": 7.497435897435899e-06, + "loss": 1.634, + "step": 53800 + }, + { + "epoch": 0.3425832941385842, + "grad_norm": 0.6796875, + "learning_rate": 7.492307692307693e-06, + "loss": 1.6335, + "step": 53900 + }, + { + "epoch": 0.34321888466574296, + "grad_norm": 0.62109375, + "learning_rate": 7.487179487179488e-06, + "loss": 1.6334, + "step": 54000 + }, + { + "epoch": 0.34385447519290174, + "grad_norm": 0.734375, + "learning_rate": 7.482051282051283e-06, + "loss": 1.6347, + "step": 54100 + }, + { + "epoch": 0.3444900657200605, + "grad_norm": 0.58984375, + "learning_rate": 7.476923076923077e-06, + "loss": 1.6233, + "step": 54200 + }, + { + "epoch": 0.3451256562472193, + "grad_norm": 1.0703125, + "learning_rate": 7.4717948717948726e-06, + "loss": 1.6291, + "step": 54300 + }, + { + "epoch": 0.3457612467743781, + "grad_norm": 0.796875, + "learning_rate": 7.4666666666666675e-06, + "loss": 1.6182, + "step": 54400 + }, + { + "epoch": 0.34639683730153686, + "grad_norm": 1.4609375, + "learning_rate": 7.461538461538462e-06, + "loss": 1.6365, + "step": 54500 + }, + { + "epoch": 0.34703242782869564, + "grad_norm": 1.0078125, + "learning_rate": 7.456410256410257e-06, + "loss": 1.623, + "step": 54600 + }, + { + "epoch": 0.3476680183558544, + "grad_norm": 0.89453125, + "learning_rate": 7.451282051282051e-06, + "loss": 1.6104, + "step": 54700 + }, + { + "epoch": 0.3483036088830132, + "grad_norm": 1.2890625, + "learning_rate": 7.446153846153846e-06, + "loss": 1.6203, + "step": 54800 + }, + { + "epoch": 0.348939199410172, + "grad_norm": 0.63671875, + "learning_rate": 7.441025641025642e-06, + "loss": 1.6089, + "step": 54900 + }, + { + "epoch": 0.34957478993733077, + "grad_norm": 0.78515625, + "learning_rate": 7.435897435897437e-06, + "loss": 1.6191, + "step": 55000 + }, + { + "epoch": 0.35021038046448955, + "grad_norm": 0.65234375, + "learning_rate": 7.430769230769232e-06, + "loss": 1.626, + "step": 55100 + }, + { + "epoch": 0.3508459709916483, + "grad_norm": 0.796875, + "learning_rate": 7.425641025641026e-06, + "loss": 1.6189, + "step": 55200 + }, + { + "epoch": 0.3514815615188071, + "grad_norm": 0.5546875, + "learning_rate": 7.420512820512821e-06, + "loss": 1.6273, + "step": 55300 + }, + { + "epoch": 0.3521171520459659, + "grad_norm": 0.94140625, + "learning_rate": 7.4153846153846164e-06, + "loss": 1.6278, + "step": 55400 + }, + { + "epoch": 0.35275274257312467, + "grad_norm": 0.75390625, + "learning_rate": 7.410256410256411e-06, + "loss": 1.6254, + "step": 55500 + }, + { + "epoch": 0.35338833310028345, + "grad_norm": 0.5078125, + "learning_rate": 7.405128205128206e-06, + "loss": 1.6146, + "step": 55600 + }, + { + "epoch": 0.3540239236274423, + "grad_norm": 0.58984375, + "learning_rate": 7.4e-06, + "loss": 1.6295, + "step": 55700 + }, + { + "epoch": 0.35465951415460106, + "grad_norm": 0.94921875, + "learning_rate": 7.394871794871795e-06, + "loss": 1.6237, + "step": 55800 + }, + { + "epoch": 0.35529510468175984, + "grad_norm": 0.953125, + "learning_rate": 7.38974358974359e-06, + "loss": 1.619, + "step": 55900 + }, + { + "epoch": 0.3559306952089186, + "grad_norm": 0.62109375, + "learning_rate": 7.384615384615386e-06, + "loss": 1.631, + "step": 56000 + }, + { + "epoch": 0.3565662857360774, + "grad_norm": 0.75390625, + "learning_rate": 7.37948717948718e-06, + "loss": 1.6142, + "step": 56100 + }, + { + "epoch": 0.3572018762632362, + "grad_norm": 0.7421875, + "learning_rate": 7.374358974358975e-06, + "loss": 1.619, + "step": 56200 + }, + { + "epoch": 0.35783746679039496, + "grad_norm": 1.140625, + "learning_rate": 7.36923076923077e-06, + "loss": 1.6294, + "step": 56300 + }, + { + "epoch": 0.35847305731755374, + "grad_norm": 0.93359375, + "learning_rate": 7.364102564102565e-06, + "loss": 1.6206, + "step": 56400 + }, + { + "epoch": 0.3591086478447125, + "grad_norm": 0.59765625, + "learning_rate": 7.35897435897436e-06, + "loss": 1.646, + "step": 56500 + }, + { + "epoch": 0.3597442383718713, + "grad_norm": 1.09375, + "learning_rate": 7.353846153846154e-06, + "loss": 1.6298, + "step": 56600 + }, + { + "epoch": 0.3603798288990301, + "grad_norm": 0.9765625, + "learning_rate": 7.348717948717949e-06, + "loss": 1.6214, + "step": 56700 + }, + { + "epoch": 0.36101541942618887, + "grad_norm": 0.50390625, + "learning_rate": 7.343589743589744e-06, + "loss": 1.6305, + "step": 56800 + }, + { + "epoch": 0.36165100995334765, + "grad_norm": 0.59765625, + "learning_rate": 7.338461538461539e-06, + "loss": 1.6268, + "step": 56900 + }, + { + "epoch": 0.3622866004805064, + "grad_norm": 0.6484375, + "learning_rate": 7.333333333333333e-06, + "loss": 1.6267, + "step": 57000 + }, + { + "epoch": 0.3629221910076652, + "grad_norm": 0.953125, + "learning_rate": 7.328205128205129e-06, + "loss": 1.6278, + "step": 57100 + }, + { + "epoch": 0.363557781534824, + "grad_norm": 0.76171875, + "learning_rate": 7.323076923076924e-06, + "loss": 1.612, + "step": 57200 + }, + { + "epoch": 0.36419337206198277, + "grad_norm": 0.8984375, + "learning_rate": 7.317948717948719e-06, + "loss": 1.6283, + "step": 57300 + }, + { + "epoch": 0.36482896258914155, + "grad_norm": 0.9765625, + "learning_rate": 7.312820512820514e-06, + "loss": 1.6327, + "step": 57400 + }, + { + "epoch": 0.36546455311630033, + "grad_norm": 1.046875, + "learning_rate": 7.307692307692308e-06, + "loss": 1.6393, + "step": 57500 + }, + { + "epoch": 0.36610014364345916, + "grad_norm": 0.98828125, + "learning_rate": 7.302564102564103e-06, + "loss": 1.6296, + "step": 57600 + }, + { + "epoch": 0.36673573417061794, + "grad_norm": 0.98828125, + "learning_rate": 7.297435897435898e-06, + "loss": 1.6276, + "step": 57700 + }, + { + "epoch": 0.3673713246977767, + "grad_norm": 0.6796875, + "learning_rate": 7.292307692307693e-06, + "loss": 1.6174, + "step": 57800 + }, + { + "epoch": 0.3680069152249355, + "grad_norm": 1.1171875, + "learning_rate": 7.287179487179487e-06, + "loss": 1.6387, + "step": 57900 + }, + { + "epoch": 0.3686425057520943, + "grad_norm": 0.69140625, + "learning_rate": 7.282051282051282e-06, + "loss": 1.6251, + "step": 58000 + }, + { + "epoch": 0.36927809627925307, + "grad_norm": 0.458984375, + "learning_rate": 7.276923076923077e-06, + "loss": 1.6321, + "step": 58100 + }, + { + "epoch": 0.36991368680641185, + "grad_norm": 0.68359375, + "learning_rate": 7.271794871794873e-06, + "loss": 1.6283, + "step": 58200 + }, + { + "epoch": 0.3705492773335706, + "grad_norm": 0.62890625, + "learning_rate": 7.266666666666668e-06, + "loss": 1.6319, + "step": 58300 + }, + { + "epoch": 0.3711848678607294, + "grad_norm": 0.79296875, + "learning_rate": 7.261538461538462e-06, + "loss": 1.6195, + "step": 58400 + }, + { + "epoch": 0.3718204583878882, + "grad_norm": 0.62109375, + "learning_rate": 7.256410256410257e-06, + "loss": 1.6246, + "step": 58500 + }, + { + "epoch": 0.37245604891504697, + "grad_norm": 0.80078125, + "learning_rate": 7.2512820512820515e-06, + "loss": 1.61, + "step": 58600 + }, + { + "epoch": 0.37309163944220575, + "grad_norm": 0.76953125, + "learning_rate": 7.246153846153847e-06, + "loss": 1.6302, + "step": 58700 + }, + { + "epoch": 0.3737272299693645, + "grad_norm": 0.6953125, + "learning_rate": 7.241025641025642e-06, + "loss": 1.6367, + "step": 58800 + }, + { + "epoch": 0.3743628204965233, + "grad_norm": 0.78125, + "learning_rate": 7.235897435897436e-06, + "loss": 1.6333, + "step": 58900 + }, + { + "epoch": 0.3749984110236821, + "grad_norm": 0.90234375, + "learning_rate": 7.230769230769231e-06, + "loss": 1.6266, + "step": 59000 + }, + { + "epoch": 0.37563400155084087, + "grad_norm": 0.62890625, + "learning_rate": 7.225641025641026e-06, + "loss": 1.6174, + "step": 59100 + }, + { + "epoch": 0.37626959207799965, + "grad_norm": 0.87890625, + "learning_rate": 7.220512820512822e-06, + "loss": 1.6243, + "step": 59200 + }, + { + "epoch": 0.37690518260515843, + "grad_norm": 0.97265625, + "learning_rate": 7.215384615384617e-06, + "loss": 1.6376, + "step": 59300 + }, + { + "epoch": 0.3775407731323172, + "grad_norm": 1.03125, + "learning_rate": 7.210256410256411e-06, + "loss": 1.6324, + "step": 59400 + }, + { + "epoch": 0.37817636365947604, + "grad_norm": 0.80078125, + "learning_rate": 7.205128205128206e-06, + "loss": 1.6226, + "step": 59500 + }, + { + "epoch": 0.3788119541866348, + "grad_norm": 0.87890625, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.6299, + "step": 59600 + }, + { + "epoch": 0.3794475447137936, + "grad_norm": 0.734375, + "learning_rate": 7.1948717948717946e-06, + "loss": 1.6221, + "step": 59700 + }, + { + "epoch": 0.3800831352409524, + "grad_norm": 0.74609375, + "learning_rate": 7.189743589743591e-06, + "loss": 1.6129, + "step": 59800 + }, + { + "epoch": 0.38071872576811117, + "grad_norm": 0.455078125, + "learning_rate": 7.184615384615385e-06, + "loss": 1.626, + "step": 59900 + }, + { + "epoch": 0.38135431629526995, + "grad_norm": 0.52734375, + "learning_rate": 7.17948717948718e-06, + "loss": 1.6091, + "step": 60000 + }, + { + "epoch": 0.3819899068224287, + "grad_norm": 0.466796875, + "learning_rate": 7.174358974358975e-06, + "loss": 1.619, + "step": 60100 + }, + { + "epoch": 0.3826254973495875, + "grad_norm": 0.79296875, + "learning_rate": 7.169230769230769e-06, + "loss": 1.624, + "step": 60200 + }, + { + "epoch": 0.3832610878767463, + "grad_norm": 0.4921875, + "learning_rate": 7.164102564102565e-06, + "loss": 1.6376, + "step": 60300 + }, + { + "epoch": 0.38389667840390507, + "grad_norm": 0.6171875, + "learning_rate": 7.15897435897436e-06, + "loss": 1.6373, + "step": 60400 + }, + { + "epoch": 0.38453226893106385, + "grad_norm": 0.58203125, + "learning_rate": 7.153846153846155e-06, + "loss": 1.6022, + "step": 60500 + }, + { + "epoch": 0.38516785945822263, + "grad_norm": 0.64453125, + "learning_rate": 7.1487179487179495e-06, + "loss": 1.626, + "step": 60600 + }, + { + "epoch": 0.3858034499853814, + "grad_norm": 0.68359375, + "learning_rate": 7.1435897435897436e-06, + "loss": 1.6359, + "step": 60700 + }, + { + "epoch": 0.3864390405125402, + "grad_norm": 0.75390625, + "learning_rate": 7.1384615384615385e-06, + "loss": 1.6121, + "step": 60800 + }, + { + "epoch": 0.38707463103969897, + "grad_norm": 0.55859375, + "learning_rate": 7.133333333333334e-06, + "loss": 1.6308, + "step": 60900 + }, + { + "epoch": 0.38771022156685775, + "grad_norm": 0.7578125, + "learning_rate": 7.128205128205129e-06, + "loss": 1.6218, + "step": 61000 + }, + { + "epoch": 0.38834581209401653, + "grad_norm": 0.921875, + "learning_rate": 7.123076923076924e-06, + "loss": 1.6201, + "step": 61100 + }, + { + "epoch": 0.3889814026211753, + "grad_norm": 0.77734375, + "learning_rate": 7.117948717948718e-06, + "loss": 1.6238, + "step": 61200 + }, + { + "epoch": 0.3896169931483341, + "grad_norm": 0.72265625, + "learning_rate": 7.112820512820513e-06, + "loss": 1.6237, + "step": 61300 + }, + { + "epoch": 0.3902525836754929, + "grad_norm": 0.4609375, + "learning_rate": 7.107692307692309e-06, + "loss": 1.6169, + "step": 61400 + }, + { + "epoch": 0.3908881742026517, + "grad_norm": 0.7265625, + "learning_rate": 7.102564102564104e-06, + "loss": 1.6405, + "step": 61500 + }, + { + "epoch": 0.3915237647298105, + "grad_norm": 1.1953125, + "learning_rate": 7.0974358974358985e-06, + "loss": 1.6304, + "step": 61600 + }, + { + "epoch": 0.39215935525696927, + "grad_norm": 1.0625, + "learning_rate": 7.0923076923076926e-06, + "loss": 1.6279, + "step": 61700 + }, + { + "epoch": 0.39279494578412805, + "grad_norm": 0.84765625, + "learning_rate": 7.0871794871794875e-06, + "loss": 1.6094, + "step": 61800 + }, + { + "epoch": 0.3934305363112868, + "grad_norm": 0.65234375, + "learning_rate": 7.082051282051282e-06, + "loss": 1.6176, + "step": 61900 + }, + { + "epoch": 0.3940661268384456, + "grad_norm": 0.98046875, + "learning_rate": 7.076923076923078e-06, + "loss": 1.6207, + "step": 62000 + }, + { + "epoch": 0.3947017173656044, + "grad_norm": 1.0546875, + "learning_rate": 7.071794871794872e-06, + "loss": 1.6209, + "step": 62100 + }, + { + "epoch": 0.39533730789276317, + "grad_norm": 0.734375, + "learning_rate": 7.066666666666667e-06, + "loss": 1.6333, + "step": 62200 + }, + { + "epoch": 0.39597289841992195, + "grad_norm": 1.2265625, + "learning_rate": 7.061538461538462e-06, + "loss": 1.6369, + "step": 62300 + }, + { + "epoch": 0.39660848894708073, + "grad_norm": 0.75390625, + "learning_rate": 7.056410256410257e-06, + "loss": 1.6412, + "step": 62400 + }, + { + "epoch": 0.3972440794742395, + "grad_norm": 0.77734375, + "learning_rate": 7.051282051282053e-06, + "loss": 1.6256, + "step": 62500 + }, + { + "epoch": 0.3978796700013983, + "grad_norm": 0.66796875, + "learning_rate": 7.046153846153847e-06, + "loss": 1.6118, + "step": 62600 + }, + { + "epoch": 0.39851526052855707, + "grad_norm": 0.84375, + "learning_rate": 7.0410256410256415e-06, + "loss": 1.6338, + "step": 62700 + }, + { + "epoch": 0.39915085105571585, + "grad_norm": 1.5078125, + "learning_rate": 7.0358974358974364e-06, + "loss": 1.6091, + "step": 62800 + }, + { + "epoch": 0.39978644158287463, + "grad_norm": 0.59375, + "learning_rate": 7.030769230769231e-06, + "loss": 1.6314, + "step": 62900 + }, + { + "epoch": 0.4004220321100334, + "grad_norm": 0.82421875, + "learning_rate": 7.025641025641025e-06, + "loss": 1.6059, + "step": 63000 + }, + { + "epoch": 0.4010576226371922, + "grad_norm": 0.859375, + "learning_rate": 7.020512820512821e-06, + "loss": 1.6246, + "step": 63100 + }, + { + "epoch": 0.401693213164351, + "grad_norm": 1.1953125, + "learning_rate": 7.015384615384616e-06, + "loss": 1.6295, + "step": 63200 + }, + { + "epoch": 0.4023288036915098, + "grad_norm": 0.70703125, + "learning_rate": 7.010256410256411e-06, + "loss": 1.6275, + "step": 63300 + }, + { + "epoch": 0.4029643942186686, + "grad_norm": 0.98046875, + "learning_rate": 7.005128205128206e-06, + "loss": 1.6077, + "step": 63400 + }, + { + "epoch": 0.40359998474582737, + "grad_norm": 0.64453125, + "learning_rate": 7e-06, + "loss": 1.6033, + "step": 63500 + }, + { + "epoch": 0.40423557527298615, + "grad_norm": 1.0859375, + "learning_rate": 6.994871794871796e-06, + "loss": 1.6069, + "step": 63600 + }, + { + "epoch": 0.4048711658001449, + "grad_norm": 0.73828125, + "learning_rate": 6.9897435897435905e-06, + "loss": 1.6142, + "step": 63700 + }, + { + "epoch": 0.4055067563273037, + "grad_norm": 0.5546875, + "learning_rate": 6.9846153846153854e-06, + "loss": 1.6277, + "step": 63800 + }, + { + "epoch": 0.4061423468544625, + "grad_norm": 0.6953125, + "learning_rate": 6.9794871794871795e-06, + "loss": 1.6277, + "step": 63900 + }, + { + "epoch": 0.40677793738162127, + "grad_norm": 0.734375, + "learning_rate": 6.974358974358974e-06, + "loss": 1.6231, + "step": 64000 + }, + { + "epoch": 0.40741352790878005, + "grad_norm": 0.7734375, + "learning_rate": 6.96923076923077e-06, + "loss": 1.6271, + "step": 64100 + }, + { + "epoch": 0.40804911843593883, + "grad_norm": 0.69140625, + "learning_rate": 6.964102564102565e-06, + "loss": 1.619, + "step": 64200 + }, + { + "epoch": 0.4086847089630976, + "grad_norm": 1.1015625, + "learning_rate": 6.95897435897436e-06, + "loss": 1.6364, + "step": 64300 + }, + { + "epoch": 0.4093202994902564, + "grad_norm": 1.0078125, + "learning_rate": 6.953846153846154e-06, + "loss": 1.6403, + "step": 64400 + }, + { + "epoch": 0.40995589001741517, + "grad_norm": 0.734375, + "learning_rate": 6.948717948717949e-06, + "loss": 1.6099, + "step": 64500 + }, + { + "epoch": 0.41059148054457395, + "grad_norm": 0.91796875, + "learning_rate": 6.943589743589744e-06, + "loss": 1.6325, + "step": 64600 + }, + { + "epoch": 0.41122707107173273, + "grad_norm": 0.96484375, + "learning_rate": 6.9384615384615395e-06, + "loss": 1.6174, + "step": 64700 + }, + { + "epoch": 0.4118626615988915, + "grad_norm": 0.8828125, + "learning_rate": 6.9333333333333344e-06, + "loss": 1.6233, + "step": 64800 + }, + { + "epoch": 0.4124982521260503, + "grad_norm": 0.94921875, + "learning_rate": 6.9282051282051285e-06, + "loss": 1.6225, + "step": 64900 + }, + { + "epoch": 0.41313384265320907, + "grad_norm": 1.0234375, + "learning_rate": 6.923076923076923e-06, + "loss": 1.6309, + "step": 65000 + }, + { + "epoch": 0.4137694331803679, + "grad_norm": 0.59375, + "learning_rate": 6.917948717948718e-06, + "loss": 1.6209, + "step": 65100 + }, + { + "epoch": 0.4144050237075267, + "grad_norm": 0.70703125, + "learning_rate": 6.912820512820514e-06, + "loss": 1.6248, + "step": 65200 + }, + { + "epoch": 0.41504061423468547, + "grad_norm": 0.76953125, + "learning_rate": 6.907692307692309e-06, + "loss": 1.624, + "step": 65300 + }, + { + "epoch": 0.41567620476184425, + "grad_norm": 1.125, + "learning_rate": 6.902564102564103e-06, + "loss": 1.6319, + "step": 65400 + }, + { + "epoch": 0.41631179528900303, + "grad_norm": 0.8046875, + "learning_rate": 6.897435897435898e-06, + "loss": 1.6336, + "step": 65500 + }, + { + "epoch": 0.4169473858161618, + "grad_norm": 0.67578125, + "learning_rate": 6.892307692307693e-06, + "loss": 1.5998, + "step": 65600 + }, + { + "epoch": 0.4175829763433206, + "grad_norm": 0.5703125, + "learning_rate": 6.887179487179488e-06, + "loss": 1.6258, + "step": 65700 + }, + { + "epoch": 0.41821856687047937, + "grad_norm": 0.8515625, + "learning_rate": 6.882051282051283e-06, + "loss": 1.62, + "step": 65800 + }, + { + "epoch": 0.41885415739763815, + "grad_norm": 1.1484375, + "learning_rate": 6.8769230769230775e-06, + "loss": 1.6249, + "step": 65900 + }, + { + "epoch": 0.41948974792479693, + "grad_norm": 0.83203125, + "learning_rate": 6.871794871794872e-06, + "loss": 1.6178, + "step": 66000 + }, + { + "epoch": 0.4201253384519557, + "grad_norm": 0.8828125, + "learning_rate": 6.866666666666667e-06, + "loss": 1.623, + "step": 66100 + }, + { + "epoch": 0.4207609289791145, + "grad_norm": 0.7421875, + "learning_rate": 6.861538461538461e-06, + "loss": 1.6172, + "step": 66200 + }, + { + "epoch": 0.42139651950627327, + "grad_norm": 0.8671875, + "learning_rate": 6.856410256410257e-06, + "loss": 1.6335, + "step": 66300 + }, + { + "epoch": 0.42203211003343205, + "grad_norm": 1.03125, + "learning_rate": 6.851282051282052e-06, + "loss": 1.6238, + "step": 66400 + }, + { + "epoch": 0.42266770056059083, + "grad_norm": 0.7890625, + "learning_rate": 6.846153846153847e-06, + "loss": 1.6213, + "step": 66500 + }, + { + "epoch": 0.4233032910877496, + "grad_norm": 0.9453125, + "learning_rate": 6.841025641025642e-06, + "loss": 1.608, + "step": 66600 + }, + { + "epoch": 0.4239388816149084, + "grad_norm": 0.7109375, + "learning_rate": 6.835897435897436e-06, + "loss": 1.6241, + "step": 66700 + }, + { + "epoch": 0.42457447214206717, + "grad_norm": 0.7265625, + "learning_rate": 6.830769230769231e-06, + "loss": 1.6229, + "step": 66800 + }, + { + "epoch": 0.42521006266922595, + "grad_norm": 0.7890625, + "learning_rate": 6.8256410256410265e-06, + "loss": 1.5989, + "step": 66900 + }, + { + "epoch": 0.4258456531963848, + "grad_norm": 0.82421875, + "learning_rate": 6.820512820512821e-06, + "loss": 1.616, + "step": 67000 + }, + { + "epoch": 0.42648124372354357, + "grad_norm": 2.59375, + "learning_rate": 6.815384615384616e-06, + "loss": 1.6209, + "step": 67100 + }, + { + "epoch": 0.42711683425070235, + "grad_norm": 0.84375, + "learning_rate": 6.81025641025641e-06, + "loss": 1.6152, + "step": 67200 + }, + { + "epoch": 0.42775242477786113, + "grad_norm": 0.89453125, + "learning_rate": 6.805128205128205e-06, + "loss": 1.6277, + "step": 67300 + }, + { + "epoch": 0.4283880153050199, + "grad_norm": 0.8828125, + "learning_rate": 6.800000000000001e-06, + "loss": 1.6131, + "step": 67400 + }, + { + "epoch": 0.4290236058321787, + "grad_norm": 0.625, + "learning_rate": 6.794871794871796e-06, + "loss": 1.6216, + "step": 67500 + }, + { + "epoch": 0.42965919635933747, + "grad_norm": 0.76171875, + "learning_rate": 6.789743589743591e-06, + "loss": 1.641, + "step": 67600 + }, + { + "epoch": 0.43029478688649625, + "grad_norm": 0.89453125, + "learning_rate": 6.784615384615385e-06, + "loss": 1.6123, + "step": 67700 + }, + { + "epoch": 0.43093037741365503, + "grad_norm": 0.97265625, + "learning_rate": 6.77948717948718e-06, + "loss": 1.6135, + "step": 67800 + }, + { + "epoch": 0.4315659679408138, + "grad_norm": 0.6953125, + "learning_rate": 6.774358974358975e-06, + "loss": 1.6264, + "step": 67900 + }, + { + "epoch": 0.4322015584679726, + "grad_norm": 0.83203125, + "learning_rate": 6.76923076923077e-06, + "loss": 1.625, + "step": 68000 + }, + { + "epoch": 0.43283714899513137, + "grad_norm": 0.7734375, + "learning_rate": 6.764102564102564e-06, + "loss": 1.6164, + "step": 68100 + }, + { + "epoch": 0.43347273952229015, + "grad_norm": 0.890625, + "learning_rate": 6.758974358974359e-06, + "loss": 1.6189, + "step": 68200 + }, + { + "epoch": 0.43410833004944893, + "grad_norm": 0.84765625, + "learning_rate": 6.753846153846154e-06, + "loss": 1.6089, + "step": 68300 + }, + { + "epoch": 0.4347439205766077, + "grad_norm": 1.0546875, + "learning_rate": 6.748717948717949e-06, + "loss": 1.624, + "step": 68400 + }, + { + "epoch": 0.4353795111037665, + "grad_norm": 0.57421875, + "learning_rate": 6.743589743589745e-06, + "loss": 1.6255, + "step": 68500 + }, + { + "epoch": 0.4360151016309253, + "grad_norm": 0.55859375, + "learning_rate": 6.738461538461539e-06, + "loss": 1.6231, + "step": 68600 + }, + { + "epoch": 0.43665069215808405, + "grad_norm": 0.80859375, + "learning_rate": 6.733333333333334e-06, + "loss": 1.6272, + "step": 68700 + }, + { + "epoch": 0.43728628268524283, + "grad_norm": 0.67578125, + "learning_rate": 6.728205128205129e-06, + "loss": 1.615, + "step": 68800 + }, + { + "epoch": 0.43792187321240167, + "grad_norm": 1.171875, + "learning_rate": 6.723076923076924e-06, + "loss": 1.602, + "step": 68900 + }, + { + "epoch": 0.43855746373956045, + "grad_norm": 0.87109375, + "learning_rate": 6.717948717948718e-06, + "loss": 1.6188, + "step": 69000 + }, + { + "epoch": 0.43919305426671923, + "grad_norm": 0.54296875, + "learning_rate": 6.712820512820513e-06, + "loss": 1.6165, + "step": 69100 + }, + { + "epoch": 0.439828644793878, + "grad_norm": 0.62890625, + "learning_rate": 6.707692307692308e-06, + "loss": 1.6149, + "step": 69200 + }, + { + "epoch": 0.4404642353210368, + "grad_norm": 0.7109375, + "learning_rate": 6.702564102564103e-06, + "loss": 1.6197, + "step": 69300 + }, + { + "epoch": 0.44109982584819557, + "grad_norm": 0.73046875, + "learning_rate": 6.697435897435898e-06, + "loss": 1.6023, + "step": 69400 + }, + { + "epoch": 0.44173541637535435, + "grad_norm": 0.92578125, + "learning_rate": 6.692307692307692e-06, + "loss": 1.6237, + "step": 69500 + }, + { + "epoch": 0.44237100690251313, + "grad_norm": 0.96875, + "learning_rate": 6.687179487179488e-06, + "loss": 1.6323, + "step": 69600 + }, + { + "epoch": 0.4430065974296719, + "grad_norm": 1.03125, + "learning_rate": 6.682051282051283e-06, + "loss": 1.6327, + "step": 69700 + }, + { + "epoch": 0.4436421879568307, + "grad_norm": 0.78515625, + "learning_rate": 6.676923076923078e-06, + "loss": 1.6022, + "step": 69800 + }, + { + "epoch": 0.44427777848398947, + "grad_norm": 1.0078125, + "learning_rate": 6.671794871794873e-06, + "loss": 1.612, + "step": 69900 + }, + { + "epoch": 0.44491336901114825, + "grad_norm": 0.5859375, + "learning_rate": 6.666666666666667e-06, + "loss": 1.6142, + "step": 70000 + }, + { + "epoch": 0.44554895953830703, + "grad_norm": 0.671875, + "learning_rate": 6.661538461538462e-06, + "loss": 1.6161, + "step": 70100 + }, + { + "epoch": 0.4461845500654658, + "grad_norm": 1.1875, + "learning_rate": 6.656410256410257e-06, + "loss": 1.6224, + "step": 70200 + }, + { + "epoch": 0.4468201405926246, + "grad_norm": 0.83203125, + "learning_rate": 6.651282051282052e-06, + "loss": 1.6251, + "step": 70300 + }, + { + "epoch": 0.4474557311197834, + "grad_norm": 0.85546875, + "learning_rate": 6.646153846153846e-06, + "loss": 1.6293, + "step": 70400 + }, + { + "epoch": 0.44809132164694215, + "grad_norm": 0.671875, + "learning_rate": 6.641025641025641e-06, + "loss": 1.6059, + "step": 70500 + }, + { + "epoch": 0.44872691217410093, + "grad_norm": 0.671875, + "learning_rate": 6.635897435897436e-06, + "loss": 1.6126, + "step": 70600 + }, + { + "epoch": 0.4493625027012597, + "grad_norm": 0.953125, + "learning_rate": 6.630769230769232e-06, + "loss": 1.6285, + "step": 70700 + }, + { + "epoch": 0.44999809322841855, + "grad_norm": 0.8515625, + "learning_rate": 6.625641025641027e-06, + "loss": 1.6192, + "step": 70800 + }, + { + "epoch": 0.45063368375557733, + "grad_norm": 0.98046875, + "learning_rate": 6.620512820512821e-06, + "loss": 1.6384, + "step": 70900 + }, + { + "epoch": 0.4512692742827361, + "grad_norm": 0.91015625, + "learning_rate": 6.615384615384616e-06, + "loss": 1.6262, + "step": 71000 + }, + { + "epoch": 0.4519048648098949, + "grad_norm": 0.79296875, + "learning_rate": 6.6102564102564105e-06, + "loss": 1.6247, + "step": 71100 + }, + { + "epoch": 0.45254045533705367, + "grad_norm": 0.6484375, + "learning_rate": 6.605128205128206e-06, + "loss": 1.6194, + "step": 71200 + }, + { + "epoch": 0.45317604586421245, + "grad_norm": 0.79296875, + "learning_rate": 6.600000000000001e-06, + "loss": 1.6058, + "step": 71300 + }, + { + "epoch": 0.45381163639137123, + "grad_norm": 0.78515625, + "learning_rate": 6.594871794871795e-06, + "loss": 1.6114, + "step": 71400 + }, + { + "epoch": 0.45444722691853, + "grad_norm": 0.984375, + "learning_rate": 6.58974358974359e-06, + "loss": 1.6256, + "step": 71500 + }, + { + "epoch": 0.4550828174456888, + "grad_norm": 0.78125, + "learning_rate": 6.584615384615385e-06, + "loss": 1.6191, + "step": 71600 + }, + { + "epoch": 0.4557184079728476, + "grad_norm": 0.7109375, + "learning_rate": 6.57948717948718e-06, + "loss": 1.6058, + "step": 71700 + }, + { + "epoch": 0.45635399850000635, + "grad_norm": 0.89453125, + "learning_rate": 6.574358974358976e-06, + "loss": 1.6137, + "step": 71800 + }, + { + "epoch": 0.45698958902716513, + "grad_norm": 1.0078125, + "learning_rate": 6.56923076923077e-06, + "loss": 1.6105, + "step": 71900 + }, + { + "epoch": 0.4576251795543239, + "grad_norm": 1.09375, + "learning_rate": 6.564102564102565e-06, + "loss": 1.6335, + "step": 72000 + }, + { + "epoch": 0.4582607700814827, + "grad_norm": 0.8203125, + "learning_rate": 6.5589743589743595e-06, + "loss": 1.6285, + "step": 72100 + }, + { + "epoch": 0.4588963606086415, + "grad_norm": 0.671875, + "learning_rate": 6.553846153846154e-06, + "loss": 1.6156, + "step": 72200 + }, + { + "epoch": 0.45953195113580025, + "grad_norm": 0.87109375, + "learning_rate": 6.548717948717949e-06, + "loss": 1.605, + "step": 72300 + }, + { + "epoch": 0.46016754166295903, + "grad_norm": 1.0546875, + "learning_rate": 6.543589743589744e-06, + "loss": 1.6195, + "step": 72400 + }, + { + "epoch": 0.4608031321901178, + "grad_norm": 0.61328125, + "learning_rate": 6.538461538461539e-06, + "loss": 1.6225, + "step": 72500 + }, + { + "epoch": 0.4614387227172766, + "grad_norm": 1.1640625, + "learning_rate": 6.533333333333334e-06, + "loss": 1.6157, + "step": 72600 + }, + { + "epoch": 0.46207431324443543, + "grad_norm": 0.90625, + "learning_rate": 6.528205128205128e-06, + "loss": 1.6234, + "step": 72700 + }, + { + "epoch": 0.4627099037715942, + "grad_norm": 0.91796875, + "learning_rate": 6.523076923076923e-06, + "loss": 1.6382, + "step": 72800 + }, + { + "epoch": 0.463345494298753, + "grad_norm": 0.78515625, + "learning_rate": 6.517948717948719e-06, + "loss": 1.6067, + "step": 72900 + }, + { + "epoch": 0.46398108482591177, + "grad_norm": 0.84765625, + "learning_rate": 6.512820512820514e-06, + "loss": 1.6205, + "step": 73000 + }, + { + "epoch": 0.46461667535307055, + "grad_norm": 0.96875, + "learning_rate": 6.5076923076923085e-06, + "loss": 1.6172, + "step": 73100 + }, + { + "epoch": 0.46525226588022933, + "grad_norm": 0.71484375, + "learning_rate": 6.5025641025641026e-06, + "loss": 1.6187, + "step": 73200 + }, + { + "epoch": 0.4658878564073881, + "grad_norm": 0.859375, + "learning_rate": 6.4974358974358975e-06, + "loss": 1.6047, + "step": 73300 + }, + { + "epoch": 0.4665234469345469, + "grad_norm": 0.80078125, + "learning_rate": 6.492307692307693e-06, + "loss": 1.6084, + "step": 73400 + }, + { + "epoch": 0.4671590374617057, + "grad_norm": 0.92578125, + "learning_rate": 6.487179487179488e-06, + "loss": 1.6262, + "step": 73500 + }, + { + "epoch": 0.46779462798886445, + "grad_norm": 0.9453125, + "learning_rate": 6.482051282051283e-06, + "loss": 1.6148, + "step": 73600 + }, + { + "epoch": 0.46843021851602323, + "grad_norm": 0.9609375, + "learning_rate": 6.476923076923077e-06, + "loss": 1.6225, + "step": 73700 + }, + { + "epoch": 0.469065809043182, + "grad_norm": 1.0625, + "learning_rate": 6.471794871794872e-06, + "loss": 1.6238, + "step": 73800 + }, + { + "epoch": 0.4697013995703408, + "grad_norm": 0.921875, + "learning_rate": 6.466666666666667e-06, + "loss": 1.6082, + "step": 73900 + }, + { + "epoch": 0.4703369900974996, + "grad_norm": 0.6484375, + "learning_rate": 6.461538461538463e-06, + "loss": 1.6232, + "step": 74000 + }, + { + "epoch": 0.47097258062465835, + "grad_norm": 0.59765625, + "learning_rate": 6.4564102564102575e-06, + "loss": 1.6253, + "step": 74100 + }, + { + "epoch": 0.47160817115181713, + "grad_norm": 0.86328125, + "learning_rate": 6.4512820512820516e-06, + "loss": 1.618, + "step": 74200 + }, + { + "epoch": 0.4722437616789759, + "grad_norm": 0.58984375, + "learning_rate": 6.4461538461538465e-06, + "loss": 1.6153, + "step": 74300 + }, + { + "epoch": 0.4728793522061347, + "grad_norm": 0.74609375, + "learning_rate": 6.441025641025641e-06, + "loss": 1.6182, + "step": 74400 + }, + { + "epoch": 0.4735149427332935, + "grad_norm": 0.80078125, + "learning_rate": 6.435897435897437e-06, + "loss": 1.6308, + "step": 74500 + }, + { + "epoch": 0.4741505332604523, + "grad_norm": 0.859375, + "learning_rate": 6.430769230769231e-06, + "loss": 1.6259, + "step": 74600 + }, + { + "epoch": 0.4747861237876111, + "grad_norm": 1.1171875, + "learning_rate": 6.425641025641026e-06, + "loss": 1.6183, + "step": 74700 + }, + { + "epoch": 0.47542171431476987, + "grad_norm": 0.890625, + "learning_rate": 6.420512820512821e-06, + "loss": 1.6124, + "step": 74800 + }, + { + "epoch": 0.47605730484192865, + "grad_norm": 0.62109375, + "learning_rate": 6.415384615384616e-06, + "loss": 1.6082, + "step": 74900 + }, + { + "epoch": 0.47669289536908743, + "grad_norm": 0.95703125, + "learning_rate": 6.410256410256412e-06, + "loss": 1.6288, + "step": 75000 + }, + { + "epoch": 0.4773284858962462, + "grad_norm": 0.68359375, + "learning_rate": 6.405128205128206e-06, + "loss": 1.6089, + "step": 75100 + }, + { + "epoch": 0.477964076423405, + "grad_norm": 0.67578125, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.6039, + "step": 75200 + }, + { + "epoch": 0.4785996669505638, + "grad_norm": 0.81640625, + "learning_rate": 6.3948717948717955e-06, + "loss": 1.6228, + "step": 75300 + }, + { + "epoch": 0.47923525747772255, + "grad_norm": 0.62890625, + "learning_rate": 6.38974358974359e-06, + "loss": 1.6295, + "step": 75400 + }, + { + "epoch": 0.47987084800488133, + "grad_norm": 0.7734375, + "learning_rate": 6.384615384615384e-06, + "loss": 1.6375, + "step": 75500 + }, + { + "epoch": 0.4805064385320401, + "grad_norm": 0.8515625, + "learning_rate": 6.37948717948718e-06, + "loss": 1.621, + "step": 75600 + }, + { + "epoch": 0.4811420290591989, + "grad_norm": 0.76171875, + "learning_rate": 6.374358974358975e-06, + "loss": 1.6176, + "step": 75700 + }, + { + "epoch": 0.4817776195863577, + "grad_norm": 0.55078125, + "learning_rate": 6.36923076923077e-06, + "loss": 1.6037, + "step": 75800 + }, + { + "epoch": 0.48241321011351646, + "grad_norm": 0.66796875, + "learning_rate": 6.364102564102565e-06, + "loss": 1.6202, + "step": 75900 + }, + { + "epoch": 0.48304880064067524, + "grad_norm": 0.84375, + "learning_rate": 6.358974358974359e-06, + "loss": 1.6072, + "step": 76000 + }, + { + "epoch": 0.483684391167834, + "grad_norm": 0.89453125, + "learning_rate": 6.353846153846155e-06, + "loss": 1.6253, + "step": 76100 + }, + { + "epoch": 0.4843199816949928, + "grad_norm": 0.70703125, + "learning_rate": 6.3487179487179495e-06, + "loss": 1.6154, + "step": 76200 + }, + { + "epoch": 0.4849555722221516, + "grad_norm": 0.79296875, + "learning_rate": 6.3435897435897444e-06, + "loss": 1.6045, + "step": 76300 + }, + { + "epoch": 0.48559116274931036, + "grad_norm": 0.86328125, + "learning_rate": 6.3384615384615385e-06, + "loss": 1.63, + "step": 76400 + }, + { + "epoch": 0.4862267532764692, + "grad_norm": 1.140625, + "learning_rate": 6.333333333333333e-06, + "loss": 1.615, + "step": 76500 + }, + { + "epoch": 0.486862343803628, + "grad_norm": 1.03125, + "learning_rate": 6.328205128205128e-06, + "loss": 1.6177, + "step": 76600 + }, + { + "epoch": 0.48749793433078675, + "grad_norm": 0.9140625, + "learning_rate": 6.323076923076924e-06, + "loss": 1.6147, + "step": 76700 + }, + { + "epoch": 0.48813352485794553, + "grad_norm": 0.5390625, + "learning_rate": 6.317948717948719e-06, + "loss": 1.6187, + "step": 76800 + }, + { + "epoch": 0.4887691153851043, + "grad_norm": 0.71484375, + "learning_rate": 6.312820512820513e-06, + "loss": 1.6278, + "step": 76900 + }, + { + "epoch": 0.4894047059122631, + "grad_norm": 0.92578125, + "learning_rate": 6.307692307692308e-06, + "loss": 1.613, + "step": 77000 + }, + { + "epoch": 0.4900402964394219, + "grad_norm": 0.76171875, + "learning_rate": 6.302564102564103e-06, + "loss": 1.6121, + "step": 77100 + }, + { + "epoch": 0.49067588696658065, + "grad_norm": 0.97265625, + "learning_rate": 6.2974358974358985e-06, + "loss": 1.6129, + "step": 77200 + }, + { + "epoch": 0.49131147749373943, + "grad_norm": 0.70703125, + "learning_rate": 6.2923076923076934e-06, + "loss": 1.6027, + "step": 77300 + }, + { + "epoch": 0.4919470680208982, + "grad_norm": 0.734375, + "learning_rate": 6.2871794871794875e-06, + "loss": 1.6215, + "step": 77400 + }, + { + "epoch": 0.492582658548057, + "grad_norm": 1.09375, + "learning_rate": 6.282051282051282e-06, + "loss": 1.6139, + "step": 77500 + }, + { + "epoch": 0.4932182490752158, + "grad_norm": 0.71875, + "learning_rate": 6.276923076923077e-06, + "loss": 1.6079, + "step": 77600 + }, + { + "epoch": 0.49385383960237456, + "grad_norm": 0.73828125, + "learning_rate": 6.271794871794872e-06, + "loss": 1.6265, + "step": 77700 + }, + { + "epoch": 0.49448943012953334, + "grad_norm": 0.8046875, + "learning_rate": 6.266666666666668e-06, + "loss": 1.6138, + "step": 77800 + }, + { + "epoch": 0.4951250206566921, + "grad_norm": 0.69140625, + "learning_rate": 6.261538461538462e-06, + "loss": 1.6229, + "step": 77900 + }, + { + "epoch": 0.4957606111838509, + "grad_norm": 0.96875, + "learning_rate": 6.256410256410257e-06, + "loss": 1.6074, + "step": 78000 + }, + { + "epoch": 0.4963962017110097, + "grad_norm": 0.8359375, + "learning_rate": 6.251282051282052e-06, + "loss": 1.61, + "step": 78100 + }, + { + "epoch": 0.49703179223816846, + "grad_norm": 0.9375, + "learning_rate": 6.246153846153846e-06, + "loss": 1.6369, + "step": 78200 + }, + { + "epoch": 0.4976673827653273, + "grad_norm": 1.0390625, + "learning_rate": 6.2410256410256424e-06, + "loss": 1.6319, + "step": 78300 + }, + { + "epoch": 0.4983029732924861, + "grad_norm": 0.72265625, + "learning_rate": 6.2358974358974365e-06, + "loss": 1.6078, + "step": 78400 + }, + { + "epoch": 0.49893856381964485, + "grad_norm": 1.1171875, + "learning_rate": 6.230769230769231e-06, + "loss": 1.6118, + "step": 78500 + }, + { + "epoch": 0.49957415434680363, + "grad_norm": 0.80859375, + "learning_rate": 6.225641025641026e-06, + "loss": 1.6093, + "step": 78600 + }, + { + "epoch": 0.5002097448739624, + "grad_norm": 0.71875, + "learning_rate": 6.22051282051282e-06, + "loss": 1.6282, + "step": 78700 + }, + { + "epoch": 0.5008453354011212, + "grad_norm": 0.921875, + "learning_rate": 6.215384615384615e-06, + "loss": 1.6049, + "step": 78800 + }, + { + "epoch": 0.50148092592828, + "grad_norm": 0.80078125, + "learning_rate": 6.210256410256411e-06, + "loss": 1.6071, + "step": 78900 + }, + { + "epoch": 0.5021165164554388, + "grad_norm": 0.98046875, + "learning_rate": 6.205128205128206e-06, + "loss": 1.6211, + "step": 79000 + }, + { + "epoch": 0.5027521069825975, + "grad_norm": 0.84375, + "learning_rate": 6.200000000000001e-06, + "loss": 1.6098, + "step": 79100 + }, + { + "epoch": 0.5033876975097563, + "grad_norm": 0.765625, + "learning_rate": 6.194871794871795e-06, + "loss": 1.6244, + "step": 79200 + }, + { + "epoch": 0.5040232880369151, + "grad_norm": 0.5390625, + "learning_rate": 6.18974358974359e-06, + "loss": 1.6257, + "step": 79300 + }, + { + "epoch": 0.5046588785640739, + "grad_norm": 0.8828125, + "learning_rate": 6.1846153846153855e-06, + "loss": 1.6124, + "step": 79400 + }, + { + "epoch": 0.5052944690912327, + "grad_norm": 0.83203125, + "learning_rate": 6.17948717948718e-06, + "loss": 1.6267, + "step": 79500 + }, + { + "epoch": 0.5059300596183914, + "grad_norm": 1.2734375, + "learning_rate": 6.174358974358975e-06, + "loss": 1.6178, + "step": 79600 + }, + { + "epoch": 0.5065656501455502, + "grad_norm": 0.8125, + "learning_rate": 6.169230769230769e-06, + "loss": 1.6265, + "step": 79700 + }, + { + "epoch": 0.507201240672709, + "grad_norm": 1.15625, + "learning_rate": 6.164102564102564e-06, + "loss": 1.6248, + "step": 79800 + }, + { + "epoch": 0.5078368311998678, + "grad_norm": 0.8046875, + "learning_rate": 6.15897435897436e-06, + "loss": 1.6117, + "step": 79900 + }, + { + "epoch": 0.5084724217270266, + "grad_norm": 0.7265625, + "learning_rate": 6.153846153846155e-06, + "loss": 1.6164, + "step": 80000 + }, + { + "epoch": 0.5091080122541853, + "grad_norm": 0.7734375, + "learning_rate": 6.14871794871795e-06, + "loss": 1.6257, + "step": 80100 + }, + { + "epoch": 0.5097436027813441, + "grad_norm": 0.57421875, + "learning_rate": 6.143589743589744e-06, + "loss": 1.6203, + "step": 80200 + }, + { + "epoch": 0.5103791933085029, + "grad_norm": 1.15625, + "learning_rate": 6.138461538461539e-06, + "loss": 1.6308, + "step": 80300 + }, + { + "epoch": 0.5110147838356617, + "grad_norm": 1.0234375, + "learning_rate": 6.133333333333334e-06, + "loss": 1.6082, + "step": 80400 + }, + { + "epoch": 0.5116503743628205, + "grad_norm": 1.078125, + "learning_rate": 6.128205128205129e-06, + "loss": 1.6221, + "step": 80500 + }, + { + "epoch": 0.5122859648899792, + "grad_norm": 0.7421875, + "learning_rate": 6.123076923076923e-06, + "loss": 1.6169, + "step": 80600 + }, + { + "epoch": 0.512921555417138, + "grad_norm": 0.81640625, + "learning_rate": 6.117948717948718e-06, + "loss": 1.5958, + "step": 80700 + }, + { + "epoch": 0.5135571459442968, + "grad_norm": 0.8828125, + "learning_rate": 6.112820512820513e-06, + "loss": 1.6164, + "step": 80800 + }, + { + "epoch": 0.5141927364714556, + "grad_norm": 0.77734375, + "learning_rate": 6.107692307692308e-06, + "loss": 1.6204, + "step": 80900 + }, + { + "epoch": 0.5148283269986144, + "grad_norm": 0.72265625, + "learning_rate": 6.102564102564104e-06, + "loss": 1.6124, + "step": 81000 + }, + { + "epoch": 0.5154639175257731, + "grad_norm": 0.828125, + "learning_rate": 6.097435897435898e-06, + "loss": 1.6359, + "step": 81100 + }, + { + "epoch": 0.516099508052932, + "grad_norm": 1.0703125, + "learning_rate": 6.092307692307693e-06, + "loss": 1.625, + "step": 81200 + }, + { + "epoch": 0.5167350985800908, + "grad_norm": 0.8046875, + "learning_rate": 6.087179487179488e-06, + "loss": 1.6147, + "step": 81300 + }, + { + "epoch": 0.5173706891072496, + "grad_norm": 0.66015625, + "learning_rate": 6.082051282051283e-06, + "loss": 1.6221, + "step": 81400 + }, + { + "epoch": 0.5180062796344084, + "grad_norm": 1.0234375, + "learning_rate": 6.076923076923077e-06, + "loss": 1.6185, + "step": 81500 + }, + { + "epoch": 0.5186418701615672, + "grad_norm": 1.296875, + "learning_rate": 6.071794871794872e-06, + "loss": 1.6126, + "step": 81600 + }, + { + "epoch": 0.5192774606887259, + "grad_norm": 0.56640625, + "learning_rate": 6.066666666666667e-06, + "loss": 1.6242, + "step": 81700 + }, + { + "epoch": 0.5199130512158847, + "grad_norm": 0.609375, + "learning_rate": 6.061538461538462e-06, + "loss": 1.6178, + "step": 81800 + }, + { + "epoch": 0.5205486417430435, + "grad_norm": 0.875, + "learning_rate": 6.056410256410257e-06, + "loss": 1.6235, + "step": 81900 + }, + { + "epoch": 0.5211842322702023, + "grad_norm": 0.71484375, + "learning_rate": 6.051282051282051e-06, + "loss": 1.6243, + "step": 82000 + }, + { + "epoch": 0.521819822797361, + "grad_norm": 0.89453125, + "learning_rate": 6.046153846153847e-06, + "loss": 1.637, + "step": 82100 + }, + { + "epoch": 0.5224554133245198, + "grad_norm": 0.83203125, + "learning_rate": 6.041025641025642e-06, + "loss": 1.5933, + "step": 82200 + }, + { + "epoch": 0.5230910038516786, + "grad_norm": 0.95703125, + "learning_rate": 6.035897435897437e-06, + "loss": 1.6261, + "step": 82300 + }, + { + "epoch": 0.5237265943788374, + "grad_norm": 0.79296875, + "learning_rate": 6.030769230769231e-06, + "loss": 1.629, + "step": 82400 + }, + { + "epoch": 0.5243621849059962, + "grad_norm": 0.64453125, + "learning_rate": 6.025641025641026e-06, + "loss": 1.616, + "step": 82500 + }, + { + "epoch": 0.524997775433155, + "grad_norm": 0.66015625, + "learning_rate": 6.0205128205128206e-06, + "loss": 1.6094, + "step": 82600 + }, + { + "epoch": 0.5256333659603137, + "grad_norm": 0.609375, + "learning_rate": 6.015384615384616e-06, + "loss": 1.6166, + "step": 82700 + }, + { + "epoch": 0.5262689564874725, + "grad_norm": 0.8828125, + "learning_rate": 6.010256410256411e-06, + "loss": 1.6264, + "step": 82800 + }, + { + "epoch": 0.5269045470146313, + "grad_norm": 0.921875, + "learning_rate": 6.005128205128205e-06, + "loss": 1.6136, + "step": 82900 + }, + { + "epoch": 0.5275401375417901, + "grad_norm": 0.8125, + "learning_rate": 6e-06, + "loss": 1.6208, + "step": 83000 + }, + { + "epoch": 0.5281757280689489, + "grad_norm": 0.6953125, + "learning_rate": 5.994871794871795e-06, + "loss": 1.6156, + "step": 83100 + }, + { + "epoch": 0.5288113185961076, + "grad_norm": 1.0, + "learning_rate": 5.989743589743591e-06, + "loss": 1.6168, + "step": 83200 + }, + { + "epoch": 0.5294469091232664, + "grad_norm": 0.60546875, + "learning_rate": 5.984615384615386e-06, + "loss": 1.6293, + "step": 83300 + }, + { + "epoch": 0.5300824996504252, + "grad_norm": 0.890625, + "learning_rate": 5.97948717948718e-06, + "loss": 1.6071, + "step": 83400 + }, + { + "epoch": 0.530718090177584, + "grad_norm": 0.6875, + "learning_rate": 5.974358974358975e-06, + "loss": 1.6244, + "step": 83500 + }, + { + "epoch": 0.5313536807047428, + "grad_norm": 0.58984375, + "learning_rate": 5.9692307692307695e-06, + "loss": 1.6089, + "step": 83600 + }, + { + "epoch": 0.5319892712319015, + "grad_norm": 1.0, + "learning_rate": 5.9641025641025644e-06, + "loss": 1.6285, + "step": 83700 + }, + { + "epoch": 0.5326248617590603, + "grad_norm": 0.84375, + "learning_rate": 5.95897435897436e-06, + "loss": 1.6131, + "step": 83800 + }, + { + "epoch": 0.5332604522862191, + "grad_norm": 0.91015625, + "learning_rate": 5.953846153846154e-06, + "loss": 1.6274, + "step": 83900 + }, + { + "epoch": 0.5338960428133779, + "grad_norm": 0.75, + "learning_rate": 5.948717948717949e-06, + "loss": 1.6138, + "step": 84000 + }, + { + "epoch": 0.5345316333405367, + "grad_norm": 0.85546875, + "learning_rate": 5.943589743589744e-06, + "loss": 1.6348, + "step": 84100 + }, + { + "epoch": 0.5351672238676954, + "grad_norm": 0.94140625, + "learning_rate": 5.938461538461538e-06, + "loss": 1.6115, + "step": 84200 + }, + { + "epoch": 0.5358028143948542, + "grad_norm": 0.84765625, + "learning_rate": 5.933333333333335e-06, + "loss": 1.6196, + "step": 84300 + }, + { + "epoch": 0.536438404922013, + "grad_norm": 0.54296875, + "learning_rate": 5.928205128205129e-06, + "loss": 1.6257, + "step": 84400 + }, + { + "epoch": 0.5370739954491718, + "grad_norm": 1.046875, + "learning_rate": 5.923076923076924e-06, + "loss": 1.6163, + "step": 84500 + }, + { + "epoch": 0.5377095859763306, + "grad_norm": 0.82421875, + "learning_rate": 5.9179487179487185e-06, + "loss": 1.6273, + "step": 84600 + }, + { + "epoch": 0.5383451765034893, + "grad_norm": 0.578125, + "learning_rate": 5.912820512820513e-06, + "loss": 1.634, + "step": 84700 + }, + { + "epoch": 0.5389807670306481, + "grad_norm": 0.5234375, + "learning_rate": 5.907692307692308e-06, + "loss": 1.6107, + "step": 84800 + }, + { + "epoch": 0.539616357557807, + "grad_norm": 0.828125, + "learning_rate": 5.902564102564103e-06, + "loss": 1.6259, + "step": 84900 + }, + { + "epoch": 0.5402519480849658, + "grad_norm": 0.9296875, + "learning_rate": 5.897435897435898e-06, + "loss": 1.6044, + "step": 85000 + }, + { + "epoch": 0.5408875386121246, + "grad_norm": 0.83203125, + "learning_rate": 5.892307692307693e-06, + "loss": 1.6064, + "step": 85100 + }, + { + "epoch": 0.5415231291392834, + "grad_norm": 0.79296875, + "learning_rate": 5.887179487179487e-06, + "loss": 1.6197, + "step": 85200 + }, + { + "epoch": 0.5421587196664421, + "grad_norm": 0.84375, + "learning_rate": 5.882051282051282e-06, + "loss": 1.619, + "step": 85300 + }, + { + "epoch": 0.5427943101936009, + "grad_norm": 0.98046875, + "learning_rate": 5.876923076923078e-06, + "loss": 1.5951, + "step": 85400 + }, + { + "epoch": 0.5434299007207597, + "grad_norm": 0.8828125, + "learning_rate": 5.871794871794873e-06, + "loss": 1.6088, + "step": 85500 + }, + { + "epoch": 0.5440654912479185, + "grad_norm": 1.0078125, + "learning_rate": 5.8666666666666675e-06, + "loss": 1.6291, + "step": 85600 + }, + { + "epoch": 0.5447010817750773, + "grad_norm": 1.203125, + "learning_rate": 5.861538461538462e-06, + "loss": 1.6296, + "step": 85700 + }, + { + "epoch": 0.545336672302236, + "grad_norm": 0.455078125, + "learning_rate": 5.8564102564102565e-06, + "loss": 1.6253, + "step": 85800 + }, + { + "epoch": 0.5459722628293948, + "grad_norm": 0.68359375, + "learning_rate": 5.851282051282052e-06, + "loss": 1.6294, + "step": 85900 + }, + { + "epoch": 0.5466078533565536, + "grad_norm": 0.66015625, + "learning_rate": 5.846153846153847e-06, + "loss": 1.6221, + "step": 86000 + }, + { + "epoch": 0.5472434438837124, + "grad_norm": 0.79296875, + "learning_rate": 5.841025641025642e-06, + "loss": 1.6114, + "step": 86100 + }, + { + "epoch": 0.5478790344108712, + "grad_norm": 0.92578125, + "learning_rate": 5.835897435897436e-06, + "loss": 1.6158, + "step": 86200 + }, + { + "epoch": 0.5485146249380299, + "grad_norm": 0.671875, + "learning_rate": 5.830769230769231e-06, + "loss": 1.6054, + "step": 86300 + }, + { + "epoch": 0.5491502154651887, + "grad_norm": 0.828125, + "learning_rate": 5.825641025641026e-06, + "loss": 1.623, + "step": 86400 + }, + { + "epoch": 0.5497858059923475, + "grad_norm": 0.85546875, + "learning_rate": 5.820512820512822e-06, + "loss": 1.6108, + "step": 86500 + }, + { + "epoch": 0.5504213965195063, + "grad_norm": 0.6875, + "learning_rate": 5.815384615384616e-06, + "loss": 1.6228, + "step": 86600 + }, + { + "epoch": 0.5510569870466651, + "grad_norm": 0.7109375, + "learning_rate": 5.8102564102564106e-06, + "loss": 1.633, + "step": 86700 + }, + { + "epoch": 0.5516925775738238, + "grad_norm": 0.78515625, + "learning_rate": 5.8051282051282055e-06, + "loss": 1.6187, + "step": 86800 + }, + { + "epoch": 0.5523281681009826, + "grad_norm": 1.0859375, + "learning_rate": 5.8e-06, + "loss": 1.6125, + "step": 86900 + }, + { + "epoch": 0.5529637586281414, + "grad_norm": 0.8203125, + "learning_rate": 5.794871794871796e-06, + "loss": 1.6183, + "step": 87000 + }, + { + "epoch": 0.5535993491553002, + "grad_norm": 0.66796875, + "learning_rate": 5.78974358974359e-06, + "loss": 1.6264, + "step": 87100 + }, + { + "epoch": 0.554234939682459, + "grad_norm": 0.80078125, + "learning_rate": 5.784615384615385e-06, + "loss": 1.6289, + "step": 87200 + }, + { + "epoch": 0.5548705302096177, + "grad_norm": 0.65625, + "learning_rate": 5.77948717948718e-06, + "loss": 1.6042, + "step": 87300 + }, + { + "epoch": 0.5555061207367765, + "grad_norm": 0.8671875, + "learning_rate": 5.774358974358975e-06, + "loss": 1.6234, + "step": 87400 + }, + { + "epoch": 0.5561417112639353, + "grad_norm": 0.9921875, + "learning_rate": 5.769230769230769e-06, + "loss": 1.6233, + "step": 87500 + }, + { + "epoch": 0.5567773017910941, + "grad_norm": 0.6171875, + "learning_rate": 5.764102564102565e-06, + "loss": 1.6176, + "step": 87600 + }, + { + "epoch": 0.5574128923182529, + "grad_norm": 0.890625, + "learning_rate": 5.7589743589743596e-06, + "loss": 1.6216, + "step": 87700 + }, + { + "epoch": 0.5580484828454116, + "grad_norm": 0.921875, + "learning_rate": 5.7538461538461545e-06, + "loss": 1.6321, + "step": 87800 + }, + { + "epoch": 0.5586840733725704, + "grad_norm": 1.0078125, + "learning_rate": 5.748717948717949e-06, + "loss": 1.6245, + "step": 87900 + }, + { + "epoch": 0.5593196638997292, + "grad_norm": 0.9921875, + "learning_rate": 5.743589743589743e-06, + "loss": 1.6279, + "step": 88000 + }, + { + "epoch": 0.559955254426888, + "grad_norm": 0.85546875, + "learning_rate": 5.738461538461539e-06, + "loss": 1.6187, + "step": 88100 + }, + { + "epoch": 0.5605908449540468, + "grad_norm": 0.625, + "learning_rate": 5.733333333333334e-06, + "loss": 1.6266, + "step": 88200 + }, + { + "epoch": 0.5612264354812055, + "grad_norm": 0.625, + "learning_rate": 5.728205128205129e-06, + "loss": 1.6191, + "step": 88300 + }, + { + "epoch": 0.5618620260083643, + "grad_norm": 0.73828125, + "learning_rate": 5.723076923076923e-06, + "loss": 1.609, + "step": 88400 + }, + { + "epoch": 0.5624976165355231, + "grad_norm": 0.9296875, + "learning_rate": 5.717948717948718e-06, + "loss": 1.6182, + "step": 88500 + }, + { + "epoch": 0.5631332070626819, + "grad_norm": 0.8359375, + "learning_rate": 5.712820512820513e-06, + "loss": 1.6317, + "step": 88600 + }, + { + "epoch": 0.5637687975898408, + "grad_norm": 0.91796875, + "learning_rate": 5.7076923076923086e-06, + "loss": 1.6025, + "step": 88700 + }, + { + "epoch": 0.5644043881169996, + "grad_norm": 0.70703125, + "learning_rate": 5.7025641025641035e-06, + "loss": 1.6301, + "step": 88800 + }, + { + "epoch": 0.5650399786441583, + "grad_norm": 0.71875, + "learning_rate": 5.6974358974358975e-06, + "loss": 1.6327, + "step": 88900 + }, + { + "epoch": 0.5656755691713171, + "grad_norm": 0.6640625, + "learning_rate": 5.692307692307692e-06, + "loss": 1.6271, + "step": 89000 + }, + { + "epoch": 0.5663111596984759, + "grad_norm": 0.60546875, + "learning_rate": 5.687179487179487e-06, + "loss": 1.6137, + "step": 89100 + }, + { + "epoch": 0.5669467502256347, + "grad_norm": 1.2109375, + "learning_rate": 5.682051282051283e-06, + "loss": 1.6217, + "step": 89200 + }, + { + "epoch": 0.5675823407527935, + "grad_norm": 1.0859375, + "learning_rate": 5.676923076923078e-06, + "loss": 1.6228, + "step": 89300 + }, + { + "epoch": 0.5682179312799522, + "grad_norm": 1.203125, + "learning_rate": 5.671794871794872e-06, + "loss": 1.6288, + "step": 89400 + }, + { + "epoch": 0.568853521807111, + "grad_norm": 0.99609375, + "learning_rate": 5.666666666666667e-06, + "loss": 1.6124, + "step": 89500 + }, + { + "epoch": 0.5694891123342698, + "grad_norm": 1.0390625, + "learning_rate": 5.661538461538462e-06, + "loss": 1.6081, + "step": 89600 + }, + { + "epoch": 0.5701247028614286, + "grad_norm": 0.77734375, + "learning_rate": 5.6564102564102575e-06, + "loss": 1.625, + "step": 89700 + }, + { + "epoch": 0.5707602933885874, + "grad_norm": 0.66015625, + "learning_rate": 5.6512820512820524e-06, + "loss": 1.6106, + "step": 89800 + }, + { + "epoch": 0.5713958839157461, + "grad_norm": 1.03125, + "learning_rate": 5.6461538461538465e-06, + "loss": 1.6322, + "step": 89900 + }, + { + "epoch": 0.5720314744429049, + "grad_norm": 0.8984375, + "learning_rate": 5.641025641025641e-06, + "loss": 1.633, + "step": 90000 + }, + { + "epoch": 0.5726670649700637, + "grad_norm": 0.625, + "learning_rate": 5.635897435897436e-06, + "loss": 1.6196, + "step": 90100 + }, + { + "epoch": 0.5733026554972225, + "grad_norm": 0.62109375, + "learning_rate": 5.63076923076923e-06, + "loss": 1.645, + "step": 90200 + }, + { + "epoch": 0.5739382460243813, + "grad_norm": 0.89453125, + "learning_rate": 5.625641025641027e-06, + "loss": 1.6166, + "step": 90300 + }, + { + "epoch": 0.57457383655154, + "grad_norm": 0.890625, + "learning_rate": 5.620512820512821e-06, + "loss": 1.6207, + "step": 90400 + }, + { + "epoch": 0.5752094270786988, + "grad_norm": 1.0234375, + "learning_rate": 5.615384615384616e-06, + "loss": 1.6138, + "step": 90500 + }, + { + "epoch": 0.5758450176058576, + "grad_norm": 1.0, + "learning_rate": 5.610256410256411e-06, + "loss": 1.6108, + "step": 90600 + }, + { + "epoch": 0.5764806081330164, + "grad_norm": 0.703125, + "learning_rate": 5.605128205128205e-06, + "loss": 1.6291, + "step": 90700 + }, + { + "epoch": 0.5771161986601752, + "grad_norm": 0.71875, + "learning_rate": 5.600000000000001e-06, + "loss": 1.6296, + "step": 90800 + }, + { + "epoch": 0.5777517891873339, + "grad_norm": 0.75, + "learning_rate": 5.5948717948717955e-06, + "loss": 1.6183, + "step": 90900 + }, + { + "epoch": 0.5783873797144927, + "grad_norm": 1.046875, + "learning_rate": 5.58974358974359e-06, + "loss": 1.6069, + "step": 91000 + }, + { + "epoch": 0.5790229702416515, + "grad_norm": 0.87109375, + "learning_rate": 5.584615384615385e-06, + "loss": 1.608, + "step": 91100 + }, + { + "epoch": 0.5796585607688103, + "grad_norm": 0.7109375, + "learning_rate": 5.579487179487179e-06, + "loss": 1.6259, + "step": 91200 + }, + { + "epoch": 0.5802941512959691, + "grad_norm": 1.0390625, + "learning_rate": 5.574358974358974e-06, + "loss": 1.6293, + "step": 91300 + }, + { + "epoch": 0.5809297418231278, + "grad_norm": 0.65625, + "learning_rate": 5.56923076923077e-06, + "loss": 1.6288, + "step": 91400 + }, + { + "epoch": 0.5815653323502866, + "grad_norm": 0.6953125, + "learning_rate": 5.564102564102565e-06, + "loss": 1.619, + "step": 91500 + }, + { + "epoch": 0.5822009228774454, + "grad_norm": 0.96875, + "learning_rate": 5.55897435897436e-06, + "loss": 1.6338, + "step": 91600 + }, + { + "epoch": 0.5828365134046042, + "grad_norm": 0.6328125, + "learning_rate": 5.553846153846154e-06, + "loss": 1.6173, + "step": 91700 + }, + { + "epoch": 0.583472103931763, + "grad_norm": 1.0390625, + "learning_rate": 5.548717948717949e-06, + "loss": 1.6255, + "step": 91800 + }, + { + "epoch": 0.5841076944589217, + "grad_norm": 0.490234375, + "learning_rate": 5.5435897435897445e-06, + "loss": 1.5943, + "step": 91900 + }, + { + "epoch": 0.5847432849860805, + "grad_norm": 1.1484375, + "learning_rate": 5.538461538461539e-06, + "loss": 1.6178, + "step": 92000 + }, + { + "epoch": 0.5853788755132393, + "grad_norm": 1.1640625, + "learning_rate": 5.533333333333334e-06, + "loss": 1.6184, + "step": 92100 + }, + { + "epoch": 0.5860144660403981, + "grad_norm": 0.765625, + "learning_rate": 5.528205128205128e-06, + "loss": 1.6053, + "step": 92200 + }, + { + "epoch": 0.5866500565675569, + "grad_norm": 1.0, + "learning_rate": 5.523076923076923e-06, + "loss": 1.6308, + "step": 92300 + }, + { + "epoch": 0.5872856470947156, + "grad_norm": 1.0625, + "learning_rate": 5.517948717948718e-06, + "loss": 1.626, + "step": 92400 + }, + { + "epoch": 0.5879212376218745, + "grad_norm": 1.25, + "learning_rate": 5.512820512820514e-06, + "loss": 1.623, + "step": 92500 + }, + { + "epoch": 0.5885568281490333, + "grad_norm": 1.140625, + "learning_rate": 5.507692307692308e-06, + "loss": 1.631, + "step": 92600 + }, + { + "epoch": 0.5891924186761921, + "grad_norm": 0.921875, + "learning_rate": 5.502564102564103e-06, + "loss": 1.6301, + "step": 92700 + }, + { + "epoch": 0.5898280092033509, + "grad_norm": 0.76171875, + "learning_rate": 5.497435897435898e-06, + "loss": 1.6266, + "step": 92800 + }, + { + "epoch": 0.5904635997305097, + "grad_norm": 0.78515625, + "learning_rate": 5.492307692307693e-06, + "loss": 1.6179, + "step": 92900 + }, + { + "epoch": 0.5910991902576684, + "grad_norm": 0.8125, + "learning_rate": 5.487179487179488e-06, + "loss": 1.625, + "step": 93000 + }, + { + "epoch": 0.5917347807848272, + "grad_norm": 0.69921875, + "learning_rate": 5.4820512820512824e-06, + "loss": 1.6181, + "step": 93100 + }, + { + "epoch": 0.592370371311986, + "grad_norm": 0.75, + "learning_rate": 5.476923076923077e-06, + "loss": 1.6304, + "step": 93200 + }, + { + "epoch": 0.5930059618391448, + "grad_norm": 0.79296875, + "learning_rate": 5.471794871794872e-06, + "loss": 1.6004, + "step": 93300 + }, + { + "epoch": 0.5936415523663036, + "grad_norm": 0.70703125, + "learning_rate": 5.466666666666667e-06, + "loss": 1.6335, + "step": 93400 + }, + { + "epoch": 0.5942771428934623, + "grad_norm": 1.0859375, + "learning_rate": 5.461538461538461e-06, + "loss": 1.616, + "step": 93500 + }, + { + "epoch": 0.5949127334206211, + "grad_norm": 0.7109375, + "learning_rate": 5.456410256410257e-06, + "loss": 1.6155, + "step": 93600 + }, + { + "epoch": 0.5955483239477799, + "grad_norm": 0.7265625, + "learning_rate": 5.451282051282052e-06, + "loss": 1.6241, + "step": 93700 + }, + { + "epoch": 0.5961839144749387, + "grad_norm": 0.51171875, + "learning_rate": 5.446153846153847e-06, + "loss": 1.6173, + "step": 93800 + }, + { + "epoch": 0.5968195050020975, + "grad_norm": 0.7578125, + "learning_rate": 5.441025641025642e-06, + "loss": 1.6389, + "step": 93900 + }, + { + "epoch": 0.5974550955292562, + "grad_norm": 0.73828125, + "learning_rate": 5.435897435897436e-06, + "loss": 1.6153, + "step": 94000 + }, + { + "epoch": 0.598090686056415, + "grad_norm": 0.83984375, + "learning_rate": 5.430769230769231e-06, + "loss": 1.6275, + "step": 94100 + }, + { + "epoch": 0.5987262765835738, + "grad_norm": 1.0859375, + "learning_rate": 5.425641025641026e-06, + "loss": 1.6286, + "step": 94200 + }, + { + "epoch": 0.5993618671107326, + "grad_norm": 0.84765625, + "learning_rate": 5.420512820512821e-06, + "loss": 1.617, + "step": 94300 + }, + { + "epoch": 0.5999974576378914, + "grad_norm": 0.8359375, + "learning_rate": 5.415384615384615e-06, + "loss": 1.6307, + "step": 94400 + }, + { + "epoch": 0.6006330481650501, + "grad_norm": 0.7734375, + "learning_rate": 5.41025641025641e-06, + "loss": 1.6397, + "step": 94500 + }, + { + "epoch": 0.6012686386922089, + "grad_norm": 1.2421875, + "learning_rate": 5.405128205128205e-06, + "loss": 1.6146, + "step": 94600 + }, + { + "epoch": 0.6019042292193677, + "grad_norm": 0.9609375, + "learning_rate": 5.400000000000001e-06, + "loss": 1.628, + "step": 94700 + }, + { + "epoch": 0.6025398197465265, + "grad_norm": 0.859375, + "learning_rate": 5.394871794871796e-06, + "loss": 1.6315, + "step": 94800 + }, + { + "epoch": 0.6031754102736853, + "grad_norm": 0.640625, + "learning_rate": 5.38974358974359e-06, + "loss": 1.6189, + "step": 94900 + }, + { + "epoch": 0.603811000800844, + "grad_norm": 0.91015625, + "learning_rate": 5.384615384615385e-06, + "loss": 1.6219, + "step": 95000 + }, + { + "epoch": 0.6044465913280028, + "grad_norm": 0.92578125, + "learning_rate": 5.3794871794871796e-06, + "loss": 1.613, + "step": 95100 + }, + { + "epoch": 0.6050821818551616, + "grad_norm": 1.328125, + "learning_rate": 5.374358974358975e-06, + "loss": 1.6054, + "step": 95200 + }, + { + "epoch": 0.6057177723823204, + "grad_norm": 1.203125, + "learning_rate": 5.36923076923077e-06, + "loss": 1.6211, + "step": 95300 + }, + { + "epoch": 0.6063533629094792, + "grad_norm": 1.0078125, + "learning_rate": 5.364102564102564e-06, + "loss": 1.6168, + "step": 95400 + }, + { + "epoch": 0.606988953436638, + "grad_norm": 0.85546875, + "learning_rate": 5.358974358974359e-06, + "loss": 1.6289, + "step": 95500 + }, + { + "epoch": 0.6076245439637967, + "grad_norm": 0.83984375, + "learning_rate": 5.353846153846154e-06, + "loss": 1.612, + "step": 95600 + }, + { + "epoch": 0.6082601344909555, + "grad_norm": 0.8046875, + "learning_rate": 5.34871794871795e-06, + "loss": 1.6154, + "step": 95700 + }, + { + "epoch": 0.6088957250181143, + "grad_norm": 1.0234375, + "learning_rate": 5.343589743589745e-06, + "loss": 1.6178, + "step": 95800 + }, + { + "epoch": 0.6095313155452731, + "grad_norm": 0.58984375, + "learning_rate": 5.338461538461539e-06, + "loss": 1.6332, + "step": 95900 + }, + { + "epoch": 0.6101669060724318, + "grad_norm": 0.67578125, + "learning_rate": 5.333333333333334e-06, + "loss": 1.6216, + "step": 96000 + }, + { + "epoch": 0.6108024965995906, + "grad_norm": 0.890625, + "learning_rate": 5.3282051282051286e-06, + "loss": 1.6099, + "step": 96100 + }, + { + "epoch": 0.6114380871267494, + "grad_norm": 0.8984375, + "learning_rate": 5.323076923076923e-06, + "loss": 1.636, + "step": 96200 + }, + { + "epoch": 0.6120736776539083, + "grad_norm": 0.94921875, + "learning_rate": 5.317948717948719e-06, + "loss": 1.6261, + "step": 96300 + }, + { + "epoch": 0.6127092681810671, + "grad_norm": 0.71484375, + "learning_rate": 5.312820512820513e-06, + "loss": 1.6299, + "step": 96400 + }, + { + "epoch": 0.6133448587082259, + "grad_norm": 0.5859375, + "learning_rate": 5.307692307692308e-06, + "loss": 1.6277, + "step": 96500 + }, + { + "epoch": 0.6139804492353846, + "grad_norm": 0.73046875, + "learning_rate": 5.302564102564103e-06, + "loss": 1.6248, + "step": 96600 + }, + { + "epoch": 0.6146160397625434, + "grad_norm": 0.9453125, + "learning_rate": 5.297435897435897e-06, + "loss": 1.6107, + "step": 96700 + }, + { + "epoch": 0.6152516302897022, + "grad_norm": 0.875, + "learning_rate": 5.292307692307693e-06, + "loss": 1.6303, + "step": 96800 + }, + { + "epoch": 0.615887220816861, + "grad_norm": 1.015625, + "learning_rate": 5.287179487179488e-06, + "loss": 1.6134, + "step": 96900 + }, + { + "epoch": 0.6165228113440198, + "grad_norm": 0.73046875, + "learning_rate": 5.282051282051283e-06, + "loss": 1.6326, + "step": 97000 + }, + { + "epoch": 0.6171584018711785, + "grad_norm": 0.63671875, + "learning_rate": 5.2769230769230775e-06, + "loss": 1.6081, + "step": 97100 + }, + { + "epoch": 0.6177939923983373, + "grad_norm": 1.265625, + "learning_rate": 5.271794871794872e-06, + "loss": 1.6208, + "step": 97200 + }, + { + "epoch": 0.6184295829254961, + "grad_norm": 1.0859375, + "learning_rate": 5.2666666666666665e-06, + "loss": 1.6238, + "step": 97300 + }, + { + "epoch": 0.6190651734526549, + "grad_norm": 1.0390625, + "learning_rate": 5.261538461538462e-06, + "loss": 1.6219, + "step": 97400 + }, + { + "epoch": 0.6197007639798137, + "grad_norm": 0.875, + "learning_rate": 5.256410256410257e-06, + "loss": 1.6219, + "step": 97500 + }, + { + "epoch": 0.6203363545069724, + "grad_norm": 0.64453125, + "learning_rate": 5.251282051282052e-06, + "loss": 1.6101, + "step": 97600 + }, + { + "epoch": 0.6209719450341312, + "grad_norm": 0.97265625, + "learning_rate": 5.246153846153846e-06, + "loss": 1.6172, + "step": 97700 + }, + { + "epoch": 0.62160753556129, + "grad_norm": 0.96484375, + "learning_rate": 5.241025641025641e-06, + "loss": 1.6167, + "step": 97800 + }, + { + "epoch": 0.6222431260884488, + "grad_norm": 0.59375, + "learning_rate": 5.235897435897437e-06, + "loss": 1.616, + "step": 97900 + }, + { + "epoch": 0.6228787166156076, + "grad_norm": 0.68359375, + "learning_rate": 5.230769230769232e-06, + "loss": 1.6145, + "step": 98000 + }, + { + "epoch": 0.6235143071427663, + "grad_norm": 0.69140625, + "learning_rate": 5.2256410256410265e-06, + "loss": 1.6299, + "step": 98100 + }, + { + "epoch": 0.6241498976699251, + "grad_norm": 0.80078125, + "learning_rate": 5.220512820512821e-06, + "loss": 1.6195, + "step": 98200 + }, + { + "epoch": 0.6247854881970839, + "grad_norm": 0.859375, + "learning_rate": 5.2153846153846155e-06, + "loss": 1.6244, + "step": 98300 + }, + { + "epoch": 0.6254210787242427, + "grad_norm": 1.1875, + "learning_rate": 5.21025641025641e-06, + "loss": 1.6223, + "step": 98400 + }, + { + "epoch": 0.6260566692514015, + "grad_norm": 0.6796875, + "learning_rate": 5.205128205128206e-06, + "loss": 1.6134, + "step": 98500 + }, + { + "epoch": 0.6266922597785602, + "grad_norm": 0.87109375, + "learning_rate": 5.2e-06, + "loss": 1.6261, + "step": 98600 + }, + { + "epoch": 0.627327850305719, + "grad_norm": 0.75, + "learning_rate": 5.194871794871795e-06, + "loss": 1.6227, + "step": 98700 + }, + { + "epoch": 0.6279634408328778, + "grad_norm": 0.68359375, + "learning_rate": 5.18974358974359e-06, + "loss": 1.6215, + "step": 98800 + }, + { + "epoch": 0.6285990313600366, + "grad_norm": 0.703125, + "learning_rate": 5.184615384615385e-06, + "loss": 1.6231, + "step": 98900 + }, + { + "epoch": 0.6292346218871954, + "grad_norm": 0.82421875, + "learning_rate": 5.179487179487181e-06, + "loss": 1.6289, + "step": 99000 + }, + { + "epoch": 0.6298702124143541, + "grad_norm": 0.85546875, + "learning_rate": 5.174358974358975e-06, + "loss": 1.6127, + "step": 99100 + }, + { + "epoch": 0.6305058029415129, + "grad_norm": 1.28125, + "learning_rate": 5.16923076923077e-06, + "loss": 1.6208, + "step": 99200 + }, + { + "epoch": 0.6311413934686717, + "grad_norm": 0.78515625, + "learning_rate": 5.1641025641025645e-06, + "loss": 1.6161, + "step": 99300 + }, + { + "epoch": 0.6317769839958305, + "grad_norm": 1.1328125, + "learning_rate": 5.158974358974359e-06, + "loss": 1.6175, + "step": 99400 + }, + { + "epoch": 0.6324125745229893, + "grad_norm": 0.65625, + "learning_rate": 5.1538461538461534e-06, + "loss": 1.6174, + "step": 99500 + }, + { + "epoch": 0.633048165050148, + "grad_norm": 0.9375, + "learning_rate": 5.148717948717949e-06, + "loss": 1.6236, + "step": 99600 + }, + { + "epoch": 0.6336837555773068, + "grad_norm": 0.8515625, + "learning_rate": 5.143589743589744e-06, + "loss": 1.6216, + "step": 99700 + }, + { + "epoch": 0.6343193461044656, + "grad_norm": 0.75390625, + "learning_rate": 5.138461538461539e-06, + "loss": 1.6239, + "step": 99800 + }, + { + "epoch": 0.6349549366316244, + "grad_norm": 0.70703125, + "learning_rate": 5.133333333333334e-06, + "loss": 1.6215, + "step": 99900 + }, + { + "epoch": 0.6355905271587833, + "grad_norm": 0.7421875, + "learning_rate": 5.128205128205128e-06, + "loss": 1.5993, + "step": 100000 + }, + { + "epoch": 0.6362261176859421, + "grad_norm": 0.6328125, + "learning_rate": 5.123076923076924e-06, + "loss": 1.608, + "step": 100100 + }, + { + "epoch": 0.6368617082131008, + "grad_norm": 0.71484375, + "learning_rate": 5.1179487179487186e-06, + "loss": 1.6131, + "step": 100200 + }, + { + "epoch": 0.6374972987402596, + "grad_norm": 0.99609375, + "learning_rate": 5.1128205128205135e-06, + "loss": 1.626, + "step": 100300 + }, + { + "epoch": 0.6381328892674184, + "grad_norm": 1.03125, + "learning_rate": 5.1076923076923075e-06, + "loss": 1.6142, + "step": 100400 + }, + { + "epoch": 0.6387684797945772, + "grad_norm": 0.69921875, + "learning_rate": 5.1025641025641024e-06, + "loss": 1.6346, + "step": 100500 + }, + { + "epoch": 0.639404070321736, + "grad_norm": 0.828125, + "learning_rate": 5.097435897435898e-06, + "loss": 1.604, + "step": 100600 + }, + { + "epoch": 0.6400396608488947, + "grad_norm": 0.7265625, + "learning_rate": 5.092307692307693e-06, + "loss": 1.629, + "step": 100700 + }, + { + "epoch": 0.6406752513760535, + "grad_norm": 0.66796875, + "learning_rate": 5.087179487179488e-06, + "loss": 1.6069, + "step": 100800 + }, + { + "epoch": 0.6413108419032123, + "grad_norm": 0.9296875, + "learning_rate": 5.082051282051282e-06, + "loss": 1.6207, + "step": 100900 + }, + { + "epoch": 0.6419464324303711, + "grad_norm": 0.6640625, + "learning_rate": 5.076923076923077e-06, + "loss": 1.6052, + "step": 101000 + }, + { + "epoch": 0.6425820229575299, + "grad_norm": 0.95703125, + "learning_rate": 5.071794871794872e-06, + "loss": 1.6145, + "step": 101100 + }, + { + "epoch": 0.6432176134846886, + "grad_norm": 0.7265625, + "learning_rate": 5.0666666666666676e-06, + "loss": 1.6133, + "step": 101200 + }, + { + "epoch": 0.6438532040118474, + "grad_norm": 0.6875, + "learning_rate": 5.0615384615384625e-06, + "loss": 1.6339, + "step": 101300 + }, + { + "epoch": 0.6444887945390062, + "grad_norm": 0.8046875, + "learning_rate": 5.0564102564102565e-06, + "loss": 1.5978, + "step": 101400 + }, + { + "epoch": 0.645124385066165, + "grad_norm": 0.93359375, + "learning_rate": 5.051282051282051e-06, + "loss": 1.6232, + "step": 101500 + }, + { + "epoch": 0.6457599755933238, + "grad_norm": 0.7890625, + "learning_rate": 5.046153846153846e-06, + "loss": 1.6328, + "step": 101600 + }, + { + "epoch": 0.6463955661204825, + "grad_norm": 1.109375, + "learning_rate": 5.041025641025642e-06, + "loss": 1.6116, + "step": 101700 + }, + { + "epoch": 0.6470311566476413, + "grad_norm": 0.84765625, + "learning_rate": 5.035897435897437e-06, + "loss": 1.6376, + "step": 101800 + }, + { + "epoch": 0.6476667471748001, + "grad_norm": 0.85546875, + "learning_rate": 5.030769230769231e-06, + "loss": 1.614, + "step": 101900 + }, + { + "epoch": 0.6483023377019589, + "grad_norm": 0.7890625, + "learning_rate": 5.025641025641026e-06, + "loss": 1.616, + "step": 102000 + }, + { + "epoch": 0.6489379282291177, + "grad_norm": 1.0859375, + "learning_rate": 5.020512820512821e-06, + "loss": 1.6242, + "step": 102100 + }, + { + "epoch": 0.6495735187562764, + "grad_norm": 0.890625, + "learning_rate": 5.015384615384616e-06, + "loss": 1.6139, + "step": 102200 + }, + { + "epoch": 0.6502091092834352, + "grad_norm": 0.5859375, + "learning_rate": 5.0102564102564115e-06, + "loss": 1.6071, + "step": 102300 + }, + { + "epoch": 0.650844699810594, + "grad_norm": 0.8671875, + "learning_rate": 5.0051282051282055e-06, + "loss": 1.6095, + "step": 102400 + }, + { + "epoch": 0.6514802903377528, + "grad_norm": 0.76953125, + "learning_rate": 5e-06, + "loss": 1.6153, + "step": 102500 + }, + { + "epoch": 0.6521158808649116, + "grad_norm": 0.8671875, + "learning_rate": 4.994871794871795e-06, + "loss": 1.6096, + "step": 102600 + }, + { + "epoch": 0.6527514713920703, + "grad_norm": 0.6171875, + "learning_rate": 4.98974358974359e-06, + "loss": 1.6182, + "step": 102700 + }, + { + "epoch": 0.6533870619192291, + "grad_norm": 1.109375, + "learning_rate": 4.984615384615385e-06, + "loss": 1.6078, + "step": 102800 + }, + { + "epoch": 0.6540226524463879, + "grad_norm": 0.58984375, + "learning_rate": 4.97948717948718e-06, + "loss": 1.6203, + "step": 102900 + }, + { + "epoch": 0.6546582429735467, + "grad_norm": 1.203125, + "learning_rate": 4.974358974358975e-06, + "loss": 1.625, + "step": 103000 + }, + { + "epoch": 0.6552938335007055, + "grad_norm": 0.76953125, + "learning_rate": 4.96923076923077e-06, + "loss": 1.6202, + "step": 103100 + }, + { + "epoch": 0.6559294240278642, + "grad_norm": 0.78515625, + "learning_rate": 4.964102564102565e-06, + "loss": 1.6192, + "step": 103200 + }, + { + "epoch": 0.656565014555023, + "grad_norm": 1.03125, + "learning_rate": 4.95897435897436e-06, + "loss": 1.6077, + "step": 103300 + }, + { + "epoch": 0.6572006050821818, + "grad_norm": 0.71875, + "learning_rate": 4.9538461538461545e-06, + "loss": 1.6113, + "step": 103400 + }, + { + "epoch": 0.6578361956093406, + "grad_norm": 0.68359375, + "learning_rate": 4.948717948717949e-06, + "loss": 1.6292, + "step": 103500 + }, + { + "epoch": 0.6584717861364994, + "grad_norm": 1.0546875, + "learning_rate": 4.943589743589744e-06, + "loss": 1.616, + "step": 103600 + }, + { + "epoch": 0.6591073766636582, + "grad_norm": 0.70703125, + "learning_rate": 4.938461538461538e-06, + "loss": 1.6265, + "step": 103700 + }, + { + "epoch": 0.659742967190817, + "grad_norm": 0.5859375, + "learning_rate": 4.933333333333334e-06, + "loss": 1.6158, + "step": 103800 + }, + { + "epoch": 0.6603785577179758, + "grad_norm": 0.9609375, + "learning_rate": 4.928205128205128e-06, + "loss": 1.6274, + "step": 103900 + }, + { + "epoch": 0.6610141482451346, + "grad_norm": 0.640625, + "learning_rate": 4.923076923076924e-06, + "loss": 1.6122, + "step": 104000 + }, + { + "epoch": 0.6616497387722934, + "grad_norm": 0.90234375, + "learning_rate": 4.917948717948719e-06, + "loss": 1.6128, + "step": 104100 + }, + { + "epoch": 0.6622853292994522, + "grad_norm": 0.6015625, + "learning_rate": 4.912820512820513e-06, + "loss": 1.6175, + "step": 104200 + }, + { + "epoch": 0.6629209198266109, + "grad_norm": 1.390625, + "learning_rate": 4.907692307692309e-06, + "loss": 1.6262, + "step": 104300 + }, + { + "epoch": 0.6635565103537697, + "grad_norm": 0.953125, + "learning_rate": 4.902564102564103e-06, + "loss": 1.6097, + "step": 104400 + }, + { + "epoch": 0.6641921008809285, + "grad_norm": 1.015625, + "learning_rate": 4.8974358974358975e-06, + "loss": 1.6119, + "step": 104500 + }, + { + "epoch": 0.6648276914080873, + "grad_norm": 0.84765625, + "learning_rate": 4.892307692307693e-06, + "loss": 1.6309, + "step": 104600 + }, + { + "epoch": 0.6654632819352461, + "grad_norm": 0.77734375, + "learning_rate": 4.887179487179487e-06, + "loss": 1.6189, + "step": 104700 + }, + { + "epoch": 0.6660988724624048, + "grad_norm": 0.7578125, + "learning_rate": 4.882051282051282e-06, + "loss": 1.6162, + "step": 104800 + }, + { + "epoch": 0.6667344629895636, + "grad_norm": 0.98828125, + "learning_rate": 4.876923076923077e-06, + "loss": 1.634, + "step": 104900 + }, + { + "epoch": 0.6673700535167224, + "grad_norm": 0.8046875, + "learning_rate": 4.871794871794872e-06, + "loss": 1.6159, + "step": 105000 + }, + { + "epoch": 0.6680056440438812, + "grad_norm": 0.88671875, + "learning_rate": 4.866666666666667e-06, + "loss": 1.6145, + "step": 105100 + }, + { + "epoch": 0.66864123457104, + "grad_norm": 0.85546875, + "learning_rate": 4.861538461538462e-06, + "loss": 1.6403, + "step": 105200 + }, + { + "epoch": 0.6692768250981987, + "grad_norm": 0.76953125, + "learning_rate": 4.856410256410257e-06, + "loss": 1.6177, + "step": 105300 + }, + { + "epoch": 0.6699124156253575, + "grad_norm": 0.87109375, + "learning_rate": 4.851282051282052e-06, + "loss": 1.6122, + "step": 105400 + }, + { + "epoch": 0.6705480061525163, + "grad_norm": 1.03125, + "learning_rate": 4.8461538461538465e-06, + "loss": 1.6122, + "step": 105500 + }, + { + "epoch": 0.6711835966796751, + "grad_norm": 0.8359375, + "learning_rate": 4.8410256410256414e-06, + "loss": 1.6299, + "step": 105600 + }, + { + "epoch": 0.6718191872068339, + "grad_norm": 0.984375, + "learning_rate": 4.835897435897436e-06, + "loss": 1.6227, + "step": 105700 + }, + { + "epoch": 0.6724547777339926, + "grad_norm": 0.70703125, + "learning_rate": 4.830769230769231e-06, + "loss": 1.6101, + "step": 105800 + }, + { + "epoch": 0.6730903682611514, + "grad_norm": 1.0390625, + "learning_rate": 4.825641025641026e-06, + "loss": 1.628, + "step": 105900 + }, + { + "epoch": 0.6737259587883102, + "grad_norm": 1.0546875, + "learning_rate": 4.820512820512821e-06, + "loss": 1.6062, + "step": 106000 + }, + { + "epoch": 0.674361549315469, + "grad_norm": 0.6640625, + "learning_rate": 4.815384615384616e-06, + "loss": 1.6278, + "step": 106100 + }, + { + "epoch": 0.6749971398426278, + "grad_norm": 1.09375, + "learning_rate": 4.810256410256411e-06, + "loss": 1.6188, + "step": 106200 + }, + { + "epoch": 0.6756327303697865, + "grad_norm": 0.84375, + "learning_rate": 4.805128205128206e-06, + "loss": 1.6195, + "step": 106300 + }, + { + "epoch": 0.6762683208969453, + "grad_norm": 0.94140625, + "learning_rate": 4.800000000000001e-06, + "loss": 1.6191, + "step": 106400 + }, + { + "epoch": 0.6769039114241041, + "grad_norm": 0.77734375, + "learning_rate": 4.7948717948717955e-06, + "loss": 1.6235, + "step": 106500 + }, + { + "epoch": 0.6775395019512629, + "grad_norm": 1.15625, + "learning_rate": 4.7897435897435904e-06, + "loss": 1.6166, + "step": 106600 + }, + { + "epoch": 0.6781750924784217, + "grad_norm": 0.734375, + "learning_rate": 4.7846153846153845e-06, + "loss": 1.5958, + "step": 106700 + }, + { + "epoch": 0.6788106830055805, + "grad_norm": 0.72265625, + "learning_rate": 4.77948717948718e-06, + "loss": 1.622, + "step": 106800 + }, + { + "epoch": 0.6794462735327392, + "grad_norm": 0.94921875, + "learning_rate": 4.774358974358974e-06, + "loss": 1.6315, + "step": 106900 + }, + { + "epoch": 0.680081864059898, + "grad_norm": 0.83203125, + "learning_rate": 4.76923076923077e-06, + "loss": 1.6155, + "step": 107000 + }, + { + "epoch": 0.6807174545870568, + "grad_norm": 1.171875, + "learning_rate": 4.764102564102565e-06, + "loss": 1.6098, + "step": 107100 + }, + { + "epoch": 0.6813530451142156, + "grad_norm": 0.98828125, + "learning_rate": 4.758974358974359e-06, + "loss": 1.6073, + "step": 107200 + }, + { + "epoch": 0.6819886356413744, + "grad_norm": 0.75390625, + "learning_rate": 4.753846153846155e-06, + "loss": 1.6177, + "step": 107300 + }, + { + "epoch": 0.6826242261685331, + "grad_norm": 1.0625, + "learning_rate": 4.748717948717949e-06, + "loss": 1.6186, + "step": 107400 + }, + { + "epoch": 0.6832598166956919, + "grad_norm": 0.97265625, + "learning_rate": 4.743589743589744e-06, + "loss": 1.6383, + "step": 107500 + }, + { + "epoch": 0.6838954072228508, + "grad_norm": 0.5546875, + "learning_rate": 4.738461538461539e-06, + "loss": 1.6377, + "step": 107600 + }, + { + "epoch": 0.6845309977500096, + "grad_norm": 0.63671875, + "learning_rate": 4.7333333333333335e-06, + "loss": 1.616, + "step": 107700 + }, + { + "epoch": 0.6851665882771684, + "grad_norm": 1.0390625, + "learning_rate": 4.728205128205128e-06, + "loss": 1.6348, + "step": 107800 + }, + { + "epoch": 0.6858021788043271, + "grad_norm": 1.015625, + "learning_rate": 4.723076923076923e-06, + "loss": 1.6176, + "step": 107900 + }, + { + "epoch": 0.6864377693314859, + "grad_norm": 0.95703125, + "learning_rate": 4.717948717948718e-06, + "loss": 1.6137, + "step": 108000 + }, + { + "epoch": 0.6870733598586447, + "grad_norm": 0.828125, + "learning_rate": 4.712820512820513e-06, + "loss": 1.6239, + "step": 108100 + }, + { + "epoch": 0.6877089503858035, + "grad_norm": 0.71484375, + "learning_rate": 4.707692307692308e-06, + "loss": 1.6267, + "step": 108200 + }, + { + "epoch": 0.6883445409129623, + "grad_norm": 0.80859375, + "learning_rate": 4.702564102564103e-06, + "loss": 1.6296, + "step": 108300 + }, + { + "epoch": 0.688980131440121, + "grad_norm": 1.0390625, + "learning_rate": 4.697435897435898e-06, + "loss": 1.6124, + "step": 108400 + }, + { + "epoch": 0.6896157219672798, + "grad_norm": 0.9296875, + "learning_rate": 4.692307692307693e-06, + "loss": 1.6137, + "step": 108500 + }, + { + "epoch": 0.6902513124944386, + "grad_norm": 0.7421875, + "learning_rate": 4.6871794871794876e-06, + "loss": 1.6152, + "step": 108600 + }, + { + "epoch": 0.6908869030215974, + "grad_norm": 0.7578125, + "learning_rate": 4.6820512820512825e-06, + "loss": 1.6073, + "step": 108700 + }, + { + "epoch": 0.6915224935487562, + "grad_norm": 0.78515625, + "learning_rate": 4.676923076923077e-06, + "loss": 1.6117, + "step": 108800 + }, + { + "epoch": 0.692158084075915, + "grad_norm": 1.0625, + "learning_rate": 4.671794871794872e-06, + "loss": 1.6196, + "step": 108900 + }, + { + "epoch": 0.6927936746030737, + "grad_norm": 0.953125, + "learning_rate": 4.666666666666667e-06, + "loss": 1.6081, + "step": 109000 + }, + { + "epoch": 0.6934292651302325, + "grad_norm": 1.921875, + "learning_rate": 4.661538461538462e-06, + "loss": 1.6223, + "step": 109100 + }, + { + "epoch": 0.6940648556573913, + "grad_norm": 0.765625, + "learning_rate": 4.656410256410257e-06, + "loss": 1.6228, + "step": 109200 + }, + { + "epoch": 0.6947004461845501, + "grad_norm": 0.75, + "learning_rate": 4.651282051282052e-06, + "loss": 1.6186, + "step": 109300 + }, + { + "epoch": 0.6953360367117088, + "grad_norm": 0.90234375, + "learning_rate": 4.646153846153847e-06, + "loss": 1.6278, + "step": 109400 + }, + { + "epoch": 0.6959716272388676, + "grad_norm": 0.953125, + "learning_rate": 4.641025641025642e-06, + "loss": 1.6158, + "step": 109500 + }, + { + "epoch": 0.6966072177660264, + "grad_norm": 0.94921875, + "learning_rate": 4.6358974358974366e-06, + "loss": 1.6168, + "step": 109600 + }, + { + "epoch": 0.6972428082931852, + "grad_norm": 0.84765625, + "learning_rate": 4.630769230769231e-06, + "loss": 1.629, + "step": 109700 + }, + { + "epoch": 0.697878398820344, + "grad_norm": 0.76953125, + "learning_rate": 4.625641025641026e-06, + "loss": 1.6176, + "step": 109800 + }, + { + "epoch": 0.6985139893475028, + "grad_norm": 0.625, + "learning_rate": 4.62051282051282e-06, + "loss": 1.6137, + "step": 109900 + }, + { + "epoch": 0.6991495798746615, + "grad_norm": 0.9765625, + "learning_rate": 4.615384615384616e-06, + "loss": 1.609, + "step": 110000 + }, + { + "epoch": 0.6997851704018203, + "grad_norm": 0.8828125, + "learning_rate": 4.610256410256411e-06, + "loss": 1.6135, + "step": 110100 + }, + { + "epoch": 0.7004207609289791, + "grad_norm": 0.984375, + "learning_rate": 4.605128205128205e-06, + "loss": 1.6305, + "step": 110200 + }, + { + "epoch": 0.7010563514561379, + "grad_norm": 0.90234375, + "learning_rate": 4.600000000000001e-06, + "loss": 1.6075, + "step": 110300 + }, + { + "epoch": 0.7016919419832967, + "grad_norm": 0.94140625, + "learning_rate": 4.594871794871795e-06, + "loss": 1.6183, + "step": 110400 + }, + { + "epoch": 0.7023275325104554, + "grad_norm": 0.80078125, + "learning_rate": 4.58974358974359e-06, + "loss": 1.6249, + "step": 110500 + }, + { + "epoch": 0.7029631230376142, + "grad_norm": 1.03125, + "learning_rate": 4.5846153846153855e-06, + "loss": 1.6048, + "step": 110600 + }, + { + "epoch": 0.703598713564773, + "grad_norm": 0.921875, + "learning_rate": 4.57948717948718e-06, + "loss": 1.641, + "step": 110700 + }, + { + "epoch": 0.7042343040919318, + "grad_norm": 0.76953125, + "learning_rate": 4.5743589743589745e-06, + "loss": 1.6341, + "step": 110800 + }, + { + "epoch": 0.7048698946190906, + "grad_norm": 0.71875, + "learning_rate": 4.569230769230769e-06, + "loss": 1.6187, + "step": 110900 + }, + { + "epoch": 0.7055054851462493, + "grad_norm": 1.109375, + "learning_rate": 4.564102564102564e-06, + "loss": 1.6084, + "step": 111000 + }, + { + "epoch": 0.7061410756734081, + "grad_norm": 0.8515625, + "learning_rate": 4.558974358974359e-06, + "loss": 1.6216, + "step": 111100 + }, + { + "epoch": 0.7067766662005669, + "grad_norm": 0.90625, + "learning_rate": 4.553846153846154e-06, + "loss": 1.6216, + "step": 111200 + }, + { + "epoch": 0.7074122567277257, + "grad_norm": 0.6484375, + "learning_rate": 4.548717948717949e-06, + "loss": 1.6371, + "step": 111300 + }, + { + "epoch": 0.7080478472548846, + "grad_norm": 0.859375, + "learning_rate": 4.543589743589744e-06, + "loss": 1.6226, + "step": 111400 + }, + { + "epoch": 0.7086834377820433, + "grad_norm": 0.7265625, + "learning_rate": 4.538461538461539e-06, + "loss": 1.6266, + "step": 111500 + }, + { + "epoch": 0.7093190283092021, + "grad_norm": 0.921875, + "learning_rate": 4.533333333333334e-06, + "loss": 1.6185, + "step": 111600 + }, + { + "epoch": 0.7099546188363609, + "grad_norm": 0.89453125, + "learning_rate": 4.528205128205129e-06, + "loss": 1.6345, + "step": 111700 + }, + { + "epoch": 0.7105902093635197, + "grad_norm": 0.765625, + "learning_rate": 4.5230769230769235e-06, + "loss": 1.6351, + "step": 111800 + }, + { + "epoch": 0.7112257998906785, + "grad_norm": 1.203125, + "learning_rate": 4.517948717948718e-06, + "loss": 1.6055, + "step": 111900 + }, + { + "epoch": 0.7118613904178372, + "grad_norm": 0.87109375, + "learning_rate": 4.512820512820513e-06, + "loss": 1.6265, + "step": 112000 + }, + { + "epoch": 0.712496980944996, + "grad_norm": 0.671875, + "learning_rate": 4.507692307692308e-06, + "loss": 1.6411, + "step": 112100 + }, + { + "epoch": 0.7131325714721548, + "grad_norm": 0.69140625, + "learning_rate": 4.502564102564103e-06, + "loss": 1.627, + "step": 112200 + }, + { + "epoch": 0.7137681619993136, + "grad_norm": 1.125, + "learning_rate": 4.497435897435898e-06, + "loss": 1.618, + "step": 112300 + }, + { + "epoch": 0.7144037525264724, + "grad_norm": 0.9921875, + "learning_rate": 4.492307692307693e-06, + "loss": 1.6237, + "step": 112400 + }, + { + "epoch": 0.7150393430536311, + "grad_norm": 1.015625, + "learning_rate": 4.487179487179488e-06, + "loss": 1.6341, + "step": 112500 + }, + { + "epoch": 0.7156749335807899, + "grad_norm": 1.0859375, + "learning_rate": 4.482051282051283e-06, + "loss": 1.6358, + "step": 112600 + }, + { + "epoch": 0.7163105241079487, + "grad_norm": 0.93359375, + "learning_rate": 4.476923076923077e-06, + "loss": 1.6074, + "step": 112700 + }, + { + "epoch": 0.7169461146351075, + "grad_norm": 1.0390625, + "learning_rate": 4.4717948717948725e-06, + "loss": 1.624, + "step": 112800 + }, + { + "epoch": 0.7175817051622663, + "grad_norm": 0.73828125, + "learning_rate": 4.4666666666666665e-06, + "loss": 1.6122, + "step": 112900 + }, + { + "epoch": 0.718217295689425, + "grad_norm": 0.73046875, + "learning_rate": 4.461538461538462e-06, + "loss": 1.6184, + "step": 113000 + }, + { + "epoch": 0.7188528862165838, + "grad_norm": 1.1875, + "learning_rate": 4.456410256410257e-06, + "loss": 1.6202, + "step": 113100 + }, + { + "epoch": 0.7194884767437426, + "grad_norm": 0.6640625, + "learning_rate": 4.451282051282051e-06, + "loss": 1.6271, + "step": 113200 + }, + { + "epoch": 0.7201240672709014, + "grad_norm": 0.91796875, + "learning_rate": 4.446153846153847e-06, + "loss": 1.6147, + "step": 113300 + }, + { + "epoch": 0.7207596577980602, + "grad_norm": 0.65625, + "learning_rate": 4.441025641025641e-06, + "loss": 1.6253, + "step": 113400 + }, + { + "epoch": 0.721395248325219, + "grad_norm": 0.97265625, + "learning_rate": 4.435897435897436e-06, + "loss": 1.6078, + "step": 113500 + }, + { + "epoch": 0.7220308388523777, + "grad_norm": 0.7109375, + "learning_rate": 4.430769230769232e-06, + "loss": 1.633, + "step": 113600 + }, + { + "epoch": 0.7226664293795365, + "grad_norm": 0.80078125, + "learning_rate": 4.425641025641026e-06, + "loss": 1.6203, + "step": 113700 + }, + { + "epoch": 0.7233020199066953, + "grad_norm": 0.95703125, + "learning_rate": 4.420512820512821e-06, + "loss": 1.6073, + "step": 113800 + }, + { + "epoch": 0.7239376104338541, + "grad_norm": 0.76171875, + "learning_rate": 4.4153846153846155e-06, + "loss": 1.62, + "step": 113900 + }, + { + "epoch": 0.7245732009610129, + "grad_norm": 0.5859375, + "learning_rate": 4.4102564102564104e-06, + "loss": 1.6269, + "step": 114000 + }, + { + "epoch": 0.7252087914881716, + "grad_norm": 0.890625, + "learning_rate": 4.405128205128205e-06, + "loss": 1.636, + "step": 114100 + }, + { + "epoch": 0.7258443820153304, + "grad_norm": 1.2265625, + "learning_rate": 4.4e-06, + "loss": 1.6242, + "step": 114200 + }, + { + "epoch": 0.7264799725424892, + "grad_norm": 0.7421875, + "learning_rate": 4.394871794871795e-06, + "loss": 1.6222, + "step": 114300 + }, + { + "epoch": 0.727115563069648, + "grad_norm": 0.75390625, + "learning_rate": 4.38974358974359e-06, + "loss": 1.6353, + "step": 114400 + }, + { + "epoch": 0.7277511535968068, + "grad_norm": 0.89453125, + "learning_rate": 4.384615384615385e-06, + "loss": 1.6359, + "step": 114500 + }, + { + "epoch": 0.7283867441239655, + "grad_norm": 0.99609375, + "learning_rate": 4.37948717948718e-06, + "loss": 1.62, + "step": 114600 + }, + { + "epoch": 0.7290223346511243, + "grad_norm": 0.796875, + "learning_rate": 4.374358974358975e-06, + "loss": 1.6287, + "step": 114700 + }, + { + "epoch": 0.7296579251782831, + "grad_norm": 0.84765625, + "learning_rate": 4.36923076923077e-06, + "loss": 1.6362, + "step": 114800 + }, + { + "epoch": 0.7302935157054419, + "grad_norm": 0.6796875, + "learning_rate": 4.3641025641025645e-06, + "loss": 1.6261, + "step": 114900 + }, + { + "epoch": 0.7309291062326007, + "grad_norm": 0.7265625, + "learning_rate": 4.358974358974359e-06, + "loss": 1.6185, + "step": 115000 + }, + { + "epoch": 0.7315646967597595, + "grad_norm": 0.578125, + "learning_rate": 4.353846153846154e-06, + "loss": 1.6173, + "step": 115100 + }, + { + "epoch": 0.7322002872869183, + "grad_norm": 0.8515625, + "learning_rate": 4.348717948717949e-06, + "loss": 1.627, + "step": 115200 + }, + { + "epoch": 0.7328358778140771, + "grad_norm": 1.234375, + "learning_rate": 4.343589743589744e-06, + "loss": 1.643, + "step": 115300 + }, + { + "epoch": 0.7334714683412359, + "grad_norm": 0.96875, + "learning_rate": 4.338461538461539e-06, + "loss": 1.6276, + "step": 115400 + }, + { + "epoch": 0.7341070588683947, + "grad_norm": 0.75, + "learning_rate": 4.333333333333334e-06, + "loss": 1.6176, + "step": 115500 + }, + { + "epoch": 0.7347426493955534, + "grad_norm": 1.2734375, + "learning_rate": 4.328205128205129e-06, + "loss": 1.6187, + "step": 115600 + }, + { + "epoch": 0.7353782399227122, + "grad_norm": 0.92578125, + "learning_rate": 4.323076923076923e-06, + "loss": 1.6196, + "step": 115700 + }, + { + "epoch": 0.736013830449871, + "grad_norm": 1.2421875, + "learning_rate": 4.317948717948719e-06, + "loss": 1.6119, + "step": 115800 + }, + { + "epoch": 0.7366494209770298, + "grad_norm": 0.875, + "learning_rate": 4.312820512820513e-06, + "loss": 1.6179, + "step": 115900 + }, + { + "epoch": 0.7372850115041886, + "grad_norm": 0.9375, + "learning_rate": 4.307692307692308e-06, + "loss": 1.6227, + "step": 116000 + }, + { + "epoch": 0.7379206020313474, + "grad_norm": 0.83984375, + "learning_rate": 4.302564102564103e-06, + "loss": 1.6225, + "step": 116100 + }, + { + "epoch": 0.7385561925585061, + "grad_norm": 0.55078125, + "learning_rate": 4.297435897435897e-06, + "loss": 1.603, + "step": 116200 + }, + { + "epoch": 0.7391917830856649, + "grad_norm": 0.66796875, + "learning_rate": 4.292307692307693e-06, + "loss": 1.62, + "step": 116300 + }, + { + "epoch": 0.7398273736128237, + "grad_norm": 1.1640625, + "learning_rate": 4.287179487179487e-06, + "loss": 1.6205, + "step": 116400 + }, + { + "epoch": 0.7404629641399825, + "grad_norm": 1.1015625, + "learning_rate": 4.282051282051282e-06, + "loss": 1.6217, + "step": 116500 + }, + { + "epoch": 0.7410985546671413, + "grad_norm": 0.859375, + "learning_rate": 4.276923076923078e-06, + "loss": 1.6287, + "step": 116600 + }, + { + "epoch": 0.7417341451943, + "grad_norm": 0.82421875, + "learning_rate": 4.271794871794872e-06, + "loss": 1.6273, + "step": 116700 + }, + { + "epoch": 0.7423697357214588, + "grad_norm": 0.859375, + "learning_rate": 4.266666666666668e-06, + "loss": 1.644, + "step": 116800 + }, + { + "epoch": 0.7430053262486176, + "grad_norm": 0.9765625, + "learning_rate": 4.261538461538462e-06, + "loss": 1.6179, + "step": 116900 + }, + { + "epoch": 0.7436409167757764, + "grad_norm": 0.8671875, + "learning_rate": 4.2564102564102566e-06, + "loss": 1.625, + "step": 117000 + }, + { + "epoch": 0.7442765073029352, + "grad_norm": 0.6875, + "learning_rate": 4.2512820512820515e-06, + "loss": 1.6082, + "step": 117100 + }, + { + "epoch": 0.7449120978300939, + "grad_norm": 0.63671875, + "learning_rate": 4.246153846153846e-06, + "loss": 1.6159, + "step": 117200 + }, + { + "epoch": 0.7455476883572527, + "grad_norm": 0.61328125, + "learning_rate": 4.241025641025641e-06, + "loss": 1.6112, + "step": 117300 + }, + { + "epoch": 0.7461832788844115, + "grad_norm": 0.8984375, + "learning_rate": 4.235897435897436e-06, + "loss": 1.6248, + "step": 117400 + }, + { + "epoch": 0.7468188694115703, + "grad_norm": 0.83203125, + "learning_rate": 4.230769230769231e-06, + "loss": 1.6073, + "step": 117500 + }, + { + "epoch": 0.747454459938729, + "grad_norm": 0.78125, + "learning_rate": 4.225641025641026e-06, + "loss": 1.6288, + "step": 117600 + }, + { + "epoch": 0.7480900504658878, + "grad_norm": 1.078125, + "learning_rate": 4.220512820512821e-06, + "loss": 1.6191, + "step": 117700 + }, + { + "epoch": 0.7487256409930466, + "grad_norm": 0.78125, + "learning_rate": 4.215384615384616e-06, + "loss": 1.622, + "step": 117800 + }, + { + "epoch": 0.7493612315202054, + "grad_norm": 1.0546875, + "learning_rate": 4.210256410256411e-06, + "loss": 1.6162, + "step": 117900 + }, + { + "epoch": 0.7499968220473642, + "grad_norm": 1.1640625, + "learning_rate": 4.2051282051282055e-06, + "loss": 1.6305, + "step": 118000 + }, + { + "epoch": 0.750632412574523, + "grad_norm": 0.97265625, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.6306, + "step": 118100 + }, + { + "epoch": 0.7512680031016817, + "grad_norm": 0.8359375, + "learning_rate": 4.194871794871795e-06, + "loss": 1.627, + "step": 118200 + }, + { + "epoch": 0.7519035936288405, + "grad_norm": 0.81640625, + "learning_rate": 4.18974358974359e-06, + "loss": 1.6304, + "step": 118300 + }, + { + "epoch": 0.7525391841559993, + "grad_norm": 1.15625, + "learning_rate": 4.184615384615385e-06, + "loss": 1.6091, + "step": 118400 + }, + { + "epoch": 0.7531747746831581, + "grad_norm": 0.87109375, + "learning_rate": 4.17948717948718e-06, + "loss": 1.6114, + "step": 118500 + }, + { + "epoch": 0.7538103652103169, + "grad_norm": 0.8515625, + "learning_rate": 4.174358974358975e-06, + "loss": 1.6268, + "step": 118600 + }, + { + "epoch": 0.7544459557374756, + "grad_norm": 0.69921875, + "learning_rate": 4.169230769230769e-06, + "loss": 1.6143, + "step": 118700 + }, + { + "epoch": 0.7550815462646344, + "grad_norm": 0.69140625, + "learning_rate": 4.164102564102565e-06, + "loss": 1.6197, + "step": 118800 + }, + { + "epoch": 0.7557171367917933, + "grad_norm": 1.1484375, + "learning_rate": 4.158974358974359e-06, + "loss": 1.6161, + "step": 118900 + }, + { + "epoch": 0.7563527273189521, + "grad_norm": 0.8984375, + "learning_rate": 4.1538461538461545e-06, + "loss": 1.6146, + "step": 119000 + }, + { + "epoch": 0.7569883178461109, + "grad_norm": 0.83203125, + "learning_rate": 4.1487179487179494e-06, + "loss": 1.6216, + "step": 119100 + }, + { + "epoch": 0.7576239083732696, + "grad_norm": 1.0234375, + "learning_rate": 4.1435897435897435e-06, + "loss": 1.6253, + "step": 119200 + }, + { + "epoch": 0.7582594989004284, + "grad_norm": 0.71484375, + "learning_rate": 4.138461538461539e-06, + "loss": 1.6314, + "step": 119300 + }, + { + "epoch": 0.7588950894275872, + "grad_norm": 0.79296875, + "learning_rate": 4.133333333333333e-06, + "loss": 1.6238, + "step": 119400 + }, + { + "epoch": 0.759530679954746, + "grad_norm": 0.6953125, + "learning_rate": 4.128205128205128e-06, + "loss": 1.6102, + "step": 119500 + }, + { + "epoch": 0.7601662704819048, + "grad_norm": 0.96484375, + "learning_rate": 4.123076923076924e-06, + "loss": 1.6004, + "step": 119600 + }, + { + "epoch": 0.7608018610090636, + "grad_norm": 1.15625, + "learning_rate": 4.117948717948718e-06, + "loss": 1.618, + "step": 119700 + }, + { + "epoch": 0.7614374515362223, + "grad_norm": 0.77734375, + "learning_rate": 4.112820512820514e-06, + "loss": 1.6198, + "step": 119800 + }, + { + "epoch": 0.7620730420633811, + "grad_norm": 0.85546875, + "learning_rate": 4.107692307692308e-06, + "loss": 1.6105, + "step": 119900 + }, + { + "epoch": 0.7627086325905399, + "grad_norm": 0.57421875, + "learning_rate": 4.102564102564103e-06, + "loss": 1.61, + "step": 120000 + }, + { + "epoch": 0.7633442231176987, + "grad_norm": 1.09375, + "learning_rate": 4.097435897435898e-06, + "loss": 1.6288, + "step": 120100 + }, + { + "epoch": 0.7639798136448575, + "grad_norm": 0.609375, + "learning_rate": 4.0923076923076925e-06, + "loss": 1.6215, + "step": 120200 + }, + { + "epoch": 0.7646154041720162, + "grad_norm": 1.09375, + "learning_rate": 4.087179487179487e-06, + "loss": 1.6368, + "step": 120300 + }, + { + "epoch": 0.765250994699175, + "grad_norm": 0.76171875, + "learning_rate": 4.082051282051282e-06, + "loss": 1.6292, + "step": 120400 + }, + { + "epoch": 0.7658865852263338, + "grad_norm": 0.71875, + "learning_rate": 4.076923076923077e-06, + "loss": 1.6237, + "step": 120500 + }, + { + "epoch": 0.7665221757534926, + "grad_norm": 0.98046875, + "learning_rate": 4.071794871794872e-06, + "loss": 1.6285, + "step": 120600 + }, + { + "epoch": 0.7671577662806514, + "grad_norm": 0.83203125, + "learning_rate": 4.066666666666667e-06, + "loss": 1.6288, + "step": 120700 + }, + { + "epoch": 0.7677933568078101, + "grad_norm": 0.87890625, + "learning_rate": 4.061538461538462e-06, + "loss": 1.634, + "step": 120800 + }, + { + "epoch": 0.7684289473349689, + "grad_norm": 1.0078125, + "learning_rate": 4.056410256410257e-06, + "loss": 1.6295, + "step": 120900 + }, + { + "epoch": 0.7690645378621277, + "grad_norm": 0.87890625, + "learning_rate": 4.051282051282052e-06, + "loss": 1.6324, + "step": 121000 + }, + { + "epoch": 0.7697001283892865, + "grad_norm": 0.984375, + "learning_rate": 4.0461538461538466e-06, + "loss": 1.6293, + "step": 121100 + }, + { + "epoch": 0.7703357189164453, + "grad_norm": 0.86328125, + "learning_rate": 4.0410256410256415e-06, + "loss": 1.6165, + "step": 121200 + }, + { + "epoch": 0.770971309443604, + "grad_norm": 0.71875, + "learning_rate": 4.035897435897436e-06, + "loss": 1.6188, + "step": 121300 + }, + { + "epoch": 0.7716068999707628, + "grad_norm": 0.95703125, + "learning_rate": 4.030769230769231e-06, + "loss": 1.6288, + "step": 121400 + }, + { + "epoch": 0.7722424904979216, + "grad_norm": 0.7265625, + "learning_rate": 4.025641025641026e-06, + "loss": 1.6196, + "step": 121500 + }, + { + "epoch": 0.7728780810250804, + "grad_norm": 1.1171875, + "learning_rate": 4.020512820512821e-06, + "loss": 1.6457, + "step": 121600 + }, + { + "epoch": 0.7735136715522392, + "grad_norm": 0.86328125, + "learning_rate": 4.015384615384615e-06, + "loss": 1.6246, + "step": 121700 + }, + { + "epoch": 0.7741492620793979, + "grad_norm": 0.8359375, + "learning_rate": 4.010256410256411e-06, + "loss": 1.6151, + "step": 121800 + }, + { + "epoch": 0.7747848526065567, + "grad_norm": 1.0078125, + "learning_rate": 4.005128205128205e-06, + "loss": 1.6346, + "step": 121900 + }, + { + "epoch": 0.7754204431337155, + "grad_norm": 0.86328125, + "learning_rate": 4.000000000000001e-06, + "loss": 1.6198, + "step": 122000 + }, + { + "epoch": 0.7760560336608743, + "grad_norm": 1.171875, + "learning_rate": 3.9948717948717956e-06, + "loss": 1.6082, + "step": 122100 + }, + { + "epoch": 0.7766916241880331, + "grad_norm": 0.84765625, + "learning_rate": 3.98974358974359e-06, + "loss": 1.5981, + "step": 122200 + }, + { + "epoch": 0.7773272147151918, + "grad_norm": 0.66796875, + "learning_rate": 3.984615384615385e-06, + "loss": 1.6236, + "step": 122300 + }, + { + "epoch": 0.7779628052423506, + "grad_norm": 0.90625, + "learning_rate": 3.979487179487179e-06, + "loss": 1.6074, + "step": 122400 + }, + { + "epoch": 0.7785983957695094, + "grad_norm": 0.90625, + "learning_rate": 3.974358974358974e-06, + "loss": 1.6149, + "step": 122500 + }, + { + "epoch": 0.7792339862966682, + "grad_norm": 0.96875, + "learning_rate": 3.96923076923077e-06, + "loss": 1.6157, + "step": 122600 + }, + { + "epoch": 0.7798695768238271, + "grad_norm": 0.95703125, + "learning_rate": 3.964102564102564e-06, + "loss": 1.6356, + "step": 122700 + }, + { + "epoch": 0.7805051673509859, + "grad_norm": 0.87890625, + "learning_rate": 3.95897435897436e-06, + "loss": 1.6314, + "step": 122800 + }, + { + "epoch": 0.7811407578781446, + "grad_norm": 0.9609375, + "learning_rate": 3.953846153846154e-06, + "loss": 1.6252, + "step": 122900 + }, + { + "epoch": 0.7817763484053034, + "grad_norm": 1.1015625, + "learning_rate": 3.948717948717949e-06, + "loss": 1.6209, + "step": 123000 + }, + { + "epoch": 0.7824119389324622, + "grad_norm": 0.8671875, + "learning_rate": 3.943589743589744e-06, + "loss": 1.6145, + "step": 123100 + }, + { + "epoch": 0.783047529459621, + "grad_norm": 1.1015625, + "learning_rate": 3.938461538461539e-06, + "loss": 1.6273, + "step": 123200 + }, + { + "epoch": 0.7836831199867798, + "grad_norm": 0.90625, + "learning_rate": 3.9333333333333335e-06, + "loss": 1.6175, + "step": 123300 + }, + { + "epoch": 0.7843187105139385, + "grad_norm": 0.9765625, + "learning_rate": 3.928205128205128e-06, + "loss": 1.6289, + "step": 123400 + }, + { + "epoch": 0.7849543010410973, + "grad_norm": 0.828125, + "learning_rate": 3.923076923076923e-06, + "loss": 1.6126, + "step": 123500 + }, + { + "epoch": 0.7855898915682561, + "grad_norm": 0.703125, + "learning_rate": 3.917948717948718e-06, + "loss": 1.6222, + "step": 123600 + }, + { + "epoch": 0.7862254820954149, + "grad_norm": 0.87890625, + "learning_rate": 3.912820512820513e-06, + "loss": 1.6176, + "step": 123700 + }, + { + "epoch": 0.7868610726225737, + "grad_norm": 0.9921875, + "learning_rate": 3.907692307692308e-06, + "loss": 1.626, + "step": 123800 + }, + { + "epoch": 0.7874966631497324, + "grad_norm": 0.8046875, + "learning_rate": 3.902564102564103e-06, + "loss": 1.6348, + "step": 123900 + }, + { + "epoch": 0.7881322536768912, + "grad_norm": 0.77734375, + "learning_rate": 3.897435897435898e-06, + "loss": 1.6156, + "step": 124000 + }, + { + "epoch": 0.78876784420405, + "grad_norm": 0.71875, + "learning_rate": 3.892307692307693e-06, + "loss": 1.6132, + "step": 124100 + }, + { + "epoch": 0.7894034347312088, + "grad_norm": 0.99609375, + "learning_rate": 3.887179487179488e-06, + "loss": 1.6178, + "step": 124200 + }, + { + "epoch": 0.7900390252583676, + "grad_norm": 0.87890625, + "learning_rate": 3.8820512820512825e-06, + "loss": 1.6131, + "step": 124300 + }, + { + "epoch": 0.7906746157855263, + "grad_norm": 0.81640625, + "learning_rate": 3.876923076923077e-06, + "loss": 1.6157, + "step": 124400 + }, + { + "epoch": 0.7913102063126851, + "grad_norm": 1.0859375, + "learning_rate": 3.871794871794872e-06, + "loss": 1.6223, + "step": 124500 + }, + { + "epoch": 0.7919457968398439, + "grad_norm": 1.4296875, + "learning_rate": 3.866666666666667e-06, + "loss": 1.6285, + "step": 124600 + }, + { + "epoch": 0.7925813873670027, + "grad_norm": 0.97265625, + "learning_rate": 3.861538461538462e-06, + "loss": 1.6237, + "step": 124700 + }, + { + "epoch": 0.7932169778941615, + "grad_norm": 0.9140625, + "learning_rate": 3.856410256410257e-06, + "loss": 1.6423, + "step": 124800 + }, + { + "epoch": 0.7938525684213202, + "grad_norm": 0.6953125, + "learning_rate": 3.851282051282051e-06, + "loss": 1.6336, + "step": 124900 + }, + { + "epoch": 0.794488158948479, + "grad_norm": 0.96484375, + "learning_rate": 3.846153846153847e-06, + "loss": 1.6335, + "step": 125000 + }, + { + "epoch": 0.7951237494756378, + "grad_norm": 0.77734375, + "learning_rate": 3.841025641025642e-06, + "loss": 1.614, + "step": 125100 + }, + { + "epoch": 0.7957593400027966, + "grad_norm": 0.95703125, + "learning_rate": 3.835897435897436e-06, + "loss": 1.6191, + "step": 125200 + }, + { + "epoch": 0.7963949305299554, + "grad_norm": 0.9453125, + "learning_rate": 3.8307692307692315e-06, + "loss": 1.6222, + "step": 125300 + }, + { + "epoch": 0.7970305210571141, + "grad_norm": 0.61328125, + "learning_rate": 3.8256410256410255e-06, + "loss": 1.6125, + "step": 125400 + }, + { + "epoch": 0.7976661115842729, + "grad_norm": 1.1171875, + "learning_rate": 3.8205128205128204e-06, + "loss": 1.6322, + "step": 125500 + }, + { + "epoch": 0.7983017021114317, + "grad_norm": 0.78125, + "learning_rate": 3.815384615384616e-06, + "loss": 1.6159, + "step": 125600 + }, + { + "epoch": 0.7989372926385905, + "grad_norm": 0.83203125, + "learning_rate": 3.8102564102564107e-06, + "loss": 1.6374, + "step": 125700 + }, + { + "epoch": 0.7995728831657493, + "grad_norm": 0.8203125, + "learning_rate": 3.8051282051282056e-06, + "loss": 1.618, + "step": 125800 + }, + { + "epoch": 0.800208473692908, + "grad_norm": 0.69140625, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.6325, + "step": 125900 + }, + { + "epoch": 0.8008440642200668, + "grad_norm": 0.91796875, + "learning_rate": 3.794871794871795e-06, + "loss": 1.6121, + "step": 126000 + }, + { + "epoch": 0.8014796547472256, + "grad_norm": 1.4609375, + "learning_rate": 3.7897435897435903e-06, + "loss": 1.6173, + "step": 126100 + }, + { + "epoch": 0.8021152452743844, + "grad_norm": 1.1640625, + "learning_rate": 3.7846153846153847e-06, + "loss": 1.6166, + "step": 126200 + }, + { + "epoch": 0.8027508358015432, + "grad_norm": 1.0078125, + "learning_rate": 3.7794871794871796e-06, + "loss": 1.6234, + "step": 126300 + }, + { + "epoch": 0.803386426328702, + "grad_norm": 1.015625, + "learning_rate": 3.774358974358975e-06, + "loss": 1.6333, + "step": 126400 + }, + { + "epoch": 0.8040220168558608, + "grad_norm": 1.078125, + "learning_rate": 3.7692307692307694e-06, + "loss": 1.6345, + "step": 126500 + }, + { + "epoch": 0.8046576073830196, + "grad_norm": 0.80078125, + "learning_rate": 3.7641025641025643e-06, + "loss": 1.6255, + "step": 126600 + }, + { + "epoch": 0.8052931979101784, + "grad_norm": 0.7421875, + "learning_rate": 3.7589743589743592e-06, + "loss": 1.6247, + "step": 126700 + }, + { + "epoch": 0.8059287884373372, + "grad_norm": 1.3515625, + "learning_rate": 3.753846153846154e-06, + "loss": 1.6234, + "step": 126800 + }, + { + "epoch": 0.806564378964496, + "grad_norm": 1.203125, + "learning_rate": 3.7487179487179495e-06, + "loss": 1.6475, + "step": 126900 + }, + { + "epoch": 0.8071999694916547, + "grad_norm": 0.7109375, + "learning_rate": 3.743589743589744e-06, + "loss": 1.6282, + "step": 127000 + }, + { + "epoch": 0.8078355600188135, + "grad_norm": 1.03125, + "learning_rate": 3.7384615384615384e-06, + "loss": 1.6238, + "step": 127100 + }, + { + "epoch": 0.8084711505459723, + "grad_norm": 1.046875, + "learning_rate": 3.7333333333333337e-06, + "loss": 1.6218, + "step": 127200 + }, + { + "epoch": 0.8091067410731311, + "grad_norm": 0.91015625, + "learning_rate": 3.7282051282051286e-06, + "loss": 1.6254, + "step": 127300 + }, + { + "epoch": 0.8097423316002899, + "grad_norm": 0.94921875, + "learning_rate": 3.723076923076923e-06, + "loss": 1.6161, + "step": 127400 + }, + { + "epoch": 0.8103779221274486, + "grad_norm": 0.61328125, + "learning_rate": 3.7179487179487184e-06, + "loss": 1.6167, + "step": 127500 + }, + { + "epoch": 0.8110135126546074, + "grad_norm": 1.2578125, + "learning_rate": 3.712820512820513e-06, + "loss": 1.6153, + "step": 127600 + }, + { + "epoch": 0.8116491031817662, + "grad_norm": 0.84765625, + "learning_rate": 3.7076923076923082e-06, + "loss": 1.6216, + "step": 127700 + }, + { + "epoch": 0.812284693708925, + "grad_norm": 0.99609375, + "learning_rate": 3.702564102564103e-06, + "loss": 1.6247, + "step": 127800 + }, + { + "epoch": 0.8129202842360838, + "grad_norm": 0.93359375, + "learning_rate": 3.6974358974358976e-06, + "loss": 1.6345, + "step": 127900 + }, + { + "epoch": 0.8135558747632425, + "grad_norm": 0.94140625, + "learning_rate": 3.692307692307693e-06, + "loss": 1.6206, + "step": 128000 + }, + { + "epoch": 0.8141914652904013, + "grad_norm": 1.0078125, + "learning_rate": 3.6871794871794874e-06, + "loss": 1.626, + "step": 128100 + }, + { + "epoch": 0.8148270558175601, + "grad_norm": 1.484375, + "learning_rate": 3.6820512820512823e-06, + "loss": 1.6217, + "step": 128200 + }, + { + "epoch": 0.8154626463447189, + "grad_norm": 0.8515625, + "learning_rate": 3.676923076923077e-06, + "loss": 1.6246, + "step": 128300 + }, + { + "epoch": 0.8160982368718777, + "grad_norm": 0.93359375, + "learning_rate": 3.671794871794872e-06, + "loss": 1.6156, + "step": 128400 + }, + { + "epoch": 0.8167338273990364, + "grad_norm": 0.9921875, + "learning_rate": 3.6666666666666666e-06, + "loss": 1.6211, + "step": 128500 + }, + { + "epoch": 0.8173694179261952, + "grad_norm": 1.15625, + "learning_rate": 3.661538461538462e-06, + "loss": 1.6198, + "step": 128600 + }, + { + "epoch": 0.818005008453354, + "grad_norm": 0.90234375, + "learning_rate": 3.656410256410257e-06, + "loss": 1.6179, + "step": 128700 + }, + { + "epoch": 0.8186405989805128, + "grad_norm": 0.890625, + "learning_rate": 3.6512820512820517e-06, + "loss": 1.6355, + "step": 128800 + }, + { + "epoch": 0.8192761895076716, + "grad_norm": 0.83984375, + "learning_rate": 3.6461538461538466e-06, + "loss": 1.6277, + "step": 128900 + }, + { + "epoch": 0.8199117800348303, + "grad_norm": 0.8046875, + "learning_rate": 3.641025641025641e-06, + "loss": 1.6202, + "step": 129000 + }, + { + "epoch": 0.8205473705619891, + "grad_norm": 0.87890625, + "learning_rate": 3.6358974358974364e-06, + "loss": 1.6254, + "step": 129100 + }, + { + "epoch": 0.8211829610891479, + "grad_norm": 0.75390625, + "learning_rate": 3.630769230769231e-06, + "loss": 1.6199, + "step": 129200 + }, + { + "epoch": 0.8218185516163067, + "grad_norm": 0.9375, + "learning_rate": 3.6256410256410258e-06, + "loss": 1.615, + "step": 129300 + }, + { + "epoch": 0.8224541421434655, + "grad_norm": 0.85546875, + "learning_rate": 3.620512820512821e-06, + "loss": 1.6212, + "step": 129400 + }, + { + "epoch": 0.8230897326706242, + "grad_norm": 0.98046875, + "learning_rate": 3.6153846153846156e-06, + "loss": 1.6211, + "step": 129500 + }, + { + "epoch": 0.823725323197783, + "grad_norm": 1.0625, + "learning_rate": 3.610256410256411e-06, + "loss": 1.6148, + "step": 129600 + }, + { + "epoch": 0.8243609137249418, + "grad_norm": 0.8828125, + "learning_rate": 3.6051282051282054e-06, + "loss": 1.6238, + "step": 129700 + }, + { + "epoch": 0.8249965042521006, + "grad_norm": 0.66796875, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.6315, + "step": 129800 + }, + { + "epoch": 0.8256320947792594, + "grad_norm": 0.8515625, + "learning_rate": 3.5948717948717956e-06, + "loss": 1.6074, + "step": 129900 + }, + { + "epoch": 0.8262676853064181, + "grad_norm": 0.94921875, + "learning_rate": 3.58974358974359e-06, + "loss": 1.6063, + "step": 130000 + }, + { + "epoch": 0.8269032758335769, + "grad_norm": 1.1015625, + "learning_rate": 3.5846153846153845e-06, + "loss": 1.6152, + "step": 130100 + }, + { + "epoch": 0.8275388663607358, + "grad_norm": 0.765625, + "learning_rate": 3.57948717948718e-06, + "loss": 1.5949, + "step": 130200 + }, + { + "epoch": 0.8281744568878946, + "grad_norm": 0.77734375, + "learning_rate": 3.5743589743589748e-06, + "loss": 1.6273, + "step": 130300 + }, + { + "epoch": 0.8288100474150534, + "grad_norm": 0.671875, + "learning_rate": 3.5692307692307692e-06, + "loss": 1.6031, + "step": 130400 + }, + { + "epoch": 0.8294456379422122, + "grad_norm": 0.8828125, + "learning_rate": 3.5641025641025646e-06, + "loss": 1.625, + "step": 130500 + }, + { + "epoch": 0.8300812284693709, + "grad_norm": 1.0390625, + "learning_rate": 3.558974358974359e-06, + "loss": 1.6359, + "step": 130600 + }, + { + "epoch": 0.8307168189965297, + "grad_norm": 0.85546875, + "learning_rate": 3.5538461538461544e-06, + "loss": 1.626, + "step": 130700 + }, + { + "epoch": 0.8313524095236885, + "grad_norm": 0.94921875, + "learning_rate": 3.5487179487179493e-06, + "loss": 1.6101, + "step": 130800 + }, + { + "epoch": 0.8319880000508473, + "grad_norm": 0.95703125, + "learning_rate": 3.5435897435897437e-06, + "loss": 1.6109, + "step": 130900 + }, + { + "epoch": 0.8326235905780061, + "grad_norm": 0.7421875, + "learning_rate": 3.538461538461539e-06, + "loss": 1.6194, + "step": 131000 + }, + { + "epoch": 0.8332591811051648, + "grad_norm": 0.5, + "learning_rate": 3.5333333333333335e-06, + "loss": 1.6146, + "step": 131100 + }, + { + "epoch": 0.8338947716323236, + "grad_norm": 0.93359375, + "learning_rate": 3.5282051282051284e-06, + "loss": 1.6276, + "step": 131200 + }, + { + "epoch": 0.8345303621594824, + "grad_norm": 1.265625, + "learning_rate": 3.5230769230769233e-06, + "loss": 1.6218, + "step": 131300 + }, + { + "epoch": 0.8351659526866412, + "grad_norm": 0.9921875, + "learning_rate": 3.5179487179487182e-06, + "loss": 1.6442, + "step": 131400 + }, + { + "epoch": 0.8358015432138, + "grad_norm": 0.77734375, + "learning_rate": 3.5128205128205127e-06, + "loss": 1.6282, + "step": 131500 + }, + { + "epoch": 0.8364371337409587, + "grad_norm": 1.0625, + "learning_rate": 3.507692307692308e-06, + "loss": 1.6294, + "step": 131600 + }, + { + "epoch": 0.8370727242681175, + "grad_norm": 0.9609375, + "learning_rate": 3.502564102564103e-06, + "loss": 1.6143, + "step": 131700 + }, + { + "epoch": 0.8377083147952763, + "grad_norm": 1.0234375, + "learning_rate": 3.497435897435898e-06, + "loss": 1.6447, + "step": 131800 + }, + { + "epoch": 0.8383439053224351, + "grad_norm": 1.015625, + "learning_rate": 3.4923076923076927e-06, + "loss": 1.6045, + "step": 131900 + }, + { + "epoch": 0.8389794958495939, + "grad_norm": 0.6640625, + "learning_rate": 3.487179487179487e-06, + "loss": 1.6307, + "step": 132000 + }, + { + "epoch": 0.8396150863767526, + "grad_norm": 0.859375, + "learning_rate": 3.4820512820512825e-06, + "loss": 1.6028, + "step": 132100 + }, + { + "epoch": 0.8402506769039114, + "grad_norm": 0.8125, + "learning_rate": 3.476923076923077e-06, + "loss": 1.6198, + "step": 132200 + }, + { + "epoch": 0.8408862674310702, + "grad_norm": 0.71875, + "learning_rate": 3.471794871794872e-06, + "loss": 1.634, + "step": 132300 + }, + { + "epoch": 0.841521857958229, + "grad_norm": 0.84765625, + "learning_rate": 3.4666666666666672e-06, + "loss": 1.6291, + "step": 132400 + }, + { + "epoch": 0.8421574484853878, + "grad_norm": 0.65234375, + "learning_rate": 3.4615384615384617e-06, + "loss": 1.631, + "step": 132500 + }, + { + "epoch": 0.8427930390125465, + "grad_norm": 0.94921875, + "learning_rate": 3.456410256410257e-06, + "loss": 1.6228, + "step": 132600 + }, + { + "epoch": 0.8434286295397053, + "grad_norm": 0.7265625, + "learning_rate": 3.4512820512820515e-06, + "loss": 1.6166, + "step": 132700 + }, + { + "epoch": 0.8440642200668641, + "grad_norm": 0.9921875, + "learning_rate": 3.4461538461538464e-06, + "loss": 1.6259, + "step": 132800 + }, + { + "epoch": 0.8446998105940229, + "grad_norm": 0.78515625, + "learning_rate": 3.4410256410256417e-06, + "loss": 1.6147, + "step": 132900 + }, + { + "epoch": 0.8453354011211817, + "grad_norm": 0.8046875, + "learning_rate": 3.435897435897436e-06, + "loss": 1.6124, + "step": 133000 + }, + { + "epoch": 0.8459709916483404, + "grad_norm": 0.79296875, + "learning_rate": 3.4307692307692307e-06, + "loss": 1.6137, + "step": 133100 + }, + { + "epoch": 0.8466065821754992, + "grad_norm": 0.9765625, + "learning_rate": 3.425641025641026e-06, + "loss": 1.622, + "step": 133200 + }, + { + "epoch": 0.847242172702658, + "grad_norm": 0.68359375, + "learning_rate": 3.420512820512821e-06, + "loss": 1.6324, + "step": 133300 + }, + { + "epoch": 0.8478777632298168, + "grad_norm": 0.84765625, + "learning_rate": 3.4153846153846154e-06, + "loss": 1.6176, + "step": 133400 + }, + { + "epoch": 0.8485133537569756, + "grad_norm": 0.7734375, + "learning_rate": 3.4102564102564107e-06, + "loss": 1.6249, + "step": 133500 + }, + { + "epoch": 0.8491489442841343, + "grad_norm": 0.78515625, + "learning_rate": 3.405128205128205e-06, + "loss": 1.635, + "step": 133600 + }, + { + "epoch": 0.8497845348112931, + "grad_norm": 0.90234375, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.6369, + "step": 133700 + }, + { + "epoch": 0.8504201253384519, + "grad_norm": 0.953125, + "learning_rate": 3.3948717948717954e-06, + "loss": 1.6082, + "step": 133800 + }, + { + "epoch": 0.8510557158656107, + "grad_norm": 0.80859375, + "learning_rate": 3.38974358974359e-06, + "loss": 1.6106, + "step": 133900 + }, + { + "epoch": 0.8516913063927696, + "grad_norm": 1.3203125, + "learning_rate": 3.384615384615385e-06, + "loss": 1.6174, + "step": 134000 + }, + { + "epoch": 0.8523268969199284, + "grad_norm": 0.83984375, + "learning_rate": 3.3794871794871797e-06, + "loss": 1.6293, + "step": 134100 + }, + { + "epoch": 0.8529624874470871, + "grad_norm": 0.83984375, + "learning_rate": 3.3743589743589746e-06, + "loss": 1.6205, + "step": 134200 + }, + { + "epoch": 0.8535980779742459, + "grad_norm": 1.03125, + "learning_rate": 3.3692307692307695e-06, + "loss": 1.611, + "step": 134300 + }, + { + "epoch": 0.8542336685014047, + "grad_norm": 0.640625, + "learning_rate": 3.3641025641025644e-06, + "loss": 1.648, + "step": 134400 + }, + { + "epoch": 0.8548692590285635, + "grad_norm": 1.203125, + "learning_rate": 3.358974358974359e-06, + "loss": 1.621, + "step": 134500 + }, + { + "epoch": 0.8555048495557223, + "grad_norm": 1.296875, + "learning_rate": 3.353846153846154e-06, + "loss": 1.6387, + "step": 134600 + }, + { + "epoch": 0.856140440082881, + "grad_norm": 1.1171875, + "learning_rate": 3.348717948717949e-06, + "loss": 1.6229, + "step": 134700 + }, + { + "epoch": 0.8567760306100398, + "grad_norm": 1.0078125, + "learning_rate": 3.343589743589744e-06, + "loss": 1.616, + "step": 134800 + }, + { + "epoch": 0.8574116211371986, + "grad_norm": 1.25, + "learning_rate": 3.338461538461539e-06, + "loss": 1.6196, + "step": 134900 + }, + { + "epoch": 0.8580472116643574, + "grad_norm": 1.0234375, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.6187, + "step": 135000 + }, + { + "epoch": 0.8586828021915162, + "grad_norm": 0.9140625, + "learning_rate": 3.3282051282051286e-06, + "loss": 1.6255, + "step": 135100 + }, + { + "epoch": 0.8593183927186749, + "grad_norm": 0.765625, + "learning_rate": 3.323076923076923e-06, + "loss": 1.6298, + "step": 135200 + }, + { + "epoch": 0.8599539832458337, + "grad_norm": 1.0625, + "learning_rate": 3.317948717948718e-06, + "loss": 1.6271, + "step": 135300 + }, + { + "epoch": 0.8605895737729925, + "grad_norm": 0.66796875, + "learning_rate": 3.3128205128205133e-06, + "loss": 1.6132, + "step": 135400 + }, + { + "epoch": 0.8612251643001513, + "grad_norm": 0.83203125, + "learning_rate": 3.307692307692308e-06, + "loss": 1.6222, + "step": 135500 + }, + { + "epoch": 0.8618607548273101, + "grad_norm": 1.1953125, + "learning_rate": 3.302564102564103e-06, + "loss": 1.6203, + "step": 135600 + }, + { + "epoch": 0.8624963453544688, + "grad_norm": 1.0625, + "learning_rate": 3.2974358974358976e-06, + "loss": 1.6274, + "step": 135700 + }, + { + "epoch": 0.8631319358816276, + "grad_norm": 1.03125, + "learning_rate": 3.2923076923076925e-06, + "loss": 1.6089, + "step": 135800 + }, + { + "epoch": 0.8637675264087864, + "grad_norm": 0.8515625, + "learning_rate": 3.287179487179488e-06, + "loss": 1.621, + "step": 135900 + }, + { + "epoch": 0.8644031169359452, + "grad_norm": 1.09375, + "learning_rate": 3.2820512820512823e-06, + "loss": 1.604, + "step": 136000 + }, + { + "epoch": 0.865038707463104, + "grad_norm": 0.828125, + "learning_rate": 3.276923076923077e-06, + "loss": 1.6205, + "step": 136100 + }, + { + "epoch": 0.8656742979902627, + "grad_norm": 0.9375, + "learning_rate": 3.271794871794872e-06, + "loss": 1.6287, + "step": 136200 + }, + { + "epoch": 0.8663098885174215, + "grad_norm": 1.2890625, + "learning_rate": 3.266666666666667e-06, + "loss": 1.6314, + "step": 136300 + }, + { + "epoch": 0.8669454790445803, + "grad_norm": 0.90625, + "learning_rate": 3.2615384615384615e-06, + "loss": 1.6229, + "step": 136400 + }, + { + "epoch": 0.8675810695717391, + "grad_norm": 0.9375, + "learning_rate": 3.256410256410257e-06, + "loss": 1.6234, + "step": 136500 + }, + { + "epoch": 0.8682166600988979, + "grad_norm": 0.7734375, + "learning_rate": 3.2512820512820513e-06, + "loss": 1.6114, + "step": 136600 + }, + { + "epoch": 0.8688522506260566, + "grad_norm": 0.60546875, + "learning_rate": 3.2461538461538466e-06, + "loss": 1.6299, + "step": 136700 + }, + { + "epoch": 0.8694878411532154, + "grad_norm": 0.859375, + "learning_rate": 3.2410256410256415e-06, + "loss": 1.5946, + "step": 136800 + }, + { + "epoch": 0.8701234316803742, + "grad_norm": 0.921875, + "learning_rate": 3.235897435897436e-06, + "loss": 1.6175, + "step": 136900 + }, + { + "epoch": 0.870759022207533, + "grad_norm": 0.95703125, + "learning_rate": 3.2307692307692313e-06, + "loss": 1.616, + "step": 137000 + }, + { + "epoch": 0.8713946127346918, + "grad_norm": 1.328125, + "learning_rate": 3.2256410256410258e-06, + "loss": 1.6083, + "step": 137100 + }, + { + "epoch": 0.8720302032618505, + "grad_norm": 1.0625, + "learning_rate": 3.2205128205128207e-06, + "loss": 1.6048, + "step": 137200 + }, + { + "epoch": 0.8726657937890093, + "grad_norm": 1.015625, + "learning_rate": 3.2153846153846156e-06, + "loss": 1.6351, + "step": 137300 + }, + { + "epoch": 0.8733013843161681, + "grad_norm": 1.0703125, + "learning_rate": 3.2102564102564105e-06, + "loss": 1.6187, + "step": 137400 + }, + { + "epoch": 0.8739369748433269, + "grad_norm": 0.7421875, + "learning_rate": 3.205128205128206e-06, + "loss": 1.616, + "step": 137500 + }, + { + "epoch": 0.8745725653704857, + "grad_norm": 0.88671875, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6311, + "step": 137600 + }, + { + "epoch": 0.8752081558976444, + "grad_norm": 0.76171875, + "learning_rate": 3.194871794871795e-06, + "loss": 1.6204, + "step": 137700 + }, + { + "epoch": 0.8758437464248033, + "grad_norm": 1.1953125, + "learning_rate": 3.18974358974359e-06, + "loss": 1.6455, + "step": 137800 + }, + { + "epoch": 0.8764793369519621, + "grad_norm": 0.73046875, + "learning_rate": 3.184615384615385e-06, + "loss": 1.6207, + "step": 137900 + }, + { + "epoch": 0.8771149274791209, + "grad_norm": 0.9375, + "learning_rate": 3.1794871794871795e-06, + "loss": 1.6087, + "step": 138000 + }, + { + "epoch": 0.8777505180062797, + "grad_norm": 1.1328125, + "learning_rate": 3.1743589743589748e-06, + "loss": 1.6088, + "step": 138100 + }, + { + "epoch": 0.8783861085334385, + "grad_norm": 0.87890625, + "learning_rate": 3.1692307692307693e-06, + "loss": 1.6146, + "step": 138200 + }, + { + "epoch": 0.8790216990605972, + "grad_norm": 0.87890625, + "learning_rate": 3.164102564102564e-06, + "loss": 1.609, + "step": 138300 + }, + { + "epoch": 0.879657289587756, + "grad_norm": 0.84375, + "learning_rate": 3.1589743589743595e-06, + "loss": 1.6187, + "step": 138400 + }, + { + "epoch": 0.8802928801149148, + "grad_norm": 1.0078125, + "learning_rate": 3.153846153846154e-06, + "loss": 1.6228, + "step": 138500 + }, + { + "epoch": 0.8809284706420736, + "grad_norm": 0.87890625, + "learning_rate": 3.1487179487179493e-06, + "loss": 1.6435, + "step": 138600 + }, + { + "epoch": 0.8815640611692324, + "grad_norm": 1.3203125, + "learning_rate": 3.1435897435897437e-06, + "loss": 1.6138, + "step": 138700 + }, + { + "epoch": 0.8821996516963911, + "grad_norm": 0.8359375, + "learning_rate": 3.1384615384615386e-06, + "loss": 1.619, + "step": 138800 + }, + { + "epoch": 0.8828352422235499, + "grad_norm": 0.8203125, + "learning_rate": 3.133333333333334e-06, + "loss": 1.629, + "step": 138900 + }, + { + "epoch": 0.8834708327507087, + "grad_norm": 0.9296875, + "learning_rate": 3.1282051282051284e-06, + "loss": 1.6192, + "step": 139000 + }, + { + "epoch": 0.8841064232778675, + "grad_norm": 0.7421875, + "learning_rate": 3.123076923076923e-06, + "loss": 1.6226, + "step": 139100 + }, + { + "epoch": 0.8847420138050263, + "grad_norm": 0.92578125, + "learning_rate": 3.1179487179487182e-06, + "loss": 1.6317, + "step": 139200 + }, + { + "epoch": 0.885377604332185, + "grad_norm": 0.80078125, + "learning_rate": 3.112820512820513e-06, + "loss": 1.6209, + "step": 139300 + }, + { + "epoch": 0.8860131948593438, + "grad_norm": 0.98828125, + "learning_rate": 3.1076923076923076e-06, + "loss": 1.6208, + "step": 139400 + }, + { + "epoch": 0.8866487853865026, + "grad_norm": 0.88671875, + "learning_rate": 3.102564102564103e-06, + "loss": 1.626, + "step": 139500 + }, + { + "epoch": 0.8872843759136614, + "grad_norm": 0.76953125, + "learning_rate": 3.0974358974358974e-06, + "loss": 1.6158, + "step": 139600 + }, + { + "epoch": 0.8879199664408202, + "grad_norm": 0.8046875, + "learning_rate": 3.0923076923076927e-06, + "loss": 1.6279, + "step": 139700 + }, + { + "epoch": 0.8885555569679789, + "grad_norm": 1.4140625, + "learning_rate": 3.0871794871794876e-06, + "loss": 1.628, + "step": 139800 + }, + { + "epoch": 0.8891911474951377, + "grad_norm": 0.9375, + "learning_rate": 3.082051282051282e-06, + "loss": 1.6184, + "step": 139900 + }, + { + "epoch": 0.8898267380222965, + "grad_norm": 1.1796875, + "learning_rate": 3.0769230769230774e-06, + "loss": 1.6352, + "step": 140000 + }, + { + "epoch": 0.8904623285494553, + "grad_norm": 1.203125, + "learning_rate": 3.071794871794872e-06, + "loss": 1.6027, + "step": 140100 + }, + { + "epoch": 0.8910979190766141, + "grad_norm": 1.7421875, + "learning_rate": 3.066666666666667e-06, + "loss": 1.6102, + "step": 140200 + }, + { + "epoch": 0.8917335096037728, + "grad_norm": 0.83984375, + "learning_rate": 3.0615384615384617e-06, + "loss": 1.6218, + "step": 140300 + }, + { + "epoch": 0.8923691001309316, + "grad_norm": 0.95703125, + "learning_rate": 3.0564102564102566e-06, + "loss": 1.6068, + "step": 140400 + }, + { + "epoch": 0.8930046906580904, + "grad_norm": 0.75, + "learning_rate": 3.051282051282052e-06, + "loss": 1.6191, + "step": 140500 + }, + { + "epoch": 0.8936402811852492, + "grad_norm": 0.87109375, + "learning_rate": 3.0461538461538464e-06, + "loss": 1.6149, + "step": 140600 + }, + { + "epoch": 0.894275871712408, + "grad_norm": 0.73828125, + "learning_rate": 3.0410256410256413e-06, + "loss": 1.6343, + "step": 140700 + }, + { + "epoch": 0.8949114622395667, + "grad_norm": 1.1640625, + "learning_rate": 3.035897435897436e-06, + "loss": 1.6081, + "step": 140800 + }, + { + "epoch": 0.8955470527667255, + "grad_norm": 0.6484375, + "learning_rate": 3.030769230769231e-06, + "loss": 1.6139, + "step": 140900 + }, + { + "epoch": 0.8961826432938843, + "grad_norm": 0.9375, + "learning_rate": 3.0256410256410256e-06, + "loss": 1.6054, + "step": 141000 + }, + { + "epoch": 0.8968182338210431, + "grad_norm": 0.9453125, + "learning_rate": 3.020512820512821e-06, + "loss": 1.6071, + "step": 141100 + }, + { + "epoch": 0.8974538243482019, + "grad_norm": 0.83203125, + "learning_rate": 3.0153846153846154e-06, + "loss": 1.6072, + "step": 141200 + }, + { + "epoch": 0.8980894148753606, + "grad_norm": 0.6796875, + "learning_rate": 3.0102564102564103e-06, + "loss": 1.6221, + "step": 141300 + }, + { + "epoch": 0.8987250054025194, + "grad_norm": 0.875, + "learning_rate": 3.0051282051282056e-06, + "loss": 1.6284, + "step": 141400 + }, + { + "epoch": 0.8993605959296783, + "grad_norm": 0.6953125, + "learning_rate": 3e-06, + "loss": 1.6133, + "step": 141500 + }, + { + "epoch": 0.8999961864568371, + "grad_norm": 0.75, + "learning_rate": 2.9948717948717954e-06, + "loss": 1.6361, + "step": 141600 + }, + { + "epoch": 0.9006317769839959, + "grad_norm": 0.9375, + "learning_rate": 2.98974358974359e-06, + "loss": 1.613, + "step": 141700 + }, + { + "epoch": 0.9012673675111547, + "grad_norm": 1.0859375, + "learning_rate": 2.9846153846153848e-06, + "loss": 1.6331, + "step": 141800 + }, + { + "epoch": 0.9019029580383134, + "grad_norm": 0.83984375, + "learning_rate": 2.97948717948718e-06, + "loss": 1.6272, + "step": 141900 + }, + { + "epoch": 0.9025385485654722, + "grad_norm": 0.70703125, + "learning_rate": 2.9743589743589746e-06, + "loss": 1.6234, + "step": 142000 + }, + { + "epoch": 0.903174139092631, + "grad_norm": 0.83984375, + "learning_rate": 2.969230769230769e-06, + "loss": 1.6222, + "step": 142100 + }, + { + "epoch": 0.9038097296197898, + "grad_norm": 0.79296875, + "learning_rate": 2.9641025641025644e-06, + "loss": 1.6143, + "step": 142200 + }, + { + "epoch": 0.9044453201469486, + "grad_norm": 0.79296875, + "learning_rate": 2.9589743589743593e-06, + "loss": 1.6141, + "step": 142300 + }, + { + "epoch": 0.9050809106741073, + "grad_norm": 0.89453125, + "learning_rate": 2.953846153846154e-06, + "loss": 1.6266, + "step": 142400 + }, + { + "epoch": 0.9057165012012661, + "grad_norm": 0.79296875, + "learning_rate": 2.948717948717949e-06, + "loss": 1.6443, + "step": 142500 + }, + { + "epoch": 0.9063520917284249, + "grad_norm": 0.859375, + "learning_rate": 2.9435897435897435e-06, + "loss": 1.6212, + "step": 142600 + }, + { + "epoch": 0.9069876822555837, + "grad_norm": 1.1796875, + "learning_rate": 2.938461538461539e-06, + "loss": 1.6373, + "step": 142700 + }, + { + "epoch": 0.9076232727827425, + "grad_norm": 0.703125, + "learning_rate": 2.9333333333333338e-06, + "loss": 1.6184, + "step": 142800 + }, + { + "epoch": 0.9082588633099012, + "grad_norm": 0.9140625, + "learning_rate": 2.9282051282051282e-06, + "loss": 1.6146, + "step": 142900 + }, + { + "epoch": 0.90889445383706, + "grad_norm": 0.9375, + "learning_rate": 2.9230769230769236e-06, + "loss": 1.6159, + "step": 143000 + }, + { + "epoch": 0.9095300443642188, + "grad_norm": 1.0859375, + "learning_rate": 2.917948717948718e-06, + "loss": 1.6432, + "step": 143100 + }, + { + "epoch": 0.9101656348913776, + "grad_norm": 0.86328125, + "learning_rate": 2.912820512820513e-06, + "loss": 1.6271, + "step": 143200 + }, + { + "epoch": 0.9108012254185364, + "grad_norm": 1.0703125, + "learning_rate": 2.907692307692308e-06, + "loss": 1.6198, + "step": 143300 + }, + { + "epoch": 0.9114368159456951, + "grad_norm": 0.84375, + "learning_rate": 2.9025641025641027e-06, + "loss": 1.638, + "step": 143400 + }, + { + "epoch": 0.9120724064728539, + "grad_norm": 1.234375, + "learning_rate": 2.897435897435898e-06, + "loss": 1.6263, + "step": 143500 + }, + { + "epoch": 0.9127079970000127, + "grad_norm": 1.0078125, + "learning_rate": 2.8923076923076925e-06, + "loss": 1.6231, + "step": 143600 + }, + { + "epoch": 0.9133435875271715, + "grad_norm": 0.64453125, + "learning_rate": 2.8871794871794874e-06, + "loss": 1.6218, + "step": 143700 + }, + { + "epoch": 0.9139791780543303, + "grad_norm": 0.85546875, + "learning_rate": 2.8820512820512823e-06, + "loss": 1.6251, + "step": 143800 + }, + { + "epoch": 0.914614768581489, + "grad_norm": 0.6640625, + "learning_rate": 2.8769230769230772e-06, + "loss": 1.6289, + "step": 143900 + }, + { + "epoch": 0.9152503591086478, + "grad_norm": 1.109375, + "learning_rate": 2.8717948717948717e-06, + "loss": 1.6282, + "step": 144000 + }, + { + "epoch": 0.9158859496358066, + "grad_norm": 0.890625, + "learning_rate": 2.866666666666667e-06, + "loss": 1.6281, + "step": 144100 + }, + { + "epoch": 0.9165215401629654, + "grad_norm": 1.046875, + "learning_rate": 2.8615384615384615e-06, + "loss": 1.6153, + "step": 144200 + }, + { + "epoch": 0.9171571306901242, + "grad_norm": 1.0390625, + "learning_rate": 2.8564102564102564e-06, + "loss": 1.6339, + "step": 144300 + }, + { + "epoch": 0.917792721217283, + "grad_norm": 0.78515625, + "learning_rate": 2.8512820512820517e-06, + "loss": 1.6205, + "step": 144400 + }, + { + "epoch": 0.9184283117444417, + "grad_norm": 1.1875, + "learning_rate": 2.846153846153846e-06, + "loss": 1.6251, + "step": 144500 + }, + { + "epoch": 0.9190639022716005, + "grad_norm": 1.015625, + "learning_rate": 2.8410256410256415e-06, + "loss": 1.6175, + "step": 144600 + }, + { + "epoch": 0.9196994927987593, + "grad_norm": 1.65625, + "learning_rate": 2.835897435897436e-06, + "loss": 1.6274, + "step": 144700 + }, + { + "epoch": 0.9203350833259181, + "grad_norm": 0.85546875, + "learning_rate": 2.830769230769231e-06, + "loss": 1.6275, + "step": 144800 + }, + { + "epoch": 0.9209706738530768, + "grad_norm": 0.8359375, + "learning_rate": 2.8256410256410262e-06, + "loss": 1.6317, + "step": 144900 + }, + { + "epoch": 0.9216062643802356, + "grad_norm": 1.171875, + "learning_rate": 2.8205128205128207e-06, + "loss": 1.6217, + "step": 145000 + }, + { + "epoch": 0.9222418549073944, + "grad_norm": 0.953125, + "learning_rate": 2.815384615384615e-06, + "loss": 1.6317, + "step": 145100 + }, + { + "epoch": 0.9228774454345532, + "grad_norm": 0.7421875, + "learning_rate": 2.8102564102564105e-06, + "loss": 1.6154, + "step": 145200 + }, + { + "epoch": 0.9235130359617121, + "grad_norm": 0.91015625, + "learning_rate": 2.8051282051282054e-06, + "loss": 1.6229, + "step": 145300 + }, + { + "epoch": 0.9241486264888709, + "grad_norm": 0.99609375, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.6387, + "step": 145400 + }, + { + "epoch": 0.9247842170160296, + "grad_norm": 1.015625, + "learning_rate": 2.794871794871795e-06, + "loss": 1.6295, + "step": 145500 + }, + { + "epoch": 0.9254198075431884, + "grad_norm": 1.140625, + "learning_rate": 2.7897435897435897e-06, + "loss": 1.5936, + "step": 145600 + }, + { + "epoch": 0.9260553980703472, + "grad_norm": 1.046875, + "learning_rate": 2.784615384615385e-06, + "loss": 1.6272, + "step": 145700 + }, + { + "epoch": 0.926690988597506, + "grad_norm": 1.03125, + "learning_rate": 2.77948717948718e-06, + "loss": 1.6146, + "step": 145800 + }, + { + "epoch": 0.9273265791246648, + "grad_norm": 1.015625, + "learning_rate": 2.7743589743589744e-06, + "loss": 1.6311, + "step": 145900 + }, + { + "epoch": 0.9279621696518235, + "grad_norm": 0.90234375, + "learning_rate": 2.7692307692307697e-06, + "loss": 1.6133, + "step": 146000 + }, + { + "epoch": 0.9285977601789823, + "grad_norm": 0.80078125, + "learning_rate": 2.764102564102564e-06, + "loss": 1.6253, + "step": 146100 + }, + { + "epoch": 0.9292333507061411, + "grad_norm": 0.9140625, + "learning_rate": 2.758974358974359e-06, + "loss": 1.6415, + "step": 146200 + }, + { + "epoch": 0.9298689412332999, + "grad_norm": 0.859375, + "learning_rate": 2.753846153846154e-06, + "loss": 1.6187, + "step": 146300 + }, + { + "epoch": 0.9305045317604587, + "grad_norm": 1.1484375, + "learning_rate": 2.748717948717949e-06, + "loss": 1.6169, + "step": 146400 + }, + { + "epoch": 0.9311401222876174, + "grad_norm": 0.640625, + "learning_rate": 2.743589743589744e-06, + "loss": 1.6248, + "step": 146500 + }, + { + "epoch": 0.9317757128147762, + "grad_norm": 0.97265625, + "learning_rate": 2.7384615384615387e-06, + "loss": 1.6086, + "step": 146600 + }, + { + "epoch": 0.932411303341935, + "grad_norm": 1.046875, + "learning_rate": 2.7333333333333336e-06, + "loss": 1.6358, + "step": 146700 + }, + { + "epoch": 0.9330468938690938, + "grad_norm": 0.875, + "learning_rate": 2.7282051282051285e-06, + "loss": 1.6129, + "step": 146800 + }, + { + "epoch": 0.9336824843962526, + "grad_norm": 0.87109375, + "learning_rate": 2.7230769230769234e-06, + "loss": 1.6228, + "step": 146900 + }, + { + "epoch": 0.9343180749234113, + "grad_norm": 0.90625, + "learning_rate": 2.717948717948718e-06, + "loss": 1.623, + "step": 147000 + }, + { + "epoch": 0.9349536654505701, + "grad_norm": 0.6484375, + "learning_rate": 2.712820512820513e-06, + "loss": 1.633, + "step": 147100 + }, + { + "epoch": 0.9355892559777289, + "grad_norm": 0.703125, + "learning_rate": 2.7076923076923076e-06, + "loss": 1.6282, + "step": 147200 + }, + { + "epoch": 0.9362248465048877, + "grad_norm": 0.88671875, + "learning_rate": 2.7025641025641025e-06, + "loss": 1.6231, + "step": 147300 + }, + { + "epoch": 0.9368604370320465, + "grad_norm": 1.21875, + "learning_rate": 2.697435897435898e-06, + "loss": 1.6084, + "step": 147400 + }, + { + "epoch": 0.9374960275592052, + "grad_norm": 0.95703125, + "learning_rate": 2.6923076923076923e-06, + "loss": 1.6344, + "step": 147500 + }, + { + "epoch": 0.938131618086364, + "grad_norm": 0.734375, + "learning_rate": 2.6871794871794877e-06, + "loss": 1.6374, + "step": 147600 + }, + { + "epoch": 0.9387672086135228, + "grad_norm": 0.8203125, + "learning_rate": 2.682051282051282e-06, + "loss": 1.6311, + "step": 147700 + }, + { + "epoch": 0.9394027991406816, + "grad_norm": 1.0625, + "learning_rate": 2.676923076923077e-06, + "loss": 1.6174, + "step": 147800 + }, + { + "epoch": 0.9400383896678404, + "grad_norm": 0.703125, + "learning_rate": 2.6717948717948724e-06, + "loss": 1.6077, + "step": 147900 + }, + { + "epoch": 0.9406739801949991, + "grad_norm": 0.88671875, + "learning_rate": 2.666666666666667e-06, + "loss": 1.6211, + "step": 148000 + }, + { + "epoch": 0.9413095707221579, + "grad_norm": 0.80078125, + "learning_rate": 2.6615384615384613e-06, + "loss": 1.6337, + "step": 148100 + }, + { + "epoch": 0.9419451612493167, + "grad_norm": 1.0859375, + "learning_rate": 2.6564102564102566e-06, + "loss": 1.6298, + "step": 148200 + }, + { + "epoch": 0.9425807517764755, + "grad_norm": 1.0703125, + "learning_rate": 2.6512820512820515e-06, + "loss": 1.6167, + "step": 148300 + }, + { + "epoch": 0.9432163423036343, + "grad_norm": 1.5625, + "learning_rate": 2.6461538461538464e-06, + "loss": 1.6251, + "step": 148400 + }, + { + "epoch": 0.943851932830793, + "grad_norm": 1.28125, + "learning_rate": 2.6410256410256413e-06, + "loss": 1.631, + "step": 148500 + }, + { + "epoch": 0.9444875233579518, + "grad_norm": 1.0546875, + "learning_rate": 2.635897435897436e-06, + "loss": 1.6133, + "step": 148600 + }, + { + "epoch": 0.9451231138851106, + "grad_norm": 0.921875, + "learning_rate": 2.630769230769231e-06, + "loss": 1.6298, + "step": 148700 + }, + { + "epoch": 0.9457587044122694, + "grad_norm": 0.91015625, + "learning_rate": 2.625641025641026e-06, + "loss": 1.6176, + "step": 148800 + }, + { + "epoch": 0.9463942949394282, + "grad_norm": 0.78515625, + "learning_rate": 2.6205128205128205e-06, + "loss": 1.6192, + "step": 148900 + }, + { + "epoch": 0.947029885466587, + "grad_norm": 0.96484375, + "learning_rate": 2.615384615384616e-06, + "loss": 1.6072, + "step": 149000 + }, + { + "epoch": 0.9476654759937458, + "grad_norm": 1.140625, + "learning_rate": 2.6102564102564103e-06, + "loss": 1.6265, + "step": 149100 + }, + { + "epoch": 0.9483010665209046, + "grad_norm": 0.9375, + "learning_rate": 2.605128205128205e-06, + "loss": 1.6223, + "step": 149200 + }, + { + "epoch": 0.9489366570480634, + "grad_norm": 0.921875, + "learning_rate": 2.6e-06, + "loss": 1.6233, + "step": 149300 + }, + { + "epoch": 0.9495722475752222, + "grad_norm": 1.1015625, + "learning_rate": 2.594871794871795e-06, + "loss": 1.6216, + "step": 149400 + }, + { + "epoch": 0.950207838102381, + "grad_norm": 0.9765625, + "learning_rate": 2.5897435897435903e-06, + "loss": 1.6383, + "step": 149500 + }, + { + "epoch": 0.9508434286295397, + "grad_norm": 0.78125, + "learning_rate": 2.584615384615385e-06, + "loss": 1.627, + "step": 149600 + }, + { + "epoch": 0.9514790191566985, + "grad_norm": 1.0390625, + "learning_rate": 2.5794871794871797e-06, + "loss": 1.6252, + "step": 149700 + }, + { + "epoch": 0.9521146096838573, + "grad_norm": 0.89453125, + "learning_rate": 2.5743589743589746e-06, + "loss": 1.6155, + "step": 149800 + }, + { + "epoch": 0.9527502002110161, + "grad_norm": 1.0859375, + "learning_rate": 2.5692307692307695e-06, + "loss": 1.6213, + "step": 149900 + }, + { + "epoch": 0.9533857907381749, + "grad_norm": 1.0859375, + "learning_rate": 2.564102564102564e-06, + "loss": 1.6184, + "step": 150000 + }, + { + "epoch": 0.9540213812653336, + "grad_norm": 1.0703125, + "learning_rate": 2.5589743589743593e-06, + "loss": 1.6158, + "step": 150100 + }, + { + "epoch": 0.9546569717924924, + "grad_norm": 0.875, + "learning_rate": 2.5538461538461538e-06, + "loss": 1.6308, + "step": 150200 + }, + { + "epoch": 0.9552925623196512, + "grad_norm": 0.87109375, + "learning_rate": 2.548717948717949e-06, + "loss": 1.6245, + "step": 150300 + }, + { + "epoch": 0.95592815284681, + "grad_norm": 0.890625, + "learning_rate": 2.543589743589744e-06, + "loss": 1.6166, + "step": 150400 + }, + { + "epoch": 0.9565637433739688, + "grad_norm": 0.953125, + "learning_rate": 2.5384615384615385e-06, + "loss": 1.6234, + "step": 150500 + }, + { + "epoch": 0.9571993339011275, + "grad_norm": 0.8359375, + "learning_rate": 2.5333333333333338e-06, + "loss": 1.6191, + "step": 150600 + }, + { + "epoch": 0.9578349244282863, + "grad_norm": 0.96875, + "learning_rate": 2.5282051282051283e-06, + "loss": 1.6152, + "step": 150700 + }, + { + "epoch": 0.9584705149554451, + "grad_norm": 0.76953125, + "learning_rate": 2.523076923076923e-06, + "loss": 1.6097, + "step": 150800 + }, + { + "epoch": 0.9591061054826039, + "grad_norm": 0.88671875, + "learning_rate": 2.5179487179487185e-06, + "loss": 1.6461, + "step": 150900 + }, + { + "epoch": 0.9597416960097627, + "grad_norm": 1.3046875, + "learning_rate": 2.512820512820513e-06, + "loss": 1.6146, + "step": 151000 + }, + { + "epoch": 0.9603772865369214, + "grad_norm": 0.9375, + "learning_rate": 2.507692307692308e-06, + "loss": 1.6435, + "step": 151100 + }, + { + "epoch": 0.9610128770640802, + "grad_norm": 0.859375, + "learning_rate": 2.5025641025641028e-06, + "loss": 1.6163, + "step": 151200 + }, + { + "epoch": 0.961648467591239, + "grad_norm": 0.69921875, + "learning_rate": 2.4974358974358977e-06, + "loss": 1.6316, + "step": 151300 + }, + { + "epoch": 0.9622840581183978, + "grad_norm": 1.109375, + "learning_rate": 2.4923076923076926e-06, + "loss": 1.6194, + "step": 151400 + }, + { + "epoch": 0.9629196486455566, + "grad_norm": 1.0703125, + "learning_rate": 2.4871794871794875e-06, + "loss": 1.6299, + "step": 151500 + }, + { + "epoch": 0.9635552391727153, + "grad_norm": 1.046875, + "learning_rate": 2.4820512820512824e-06, + "loss": 1.6017, + "step": 151600 + }, + { + "epoch": 0.9641908296998741, + "grad_norm": 1.1796875, + "learning_rate": 2.4769230769230773e-06, + "loss": 1.6184, + "step": 151700 + }, + { + "epoch": 0.9648264202270329, + "grad_norm": 0.921875, + "learning_rate": 2.471794871794872e-06, + "loss": 1.6238, + "step": 151800 + }, + { + "epoch": 0.9654620107541917, + "grad_norm": 0.8125, + "learning_rate": 2.466666666666667e-06, + "loss": 1.6227, + "step": 151900 + }, + { + "epoch": 0.9660976012813505, + "grad_norm": 1.0078125, + "learning_rate": 2.461538461538462e-06, + "loss": 1.6209, + "step": 152000 + }, + { + "epoch": 0.9667331918085093, + "grad_norm": 1.0, + "learning_rate": 2.4564102564102564e-06, + "loss": 1.6293, + "step": 152100 + }, + { + "epoch": 0.967368782335668, + "grad_norm": 0.81640625, + "learning_rate": 2.4512820512820513e-06, + "loss": 1.6269, + "step": 152200 + }, + { + "epoch": 0.9680043728628268, + "grad_norm": 1.0859375, + "learning_rate": 2.4461538461538466e-06, + "loss": 1.6321, + "step": 152300 + }, + { + "epoch": 0.9686399633899856, + "grad_norm": 0.828125, + "learning_rate": 2.441025641025641e-06, + "loss": 1.621, + "step": 152400 + }, + { + "epoch": 0.9692755539171444, + "grad_norm": 0.828125, + "learning_rate": 2.435897435897436e-06, + "loss": 1.6389, + "step": 152500 + }, + { + "epoch": 0.9699111444443032, + "grad_norm": 0.94921875, + "learning_rate": 2.430769230769231e-06, + "loss": 1.6327, + "step": 152600 + }, + { + "epoch": 0.9705467349714619, + "grad_norm": 0.6640625, + "learning_rate": 2.425641025641026e-06, + "loss": 1.6313, + "step": 152700 + }, + { + "epoch": 0.9711823254986207, + "grad_norm": 1.1640625, + "learning_rate": 2.4205128205128207e-06, + "loss": 1.6189, + "step": 152800 + }, + { + "epoch": 0.9718179160257796, + "grad_norm": 0.765625, + "learning_rate": 2.4153846153846156e-06, + "loss": 1.6292, + "step": 152900 + }, + { + "epoch": 0.9724535065529384, + "grad_norm": 1.2578125, + "learning_rate": 2.4102564102564105e-06, + "loss": 1.6358, + "step": 153000 + }, + { + "epoch": 0.9730890970800972, + "grad_norm": 1.0703125, + "learning_rate": 2.4051282051282054e-06, + "loss": 1.6183, + "step": 153100 + }, + { + "epoch": 0.973724687607256, + "grad_norm": 1.3125, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.6149, + "step": 153200 + }, + { + "epoch": 0.9743602781344147, + "grad_norm": 1.0390625, + "learning_rate": 2.3948717948717952e-06, + "loss": 1.612, + "step": 153300 + }, + { + "epoch": 0.9749958686615735, + "grad_norm": 1.015625, + "learning_rate": 2.38974358974359e-06, + "loss": 1.6341, + "step": 153400 + }, + { + "epoch": 0.9756314591887323, + "grad_norm": 0.9375, + "learning_rate": 2.384615384615385e-06, + "loss": 1.621, + "step": 153500 + }, + { + "epoch": 0.9762670497158911, + "grad_norm": 1.265625, + "learning_rate": 2.3794871794871795e-06, + "loss": 1.6103, + "step": 153600 + }, + { + "epoch": 0.9769026402430498, + "grad_norm": 0.96484375, + "learning_rate": 2.3743589743589744e-06, + "loss": 1.6274, + "step": 153700 + }, + { + "epoch": 0.9775382307702086, + "grad_norm": 1.078125, + "learning_rate": 2.3692307692307697e-06, + "loss": 1.6163, + "step": 153800 + }, + { + "epoch": 0.9781738212973674, + "grad_norm": 1.140625, + "learning_rate": 2.364102564102564e-06, + "loss": 1.6313, + "step": 153900 + }, + { + "epoch": 0.9788094118245262, + "grad_norm": 0.703125, + "learning_rate": 2.358974358974359e-06, + "loss": 1.613, + "step": 154000 + }, + { + "epoch": 0.979445002351685, + "grad_norm": 0.74609375, + "learning_rate": 2.353846153846154e-06, + "loss": 1.6266, + "step": 154100 + }, + { + "epoch": 0.9800805928788437, + "grad_norm": 0.9453125, + "learning_rate": 2.348717948717949e-06, + "loss": 1.6425, + "step": 154200 + }, + { + "epoch": 0.9807161834060025, + "grad_norm": 1.09375, + "learning_rate": 2.3435897435897438e-06, + "loss": 1.6238, + "step": 154300 + }, + { + "epoch": 0.9813517739331613, + "grad_norm": 0.86328125, + "learning_rate": 2.3384615384615387e-06, + "loss": 1.6389, + "step": 154400 + }, + { + "epoch": 0.9819873644603201, + "grad_norm": 0.8046875, + "learning_rate": 2.3333333333333336e-06, + "loss": 1.6078, + "step": 154500 + }, + { + "epoch": 0.9826229549874789, + "grad_norm": 1.3203125, + "learning_rate": 2.3282051282051285e-06, + "loss": 1.6403, + "step": 154600 + }, + { + "epoch": 0.9832585455146376, + "grad_norm": 1.0546875, + "learning_rate": 2.3230769230769234e-06, + "loss": 1.6259, + "step": 154700 + }, + { + "epoch": 0.9838941360417964, + "grad_norm": 1.0703125, + "learning_rate": 2.3179487179487183e-06, + "loss": 1.6166, + "step": 154800 + }, + { + "epoch": 0.9845297265689552, + "grad_norm": 1.1015625, + "learning_rate": 2.312820512820513e-06, + "loss": 1.6286, + "step": 154900 + }, + { + "epoch": 0.985165317096114, + "grad_norm": 1.125, + "learning_rate": 2.307692307692308e-06, + "loss": 1.6285, + "step": 155000 + }, + { + "epoch": 0.9858009076232728, + "grad_norm": 0.71875, + "learning_rate": 2.3025641025641026e-06, + "loss": 1.6191, + "step": 155100 + }, + { + "epoch": 0.9864364981504316, + "grad_norm": 1.125, + "learning_rate": 2.2974358974358975e-06, + "loss": 1.6229, + "step": 155200 + }, + { + "epoch": 0.9870720886775903, + "grad_norm": 0.6953125, + "learning_rate": 2.2923076923076928e-06, + "loss": 1.6099, + "step": 155300 + }, + { + "epoch": 0.9877076792047491, + "grad_norm": 0.90234375, + "learning_rate": 2.2871794871794872e-06, + "loss": 1.621, + "step": 155400 + }, + { + "epoch": 0.9883432697319079, + "grad_norm": 1.0625, + "learning_rate": 2.282051282051282e-06, + "loss": 1.6241, + "step": 155500 + }, + { + "epoch": 0.9889788602590667, + "grad_norm": 0.8828125, + "learning_rate": 2.276923076923077e-06, + "loss": 1.6201, + "step": 155600 + }, + { + "epoch": 0.9896144507862255, + "grad_norm": 0.80859375, + "learning_rate": 2.271794871794872e-06, + "loss": 1.6286, + "step": 155700 + }, + { + "epoch": 0.9902500413133842, + "grad_norm": 1.2734375, + "learning_rate": 2.266666666666667e-06, + "loss": 1.6277, + "step": 155800 + }, + { + "epoch": 0.990885631840543, + "grad_norm": 0.99609375, + "learning_rate": 2.2615384615384617e-06, + "loss": 1.6249, + "step": 155900 + }, + { + "epoch": 0.9915212223677018, + "grad_norm": 0.78125, + "learning_rate": 2.2564102564102566e-06, + "loss": 1.6259, + "step": 156000 + }, + { + "epoch": 0.9921568128948606, + "grad_norm": 0.94921875, + "learning_rate": 2.2512820512820515e-06, + "loss": 1.6344, + "step": 156100 + }, + { + "epoch": 0.9927924034220194, + "grad_norm": 1.2421875, + "learning_rate": 2.2461538461538464e-06, + "loss": 1.6339, + "step": 156200 + }, + { + "epoch": 0.9934279939491781, + "grad_norm": 0.85546875, + "learning_rate": 2.2410256410256413e-06, + "loss": 1.6155, + "step": 156300 + }, + { + "epoch": 0.9940635844763369, + "grad_norm": 1.1015625, + "learning_rate": 2.2358974358974362e-06, + "loss": 1.6393, + "step": 156400 + }, + { + "epoch": 0.9946991750034957, + "grad_norm": 0.99609375, + "learning_rate": 2.230769230769231e-06, + "loss": 1.6203, + "step": 156500 + }, + { + "epoch": 0.9953347655306546, + "grad_norm": 0.7421875, + "learning_rate": 2.2256410256410256e-06, + "loss": 1.6214, + "step": 156600 + }, + { + "epoch": 0.9959703560578134, + "grad_norm": 1.0390625, + "learning_rate": 2.2205128205128205e-06, + "loss": 1.6092, + "step": 156700 + }, + { + "epoch": 0.9966059465849721, + "grad_norm": 1.0390625, + "learning_rate": 2.215384615384616e-06, + "loss": 1.6123, + "step": 156800 + }, + { + "epoch": 0.9972415371121309, + "grad_norm": 1.171875, + "learning_rate": 2.2102564102564103e-06, + "loss": 1.6175, + "step": 156900 + }, + { + "epoch": 0.9978771276392897, + "grad_norm": 0.91796875, + "learning_rate": 2.2051282051282052e-06, + "loss": 1.5962, + "step": 157000 + }, + { + "epoch": 0.9985127181664485, + "grad_norm": 0.58984375, + "learning_rate": 2.2e-06, + "loss": 1.6391, + "step": 157100 + }, + { + "epoch": 0.9991483086936073, + "grad_norm": 1.0078125, + "learning_rate": 2.194871794871795e-06, + "loss": 1.6408, + "step": 157200 + }, + { + "epoch": 0.999783899220766, + "grad_norm": 0.85546875, + "learning_rate": 2.18974358974359e-06, + "loss": 1.6303, + "step": 157300 + }, + { + "epoch": 1.0004194897479248, + "grad_norm": 1.1640625, + "learning_rate": 2.184615384615385e-06, + "loss": 1.6148, + "step": 157400 + }, + { + "epoch": 1.0010550802750835, + "grad_norm": 1.1015625, + "learning_rate": 2.1794871794871797e-06, + "loss": 1.6173, + "step": 157500 + }, + { + "epoch": 1.0016906708022424, + "grad_norm": 0.953125, + "learning_rate": 2.1743589743589746e-06, + "loss": 1.6077, + "step": 157600 + }, + { + "epoch": 1.002326261329401, + "grad_norm": 1.171875, + "learning_rate": 2.1692307692307695e-06, + "loss": 1.6165, + "step": 157700 + }, + { + "epoch": 1.00296185185656, + "grad_norm": 0.98046875, + "learning_rate": 2.1641025641025644e-06, + "loss": 1.6246, + "step": 157800 + }, + { + "epoch": 1.0035974423837186, + "grad_norm": 0.87109375, + "learning_rate": 2.1589743589743593e-06, + "loss": 1.6128, + "step": 157900 + }, + { + "epoch": 1.0042330329108775, + "grad_norm": 0.82421875, + "learning_rate": 2.153846153846154e-06, + "loss": 1.6114, + "step": 158000 + }, + { + "epoch": 1.0048686234380362, + "grad_norm": 0.80859375, + "learning_rate": 2.1487179487179487e-06, + "loss": 1.6056, + "step": 158100 + }, + { + "epoch": 1.005504213965195, + "grad_norm": 0.92578125, + "learning_rate": 2.1435897435897436e-06, + "loss": 1.6178, + "step": 158200 + }, + { + "epoch": 1.0061398044923537, + "grad_norm": 0.9609375, + "learning_rate": 2.138461538461539e-06, + "loss": 1.6287, + "step": 158300 + }, + { + "epoch": 1.0067753950195126, + "grad_norm": 0.9140625, + "learning_rate": 2.133333333333334e-06, + "loss": 1.6419, + "step": 158400 + }, + { + "epoch": 1.0074109855466715, + "grad_norm": 1.21875, + "learning_rate": 2.1282051282051283e-06, + "loss": 1.6179, + "step": 158500 + }, + { + "epoch": 1.0080465760738302, + "grad_norm": 0.98828125, + "learning_rate": 2.123076923076923e-06, + "loss": 1.6242, + "step": 158600 + }, + { + "epoch": 1.008682166600989, + "grad_norm": 0.859375, + "learning_rate": 2.117948717948718e-06, + "loss": 1.6114, + "step": 158700 + }, + { + "epoch": 1.0093177571281478, + "grad_norm": 0.80859375, + "learning_rate": 2.112820512820513e-06, + "loss": 1.6039, + "step": 158800 + }, + { + "epoch": 1.0099533476553066, + "grad_norm": 1.015625, + "learning_rate": 2.107692307692308e-06, + "loss": 1.6231, + "step": 158900 + }, + { + "epoch": 1.0105889381824653, + "grad_norm": 1.1640625, + "learning_rate": 2.1025641025641028e-06, + "loss": 1.6244, + "step": 159000 + }, + { + "epoch": 1.0112245287096242, + "grad_norm": 0.9375, + "learning_rate": 2.0974358974358977e-06, + "loss": 1.629, + "step": 159100 + }, + { + "epoch": 1.0118601192367829, + "grad_norm": 0.8046875, + "learning_rate": 2.0923076923076926e-06, + "loss": 1.6334, + "step": 159200 + }, + { + "epoch": 1.0124957097639418, + "grad_norm": 0.7578125, + "learning_rate": 2.0871794871794875e-06, + "loss": 1.6162, + "step": 159300 + }, + { + "epoch": 1.0131313002911004, + "grad_norm": 0.84765625, + "learning_rate": 2.0820512820512824e-06, + "loss": 1.622, + "step": 159400 + }, + { + "epoch": 1.0137668908182593, + "grad_norm": 0.94140625, + "learning_rate": 2.0769230769230773e-06, + "loss": 1.6202, + "step": 159500 + }, + { + "epoch": 1.014402481345418, + "grad_norm": 0.953125, + "learning_rate": 2.0717948717948717e-06, + "loss": 1.6231, + "step": 159600 + }, + { + "epoch": 1.0150380718725769, + "grad_norm": 0.73046875, + "learning_rate": 2.0666666666666666e-06, + "loss": 1.6075, + "step": 159700 + }, + { + "epoch": 1.0156736623997356, + "grad_norm": 0.8125, + "learning_rate": 2.061538461538462e-06, + "loss": 1.6256, + "step": 159800 + }, + { + "epoch": 1.0163092529268944, + "grad_norm": 0.73828125, + "learning_rate": 2.056410256410257e-06, + "loss": 1.6224, + "step": 159900 + }, + { + "epoch": 1.0169448434540531, + "grad_norm": 0.90234375, + "learning_rate": 2.0512820512820513e-06, + "loss": 1.6256, + "step": 160000 + }, + { + "epoch": 1.017580433981212, + "grad_norm": 0.78125, + "learning_rate": 2.0461538461538462e-06, + "loss": 1.6318, + "step": 160100 + }, + { + "epoch": 1.0182160245083707, + "grad_norm": 0.73828125, + "learning_rate": 2.041025641025641e-06, + "loss": 1.621, + "step": 160200 + }, + { + "epoch": 1.0188516150355296, + "grad_norm": 1.2734375, + "learning_rate": 2.035897435897436e-06, + "loss": 1.6265, + "step": 160300 + }, + { + "epoch": 1.0194872055626882, + "grad_norm": 0.984375, + "learning_rate": 2.030769230769231e-06, + "loss": 1.6318, + "step": 160400 + }, + { + "epoch": 1.0201227960898471, + "grad_norm": 1.234375, + "learning_rate": 2.025641025641026e-06, + "loss": 1.6279, + "step": 160500 + }, + { + "epoch": 1.0207583866170058, + "grad_norm": 0.92578125, + "learning_rate": 2.0205128205128207e-06, + "loss": 1.6012, + "step": 160600 + }, + { + "epoch": 1.0213939771441647, + "grad_norm": 1.0390625, + "learning_rate": 2.0153846153846156e-06, + "loss": 1.6238, + "step": 160700 + }, + { + "epoch": 1.0220295676713234, + "grad_norm": 1.5234375, + "learning_rate": 2.0102564102564105e-06, + "loss": 1.6219, + "step": 160800 + }, + { + "epoch": 1.0226651581984822, + "grad_norm": 0.76171875, + "learning_rate": 2.0051282051282054e-06, + "loss": 1.6225, + "step": 160900 + }, + { + "epoch": 1.023300748725641, + "grad_norm": 0.921875, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.6197, + "step": 161000 + }, + { + "epoch": 1.0239363392527998, + "grad_norm": 0.7421875, + "learning_rate": 1.994871794871795e-06, + "loss": 1.6083, + "step": 161100 + }, + { + "epoch": 1.0245719297799585, + "grad_norm": 0.9765625, + "learning_rate": 1.9897435897435897e-06, + "loss": 1.641, + "step": 161200 + }, + { + "epoch": 1.0252075203071174, + "grad_norm": 0.6484375, + "learning_rate": 1.984615384615385e-06, + "loss": 1.6369, + "step": 161300 + }, + { + "epoch": 1.025843110834276, + "grad_norm": 0.9765625, + "learning_rate": 1.97948717948718e-06, + "loss": 1.6109, + "step": 161400 + }, + { + "epoch": 1.026478701361435, + "grad_norm": 1.0703125, + "learning_rate": 1.9743589743589744e-06, + "loss": 1.6108, + "step": 161500 + }, + { + "epoch": 1.0271142918885936, + "grad_norm": 1.0703125, + "learning_rate": 1.9692307692307693e-06, + "loss": 1.6292, + "step": 161600 + }, + { + "epoch": 1.0277498824157525, + "grad_norm": 1.3828125, + "learning_rate": 1.964102564102564e-06, + "loss": 1.6273, + "step": 161700 + }, + { + "epoch": 1.0283854729429112, + "grad_norm": 0.77734375, + "learning_rate": 1.958974358974359e-06, + "loss": 1.6217, + "step": 161800 + }, + { + "epoch": 1.02902106347007, + "grad_norm": 0.89453125, + "learning_rate": 1.953846153846154e-06, + "loss": 1.6362, + "step": 161900 + }, + { + "epoch": 1.0296566539972287, + "grad_norm": 1.125, + "learning_rate": 1.948717948717949e-06, + "loss": 1.6055, + "step": 162000 + }, + { + "epoch": 1.0302922445243876, + "grad_norm": 1.2109375, + "learning_rate": 1.943589743589744e-06, + "loss": 1.6192, + "step": 162100 + }, + { + "epoch": 1.0309278350515463, + "grad_norm": 0.75390625, + "learning_rate": 1.9384615384615387e-06, + "loss": 1.6168, + "step": 162200 + }, + { + "epoch": 1.0315634255787052, + "grad_norm": 1.0234375, + "learning_rate": 1.9333333333333336e-06, + "loss": 1.6188, + "step": 162300 + }, + { + "epoch": 1.032199016105864, + "grad_norm": 0.8203125, + "learning_rate": 1.9282051282051285e-06, + "loss": 1.6285, + "step": 162400 + }, + { + "epoch": 1.0328346066330227, + "grad_norm": 0.765625, + "learning_rate": 1.9230769230769234e-06, + "loss": 1.6258, + "step": 162500 + }, + { + "epoch": 1.0334701971601816, + "grad_norm": 1.0859375, + "learning_rate": 1.917948717948718e-06, + "loss": 1.6148, + "step": 162600 + }, + { + "epoch": 1.0341057876873403, + "grad_norm": 1.1328125, + "learning_rate": 1.9128205128205128e-06, + "loss": 1.6059, + "step": 162700 + }, + { + "epoch": 1.0347413782144992, + "grad_norm": 0.87109375, + "learning_rate": 1.907692307692308e-06, + "loss": 1.635, + "step": 162800 + }, + { + "epoch": 1.0353769687416579, + "grad_norm": 0.90234375, + "learning_rate": 1.9025641025641028e-06, + "loss": 1.6108, + "step": 162900 + }, + { + "epoch": 1.0360125592688167, + "grad_norm": 0.93359375, + "learning_rate": 1.8974358974358975e-06, + "loss": 1.6288, + "step": 163000 + }, + { + "epoch": 1.0366481497959754, + "grad_norm": 1.0859375, + "learning_rate": 1.8923076923076924e-06, + "loss": 1.6269, + "step": 163100 + }, + { + "epoch": 1.0372837403231343, + "grad_norm": 0.78125, + "learning_rate": 1.8871794871794875e-06, + "loss": 1.6147, + "step": 163200 + }, + { + "epoch": 1.037919330850293, + "grad_norm": 0.7890625, + "learning_rate": 1.8820512820512822e-06, + "loss": 1.6123, + "step": 163300 + }, + { + "epoch": 1.0385549213774519, + "grad_norm": 1.0078125, + "learning_rate": 1.876923076923077e-06, + "loss": 1.6011, + "step": 163400 + }, + { + "epoch": 1.0391905119046105, + "grad_norm": 1.0234375, + "learning_rate": 1.871794871794872e-06, + "loss": 1.6291, + "step": 163500 + }, + { + "epoch": 1.0398261024317694, + "grad_norm": 1.1640625, + "learning_rate": 1.8666666666666669e-06, + "loss": 1.62, + "step": 163600 + }, + { + "epoch": 1.040461692958928, + "grad_norm": 0.84375, + "learning_rate": 1.8615384615384616e-06, + "loss": 1.6134, + "step": 163700 + }, + { + "epoch": 1.041097283486087, + "grad_norm": 0.640625, + "learning_rate": 1.8564102564102565e-06, + "loss": 1.6198, + "step": 163800 + }, + { + "epoch": 1.0417328740132457, + "grad_norm": 1.3828125, + "learning_rate": 1.8512820512820516e-06, + "loss": 1.624, + "step": 163900 + }, + { + "epoch": 1.0423684645404045, + "grad_norm": 0.66796875, + "learning_rate": 1.8461538461538465e-06, + "loss": 1.6289, + "step": 164000 + }, + { + "epoch": 1.0430040550675632, + "grad_norm": 1.109375, + "learning_rate": 1.8410256410256411e-06, + "loss": 1.6283, + "step": 164100 + }, + { + "epoch": 1.043639645594722, + "grad_norm": 0.75, + "learning_rate": 1.835897435897436e-06, + "loss": 1.63, + "step": 164200 + }, + { + "epoch": 1.0442752361218808, + "grad_norm": 1.0390625, + "learning_rate": 1.830769230769231e-06, + "loss": 1.6104, + "step": 164300 + }, + { + "epoch": 1.0449108266490397, + "grad_norm": 0.83203125, + "learning_rate": 1.8256410256410258e-06, + "loss": 1.6221, + "step": 164400 + }, + { + "epoch": 1.0455464171761983, + "grad_norm": 1.0234375, + "learning_rate": 1.8205128205128205e-06, + "loss": 1.6318, + "step": 164500 + }, + { + "epoch": 1.0461820077033572, + "grad_norm": 0.98046875, + "learning_rate": 1.8153846153846154e-06, + "loss": 1.6349, + "step": 164600 + }, + { + "epoch": 1.046817598230516, + "grad_norm": 1.1015625, + "learning_rate": 1.8102564102564105e-06, + "loss": 1.6375, + "step": 164700 + }, + { + "epoch": 1.0474531887576748, + "grad_norm": 1.0546875, + "learning_rate": 1.8051282051282054e-06, + "loss": 1.6247, + "step": 164800 + }, + { + "epoch": 1.0480887792848335, + "grad_norm": 0.75, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.6145, + "step": 164900 + }, + { + "epoch": 1.0487243698119924, + "grad_norm": 0.60546875, + "learning_rate": 1.794871794871795e-06, + "loss": 1.6266, + "step": 165000 + }, + { + "epoch": 1.049359960339151, + "grad_norm": 0.90234375, + "learning_rate": 1.78974358974359e-06, + "loss": 1.6338, + "step": 165100 + }, + { + "epoch": 1.04999555086631, + "grad_norm": 1.203125, + "learning_rate": 1.7846153846153846e-06, + "loss": 1.6246, + "step": 165200 + }, + { + "epoch": 1.0506311413934686, + "grad_norm": 1.0390625, + "learning_rate": 1.7794871794871795e-06, + "loss": 1.6204, + "step": 165300 + }, + { + "epoch": 1.0512667319206275, + "grad_norm": 0.80859375, + "learning_rate": 1.7743589743589746e-06, + "loss": 1.6184, + "step": 165400 + }, + { + "epoch": 1.0519023224477861, + "grad_norm": 0.67578125, + "learning_rate": 1.7692307692307695e-06, + "loss": 1.6243, + "step": 165500 + }, + { + "epoch": 1.052537912974945, + "grad_norm": 1.015625, + "learning_rate": 1.7641025641025642e-06, + "loss": 1.623, + "step": 165600 + }, + { + "epoch": 1.0531735035021037, + "grad_norm": 1.109375, + "learning_rate": 1.7589743589743591e-06, + "loss": 1.6189, + "step": 165700 + }, + { + "epoch": 1.0538090940292626, + "grad_norm": 1.2578125, + "learning_rate": 1.753846153846154e-06, + "loss": 1.6277, + "step": 165800 + }, + { + "epoch": 1.0544446845564215, + "grad_norm": 0.91015625, + "learning_rate": 1.748717948717949e-06, + "loss": 1.6033, + "step": 165900 + }, + { + "epoch": 1.0550802750835802, + "grad_norm": 0.99609375, + "learning_rate": 1.7435897435897436e-06, + "loss": 1.6138, + "step": 166000 + }, + { + "epoch": 1.0557158656107388, + "grad_norm": 0.80078125, + "learning_rate": 1.7384615384615385e-06, + "loss": 1.6237, + "step": 166100 + }, + { + "epoch": 1.0563514561378977, + "grad_norm": 0.953125, + "learning_rate": 1.7333333333333336e-06, + "loss": 1.6208, + "step": 166200 + }, + { + "epoch": 1.0569870466650566, + "grad_norm": 0.94921875, + "learning_rate": 1.7282051282051285e-06, + "loss": 1.6214, + "step": 166300 + }, + { + "epoch": 1.0576226371922153, + "grad_norm": 0.7734375, + "learning_rate": 1.7230769230769232e-06, + "loss": 1.6138, + "step": 166400 + }, + { + "epoch": 1.0582582277193742, + "grad_norm": 0.74609375, + "learning_rate": 1.717948717948718e-06, + "loss": 1.6093, + "step": 166500 + }, + { + "epoch": 1.0588938182465328, + "grad_norm": 1.0, + "learning_rate": 1.712820512820513e-06, + "loss": 1.6212, + "step": 166600 + }, + { + "epoch": 1.0595294087736917, + "grad_norm": 0.828125, + "learning_rate": 1.7076923076923077e-06, + "loss": 1.6342, + "step": 166700 + }, + { + "epoch": 1.0601649993008504, + "grad_norm": 0.5859375, + "learning_rate": 1.7025641025641026e-06, + "loss": 1.6263, + "step": 166800 + }, + { + "epoch": 1.0608005898280093, + "grad_norm": 1.125, + "learning_rate": 1.6974358974358977e-06, + "loss": 1.6238, + "step": 166900 + }, + { + "epoch": 1.061436180355168, + "grad_norm": 0.91015625, + "learning_rate": 1.6923076923076926e-06, + "loss": 1.5956, + "step": 167000 + }, + { + "epoch": 1.0620717708823268, + "grad_norm": 0.79296875, + "learning_rate": 1.6871794871794873e-06, + "loss": 1.6477, + "step": 167100 + }, + { + "epoch": 1.0627073614094855, + "grad_norm": 1.234375, + "learning_rate": 1.6820512820512822e-06, + "loss": 1.6159, + "step": 167200 + }, + { + "epoch": 1.0633429519366444, + "grad_norm": 0.87890625, + "learning_rate": 1.676923076923077e-06, + "loss": 1.6345, + "step": 167300 + }, + { + "epoch": 1.063978542463803, + "grad_norm": 0.6953125, + "learning_rate": 1.671794871794872e-06, + "loss": 1.6316, + "step": 167400 + }, + { + "epoch": 1.064614132990962, + "grad_norm": 1.03125, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.6155, + "step": 167500 + }, + { + "epoch": 1.0652497235181206, + "grad_norm": 1.3125, + "learning_rate": 1.6615384615384616e-06, + "loss": 1.6145, + "step": 167600 + }, + { + "epoch": 1.0658853140452795, + "grad_norm": 0.7265625, + "learning_rate": 1.6564102564102567e-06, + "loss": 1.626, + "step": 167700 + }, + { + "epoch": 1.0665209045724382, + "grad_norm": 1.0390625, + "learning_rate": 1.6512820512820516e-06, + "loss": 1.6213, + "step": 167800 + }, + { + "epoch": 1.067156495099597, + "grad_norm": 0.875, + "learning_rate": 1.6461538461538463e-06, + "loss": 1.6201, + "step": 167900 + }, + { + "epoch": 1.0677920856267558, + "grad_norm": 0.69140625, + "learning_rate": 1.6410256410256412e-06, + "loss": 1.6231, + "step": 168000 + }, + { + "epoch": 1.0684276761539147, + "grad_norm": 0.82421875, + "learning_rate": 1.635897435897436e-06, + "loss": 1.6285, + "step": 168100 + }, + { + "epoch": 1.0690632666810733, + "grad_norm": 1.046875, + "learning_rate": 1.6307692307692307e-06, + "loss": 1.628, + "step": 168200 + }, + { + "epoch": 1.0696988572082322, + "grad_norm": 1.3125, + "learning_rate": 1.6256410256410256e-06, + "loss": 1.625, + "step": 168300 + }, + { + "epoch": 1.0703344477353909, + "grad_norm": 0.92578125, + "learning_rate": 1.6205128205128208e-06, + "loss": 1.6231, + "step": 168400 + }, + { + "epoch": 1.0709700382625498, + "grad_norm": 0.8125, + "learning_rate": 1.6153846153846157e-06, + "loss": 1.6204, + "step": 168500 + }, + { + "epoch": 1.0716056287897084, + "grad_norm": 1.2734375, + "learning_rate": 1.6102564102564103e-06, + "loss": 1.6307, + "step": 168600 + }, + { + "epoch": 1.0722412193168673, + "grad_norm": 0.97265625, + "learning_rate": 1.6051282051282052e-06, + "loss": 1.6235, + "step": 168700 + }, + { + "epoch": 1.072876809844026, + "grad_norm": 1.046875, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.6222, + "step": 168800 + }, + { + "epoch": 1.073512400371185, + "grad_norm": 1.03125, + "learning_rate": 1.594871794871795e-06, + "loss": 1.6281, + "step": 168900 + }, + { + "epoch": 1.0741479908983436, + "grad_norm": 1.1640625, + "learning_rate": 1.5897435897435897e-06, + "loss": 1.6216, + "step": 169000 + }, + { + "epoch": 1.0747835814255025, + "grad_norm": 1.34375, + "learning_rate": 1.5846153846153846e-06, + "loss": 1.63, + "step": 169100 + }, + { + "epoch": 1.0754191719526611, + "grad_norm": 0.9609375, + "learning_rate": 1.5794871794871797e-06, + "loss": 1.617, + "step": 169200 + }, + { + "epoch": 1.07605476247982, + "grad_norm": 0.890625, + "learning_rate": 1.5743589743589746e-06, + "loss": 1.6236, + "step": 169300 + }, + { + "epoch": 1.0766903530069787, + "grad_norm": 0.921875, + "learning_rate": 1.5692307692307693e-06, + "loss": 1.6239, + "step": 169400 + }, + { + "epoch": 1.0773259435341376, + "grad_norm": 0.93359375, + "learning_rate": 1.5641025641025642e-06, + "loss": 1.6126, + "step": 169500 + }, + { + "epoch": 1.0779615340612962, + "grad_norm": 0.796875, + "learning_rate": 1.5589743589743591e-06, + "loss": 1.6349, + "step": 169600 + }, + { + "epoch": 1.0785971245884551, + "grad_norm": 0.8046875, + "learning_rate": 1.5538461538461538e-06, + "loss": 1.6318, + "step": 169700 + }, + { + "epoch": 1.079232715115614, + "grad_norm": 1.0390625, + "learning_rate": 1.5487179487179487e-06, + "loss": 1.6238, + "step": 169800 + }, + { + "epoch": 1.0798683056427727, + "grad_norm": 1.2734375, + "learning_rate": 1.5435897435897438e-06, + "loss": 1.6289, + "step": 169900 + }, + { + "epoch": 1.0805038961699314, + "grad_norm": 1.0390625, + "learning_rate": 1.5384615384615387e-06, + "loss": 1.6279, + "step": 170000 + }, + { + "epoch": 1.0811394866970903, + "grad_norm": 0.7578125, + "learning_rate": 1.5333333333333334e-06, + "loss": 1.6226, + "step": 170100 + }, + { + "epoch": 1.0817750772242491, + "grad_norm": 0.85546875, + "learning_rate": 1.5282051282051283e-06, + "loss": 1.621, + "step": 170200 + }, + { + "epoch": 1.0824106677514078, + "grad_norm": 0.7421875, + "learning_rate": 1.5230769230769232e-06, + "loss": 1.6347, + "step": 170300 + }, + { + "epoch": 1.0830462582785667, + "grad_norm": 1.4296875, + "learning_rate": 1.517948717948718e-06, + "loss": 1.6362, + "step": 170400 + }, + { + "epoch": 1.0836818488057254, + "grad_norm": 0.72265625, + "learning_rate": 1.5128205128205128e-06, + "loss": 1.6452, + "step": 170500 + }, + { + "epoch": 1.0843174393328843, + "grad_norm": 1.1875, + "learning_rate": 1.5076923076923077e-06, + "loss": 1.6087, + "step": 170600 + }, + { + "epoch": 1.084953029860043, + "grad_norm": 1.109375, + "learning_rate": 1.5025641025641028e-06, + "loss": 1.614, + "step": 170700 + }, + { + "epoch": 1.0855886203872018, + "grad_norm": 1.4140625, + "learning_rate": 1.4974358974358977e-06, + "loss": 1.6244, + "step": 170800 + }, + { + "epoch": 1.0862242109143605, + "grad_norm": 0.78125, + "learning_rate": 1.4923076923076924e-06, + "loss": 1.6118, + "step": 170900 + }, + { + "epoch": 1.0868598014415194, + "grad_norm": 0.83203125, + "learning_rate": 1.4871794871794873e-06, + "loss": 1.62, + "step": 171000 + }, + { + "epoch": 1.087495391968678, + "grad_norm": 0.60546875, + "learning_rate": 1.4820512820512822e-06, + "loss": 1.6146, + "step": 171100 + }, + { + "epoch": 1.088130982495837, + "grad_norm": 0.59765625, + "learning_rate": 1.476923076923077e-06, + "loss": 1.6233, + "step": 171200 + }, + { + "epoch": 1.0887665730229956, + "grad_norm": 0.69921875, + "learning_rate": 1.4717948717948718e-06, + "loss": 1.607, + "step": 171300 + }, + { + "epoch": 1.0894021635501545, + "grad_norm": 0.8125, + "learning_rate": 1.4666666666666669e-06, + "loss": 1.6358, + "step": 171400 + }, + { + "epoch": 1.0900377540773132, + "grad_norm": 0.921875, + "learning_rate": 1.4615384615384618e-06, + "loss": 1.6165, + "step": 171500 + }, + { + "epoch": 1.090673344604472, + "grad_norm": 1.4375, + "learning_rate": 1.4564102564102565e-06, + "loss": 1.624, + "step": 171600 + }, + { + "epoch": 1.0913089351316307, + "grad_norm": 0.6328125, + "learning_rate": 1.4512820512820514e-06, + "loss": 1.6289, + "step": 171700 + }, + { + "epoch": 1.0919445256587896, + "grad_norm": 1.2578125, + "learning_rate": 1.4461538461538463e-06, + "loss": 1.6245, + "step": 171800 + }, + { + "epoch": 1.0925801161859483, + "grad_norm": 1.3046875, + "learning_rate": 1.4410256410256412e-06, + "loss": 1.635, + "step": 171900 + }, + { + "epoch": 1.0932157067131072, + "grad_norm": 0.69140625, + "learning_rate": 1.4358974358974359e-06, + "loss": 1.6379, + "step": 172000 + }, + { + "epoch": 1.0938512972402659, + "grad_norm": 1.0234375, + "learning_rate": 1.4307692307692308e-06, + "loss": 1.6384, + "step": 172100 + }, + { + "epoch": 1.0944868877674248, + "grad_norm": 0.78125, + "learning_rate": 1.4256410256410259e-06, + "loss": 1.6085, + "step": 172200 + }, + { + "epoch": 1.0951224782945834, + "grad_norm": 0.984375, + "learning_rate": 1.4205128205128208e-06, + "loss": 1.6252, + "step": 172300 + }, + { + "epoch": 1.0957580688217423, + "grad_norm": 0.78125, + "learning_rate": 1.4153846153846155e-06, + "loss": 1.6271, + "step": 172400 + }, + { + "epoch": 1.096393659348901, + "grad_norm": 0.953125, + "learning_rate": 1.4102564102564104e-06, + "loss": 1.6031, + "step": 172500 + }, + { + "epoch": 1.0970292498760599, + "grad_norm": 2.21875, + "learning_rate": 1.4051282051282052e-06, + "loss": 1.6219, + "step": 172600 + }, + { + "epoch": 1.0976648404032185, + "grad_norm": 1.3203125, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.629, + "step": 172700 + }, + { + "epoch": 1.0983004309303774, + "grad_norm": 1.15625, + "learning_rate": 1.3948717948717948e-06, + "loss": 1.6137, + "step": 172800 + }, + { + "epoch": 1.098936021457536, + "grad_norm": 1.40625, + "learning_rate": 1.38974358974359e-06, + "loss": 1.6134, + "step": 172900 + }, + { + "epoch": 1.099571611984695, + "grad_norm": 0.80078125, + "learning_rate": 1.3846153846153848e-06, + "loss": 1.6247, + "step": 173000 + }, + { + "epoch": 1.1002072025118537, + "grad_norm": 0.734375, + "learning_rate": 1.3794871794871795e-06, + "loss": 1.6231, + "step": 173100 + }, + { + "epoch": 1.1008427930390126, + "grad_norm": 1.1171875, + "learning_rate": 1.3743589743589744e-06, + "loss": 1.6254, + "step": 173200 + }, + { + "epoch": 1.1014783835661712, + "grad_norm": 1.0234375, + "learning_rate": 1.3692307692307693e-06, + "loss": 1.6358, + "step": 173300 + }, + { + "epoch": 1.1021139740933301, + "grad_norm": 0.90234375, + "learning_rate": 1.3641025641025642e-06, + "loss": 1.6345, + "step": 173400 + }, + { + "epoch": 1.1027495646204888, + "grad_norm": 1.3359375, + "learning_rate": 1.358974358974359e-06, + "loss": 1.6374, + "step": 173500 + }, + { + "epoch": 1.1033851551476477, + "grad_norm": 1.0078125, + "learning_rate": 1.3538461538461538e-06, + "loss": 1.6277, + "step": 173600 + }, + { + "epoch": 1.1040207456748066, + "grad_norm": 0.94140625, + "learning_rate": 1.348717948717949e-06, + "loss": 1.6246, + "step": 173700 + }, + { + "epoch": 1.1046563362019652, + "grad_norm": 0.84375, + "learning_rate": 1.3435897435897438e-06, + "loss": 1.6221, + "step": 173800 + }, + { + "epoch": 1.1052919267291241, + "grad_norm": 0.765625, + "learning_rate": 1.3384615384615385e-06, + "loss": 1.61, + "step": 173900 + }, + { + "epoch": 1.1059275172562828, + "grad_norm": 0.9140625, + "learning_rate": 1.3333333333333334e-06, + "loss": 1.633, + "step": 174000 + }, + { + "epoch": 1.1065631077834417, + "grad_norm": 0.953125, + "learning_rate": 1.3282051282051283e-06, + "loss": 1.6234, + "step": 174100 + }, + { + "epoch": 1.1071986983106004, + "grad_norm": 1.0703125, + "learning_rate": 1.3230769230769232e-06, + "loss": 1.6222, + "step": 174200 + }, + { + "epoch": 1.1078342888377593, + "grad_norm": 0.89453125, + "learning_rate": 1.317948717948718e-06, + "loss": 1.6104, + "step": 174300 + }, + { + "epoch": 1.108469879364918, + "grad_norm": 1.3671875, + "learning_rate": 1.312820512820513e-06, + "loss": 1.6357, + "step": 174400 + }, + { + "epoch": 1.1091054698920768, + "grad_norm": 0.8203125, + "learning_rate": 1.307692307692308e-06, + "loss": 1.6394, + "step": 174500 + }, + { + "epoch": 1.1097410604192355, + "grad_norm": 0.875, + "learning_rate": 1.3025641025641026e-06, + "loss": 1.6243, + "step": 174600 + }, + { + "epoch": 1.1103766509463944, + "grad_norm": 0.9765625, + "learning_rate": 1.2974358974358975e-06, + "loss": 1.6249, + "step": 174700 + }, + { + "epoch": 1.111012241473553, + "grad_norm": 1.0234375, + "learning_rate": 1.2923076923076924e-06, + "loss": 1.6309, + "step": 174800 + }, + { + "epoch": 1.111647832000712, + "grad_norm": 0.828125, + "learning_rate": 1.2871794871794873e-06, + "loss": 1.6047, + "step": 174900 + }, + { + "epoch": 1.1122834225278706, + "grad_norm": 0.9453125, + "learning_rate": 1.282051282051282e-06, + "loss": 1.6151, + "step": 175000 + }, + { + "epoch": 1.1129190130550295, + "grad_norm": 1.0390625, + "learning_rate": 1.2769230769230769e-06, + "loss": 1.6192, + "step": 175100 + }, + { + "epoch": 1.1135546035821882, + "grad_norm": 0.90234375, + "learning_rate": 1.271794871794872e-06, + "loss": 1.6136, + "step": 175200 + }, + { + "epoch": 1.114190194109347, + "grad_norm": 0.953125, + "learning_rate": 1.2666666666666669e-06, + "loss": 1.6255, + "step": 175300 + }, + { + "epoch": 1.1148257846365057, + "grad_norm": 1.234375, + "learning_rate": 1.2615384615384616e-06, + "loss": 1.6317, + "step": 175400 + }, + { + "epoch": 1.1154613751636646, + "grad_norm": 1.09375, + "learning_rate": 1.2564102564102565e-06, + "loss": 1.6191, + "step": 175500 + }, + { + "epoch": 1.1160969656908233, + "grad_norm": 0.83984375, + "learning_rate": 1.2512820512820514e-06, + "loss": 1.6208, + "step": 175600 + }, + { + "epoch": 1.1167325562179822, + "grad_norm": 0.8046875, + "learning_rate": 1.2461538461538463e-06, + "loss": 1.6289, + "step": 175700 + }, + { + "epoch": 1.1173681467451408, + "grad_norm": 0.76953125, + "learning_rate": 1.2410256410256412e-06, + "loss": 1.632, + "step": 175800 + }, + { + "epoch": 1.1180037372722997, + "grad_norm": 0.78515625, + "learning_rate": 1.235897435897436e-06, + "loss": 1.6146, + "step": 175900 + }, + { + "epoch": 1.1186393277994584, + "grad_norm": 1.1796875, + "learning_rate": 1.230769230769231e-06, + "loss": 1.626, + "step": 176000 + }, + { + "epoch": 1.1192749183266173, + "grad_norm": 0.79296875, + "learning_rate": 1.2256410256410257e-06, + "loss": 1.6242, + "step": 176100 + }, + { + "epoch": 1.119910508853776, + "grad_norm": 1.25, + "learning_rate": 1.2205128205128206e-06, + "loss": 1.6396, + "step": 176200 + }, + { + "epoch": 1.1205460993809349, + "grad_norm": 0.91796875, + "learning_rate": 1.2153846153846155e-06, + "loss": 1.6206, + "step": 176300 + }, + { + "epoch": 1.1211816899080935, + "grad_norm": 0.83984375, + "learning_rate": 1.2102564102564104e-06, + "loss": 1.6212, + "step": 176400 + }, + { + "epoch": 1.1218172804352524, + "grad_norm": 0.75390625, + "learning_rate": 1.2051282051282053e-06, + "loss": 1.6244, + "step": 176500 + }, + { + "epoch": 1.122452870962411, + "grad_norm": 0.78125, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.623, + "step": 176600 + }, + { + "epoch": 1.12308846148957, + "grad_norm": 1.078125, + "learning_rate": 1.194871794871795e-06, + "loss": 1.6183, + "step": 176700 + }, + { + "epoch": 1.1237240520167286, + "grad_norm": 1.1640625, + "learning_rate": 1.1897435897435897e-06, + "loss": 1.622, + "step": 176800 + }, + { + "epoch": 1.1243596425438875, + "grad_norm": 1.5078125, + "learning_rate": 1.1846153846153849e-06, + "loss": 1.6249, + "step": 176900 + }, + { + "epoch": 1.1249952330710462, + "grad_norm": 1.1484375, + "learning_rate": 1.1794871794871795e-06, + "loss": 1.627, + "step": 177000 + }, + { + "epoch": 1.125630823598205, + "grad_norm": 0.7265625, + "learning_rate": 1.1743589743589744e-06, + "loss": 1.6355, + "step": 177100 + }, + { + "epoch": 1.126266414125364, + "grad_norm": 0.85546875, + "learning_rate": 1.1692307692307693e-06, + "loss": 1.6318, + "step": 177200 + }, + { + "epoch": 1.1269020046525227, + "grad_norm": 1.109375, + "learning_rate": 1.1641025641025642e-06, + "loss": 1.6169, + "step": 177300 + }, + { + "epoch": 1.1275375951796813, + "grad_norm": 1.046875, + "learning_rate": 1.1589743589743591e-06, + "loss": 1.6195, + "step": 177400 + }, + { + "epoch": 1.1281731857068402, + "grad_norm": 1.0703125, + "learning_rate": 1.153846153846154e-06, + "loss": 1.6207, + "step": 177500 + }, + { + "epoch": 1.1288087762339991, + "grad_norm": 0.9609375, + "learning_rate": 1.1487179487179487e-06, + "loss": 1.6202, + "step": 177600 + }, + { + "epoch": 1.1294443667611578, + "grad_norm": 0.828125, + "learning_rate": 1.1435897435897436e-06, + "loss": 1.6311, + "step": 177700 + }, + { + "epoch": 1.1300799572883165, + "grad_norm": 1.0625, + "learning_rate": 1.1384615384615385e-06, + "loss": 1.6345, + "step": 177800 + }, + { + "epoch": 1.1307155478154753, + "grad_norm": 0.66796875, + "learning_rate": 1.1333333333333334e-06, + "loss": 1.623, + "step": 177900 + }, + { + "epoch": 1.1313511383426342, + "grad_norm": 0.9140625, + "learning_rate": 1.1282051282051283e-06, + "loss": 1.6246, + "step": 178000 + }, + { + "epoch": 1.131986728869793, + "grad_norm": 1.1484375, + "learning_rate": 1.1230769230769232e-06, + "loss": 1.6302, + "step": 178100 + }, + { + "epoch": 1.1326223193969518, + "grad_norm": 0.84375, + "learning_rate": 1.1179487179487181e-06, + "loss": 1.6337, + "step": 178200 + }, + { + "epoch": 1.1332579099241105, + "grad_norm": 0.78515625, + "learning_rate": 1.1128205128205128e-06, + "loss": 1.6308, + "step": 178300 + }, + { + "epoch": 1.1338935004512694, + "grad_norm": 0.734375, + "learning_rate": 1.107692307692308e-06, + "loss": 1.6263, + "step": 178400 + }, + { + "epoch": 1.134529090978428, + "grad_norm": 0.8359375, + "learning_rate": 1.1025641025641026e-06, + "loss": 1.6238, + "step": 178500 + }, + { + "epoch": 1.135164681505587, + "grad_norm": 1.0390625, + "learning_rate": 1.0974358974358975e-06, + "loss": 1.6269, + "step": 178600 + }, + { + "epoch": 1.1358002720327456, + "grad_norm": 0.8125, + "learning_rate": 1.0923076923076924e-06, + "loss": 1.624, + "step": 178700 + }, + { + "epoch": 1.1364358625599045, + "grad_norm": 0.6328125, + "learning_rate": 1.0871794871794873e-06, + "loss": 1.6185, + "step": 178800 + }, + { + "epoch": 1.1370714530870631, + "grad_norm": 0.70703125, + "learning_rate": 1.0820512820512822e-06, + "loss": 1.612, + "step": 178900 + }, + { + "epoch": 1.137707043614222, + "grad_norm": 0.83984375, + "learning_rate": 1.076923076923077e-06, + "loss": 1.6091, + "step": 179000 + }, + { + "epoch": 1.1383426341413807, + "grad_norm": 1.1953125, + "learning_rate": 1.0717948717948718e-06, + "loss": 1.6232, + "step": 179100 + }, + { + "epoch": 1.1389782246685396, + "grad_norm": 0.8203125, + "learning_rate": 1.066666666666667e-06, + "loss": 1.604, + "step": 179200 + }, + { + "epoch": 1.1396138151956983, + "grad_norm": 0.98046875, + "learning_rate": 1.0615384615384616e-06, + "loss": 1.6217, + "step": 179300 + }, + { + "epoch": 1.1402494057228572, + "grad_norm": 0.83984375, + "learning_rate": 1.0564102564102565e-06, + "loss": 1.6103, + "step": 179400 + }, + { + "epoch": 1.1408849962500158, + "grad_norm": 1.0859375, + "learning_rate": 1.0512820512820514e-06, + "loss": 1.6224, + "step": 179500 + }, + { + "epoch": 1.1415205867771747, + "grad_norm": 1.0234375, + "learning_rate": 1.0461538461538463e-06, + "loss": 1.6232, + "step": 179600 + }, + { + "epoch": 1.1421561773043334, + "grad_norm": 1.0859375, + "learning_rate": 1.0410256410256412e-06, + "loss": 1.6241, + "step": 179700 + }, + { + "epoch": 1.1427917678314923, + "grad_norm": 0.78125, + "learning_rate": 1.0358974358974359e-06, + "loss": 1.616, + "step": 179800 + }, + { + "epoch": 1.143427358358651, + "grad_norm": 0.6953125, + "learning_rate": 1.030769230769231e-06, + "loss": 1.6291, + "step": 179900 + }, + { + "epoch": 1.1440629488858098, + "grad_norm": 0.87109375, + "learning_rate": 1.0256410256410257e-06, + "loss": 1.6228, + "step": 180000 + }, + { + "epoch": 1.1446985394129685, + "grad_norm": 1.03125, + "learning_rate": 1.0205128205128206e-06, + "loss": 1.6178, + "step": 180100 + }, + { + "epoch": 1.1453341299401274, + "grad_norm": 1.1484375, + "learning_rate": 1.0153846153846155e-06, + "loss": 1.6336, + "step": 180200 + }, + { + "epoch": 1.145969720467286, + "grad_norm": 1.0859375, + "learning_rate": 1.0102564102564104e-06, + "loss": 1.6261, + "step": 180300 + }, + { + "epoch": 1.146605310994445, + "grad_norm": 1.1796875, + "learning_rate": 1.0051282051282053e-06, + "loss": 1.6065, + "step": 180400 + }, + { + "epoch": 1.1472409015216036, + "grad_norm": 0.7421875, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.605, + "step": 180500 + }, + { + "epoch": 1.1478764920487625, + "grad_norm": 0.9609375, + "learning_rate": 9.948717948717949e-07, + "loss": 1.6426, + "step": 180600 + }, + { + "epoch": 1.1485120825759214, + "grad_norm": 1.2734375, + "learning_rate": 9.8974358974359e-07, + "loss": 1.6313, + "step": 180700 + }, + { + "epoch": 1.14914767310308, + "grad_norm": 1.109375, + "learning_rate": 9.846153846153847e-07, + "loss": 1.6194, + "step": 180800 + }, + { + "epoch": 1.1497832636302387, + "grad_norm": 0.8125, + "learning_rate": 9.794871794871796e-07, + "loss": 1.6249, + "step": 180900 + }, + { + "epoch": 1.1504188541573976, + "grad_norm": 1.2109375, + "learning_rate": 9.743589743589745e-07, + "loss": 1.6149, + "step": 181000 + }, + { + "epoch": 1.1510544446845565, + "grad_norm": 0.96875, + "learning_rate": 9.692307692307693e-07, + "loss": 1.6307, + "step": 181100 + }, + { + "epoch": 1.1516900352117152, + "grad_norm": 1.0234375, + "learning_rate": 9.641025641025642e-07, + "loss": 1.6369, + "step": 181200 + }, + { + "epoch": 1.1523256257388739, + "grad_norm": 1.1875, + "learning_rate": 9.58974358974359e-07, + "loss": 1.627, + "step": 181300 + }, + { + "epoch": 1.1529612162660328, + "grad_norm": 1.21875, + "learning_rate": 9.53846153846154e-07, + "loss": 1.6305, + "step": 181400 + }, + { + "epoch": 1.1535968067931917, + "grad_norm": 0.7890625, + "learning_rate": 9.487179487179487e-07, + "loss": 1.6156, + "step": 181500 + }, + { + "epoch": 1.1542323973203503, + "grad_norm": 0.98046875, + "learning_rate": 9.435897435897437e-07, + "loss": 1.6276, + "step": 181600 + }, + { + "epoch": 1.1548679878475092, + "grad_norm": 1.0390625, + "learning_rate": 9.384615384615385e-07, + "loss": 1.6395, + "step": 181700 + }, + { + "epoch": 1.1555035783746679, + "grad_norm": 0.81640625, + "learning_rate": 9.333333333333334e-07, + "loss": 1.6233, + "step": 181800 + }, + { + "epoch": 1.1561391689018268, + "grad_norm": 0.765625, + "learning_rate": 9.282051282051282e-07, + "loss": 1.6164, + "step": 181900 + }, + { + "epoch": 1.1567747594289854, + "grad_norm": 1.234375, + "learning_rate": 9.230769230769232e-07, + "loss": 1.6231, + "step": 182000 + }, + { + "epoch": 1.1574103499561443, + "grad_norm": 1.1015625, + "learning_rate": 9.17948717948718e-07, + "loss": 1.6251, + "step": 182100 + }, + { + "epoch": 1.158045940483303, + "grad_norm": 0.796875, + "learning_rate": 9.128205128205129e-07, + "loss": 1.6266, + "step": 182200 + }, + { + "epoch": 1.158681531010462, + "grad_norm": 0.80078125, + "learning_rate": 9.076923076923077e-07, + "loss": 1.6057, + "step": 182300 + }, + { + "epoch": 1.1593171215376206, + "grad_norm": 1.09375, + "learning_rate": 9.025641025641027e-07, + "loss": 1.6329, + "step": 182400 + }, + { + "epoch": 1.1599527120647795, + "grad_norm": 0.734375, + "learning_rate": 8.974358974358975e-07, + "loss": 1.6308, + "step": 182500 + }, + { + "epoch": 1.1605883025919381, + "grad_norm": 0.63671875, + "learning_rate": 8.923076923076923e-07, + "loss": 1.6234, + "step": 182600 + }, + { + "epoch": 1.161223893119097, + "grad_norm": 0.8515625, + "learning_rate": 8.871794871794873e-07, + "loss": 1.6105, + "step": 182700 + }, + { + "epoch": 1.1618594836462557, + "grad_norm": 0.9375, + "learning_rate": 8.820512820512821e-07, + "loss": 1.627, + "step": 182800 + }, + { + "epoch": 1.1624950741734146, + "grad_norm": 0.828125, + "learning_rate": 8.76923076923077e-07, + "loss": 1.622, + "step": 182900 + }, + { + "epoch": 1.1631306647005732, + "grad_norm": 0.76953125, + "learning_rate": 8.717948717948718e-07, + "loss": 1.6267, + "step": 183000 + }, + { + "epoch": 1.1637662552277321, + "grad_norm": 0.984375, + "learning_rate": 8.666666666666668e-07, + "loss": 1.628, + "step": 183100 + }, + { + "epoch": 1.1644018457548908, + "grad_norm": 0.82421875, + "learning_rate": 8.615384615384616e-07, + "loss": 1.6225, + "step": 183200 + }, + { + "epoch": 1.1650374362820497, + "grad_norm": 1.0078125, + "learning_rate": 8.564102564102565e-07, + "loss": 1.6211, + "step": 183300 + }, + { + "epoch": 1.1656730268092084, + "grad_norm": 0.9765625, + "learning_rate": 8.512820512820513e-07, + "loss": 1.6209, + "step": 183400 + }, + { + "epoch": 1.1663086173363673, + "grad_norm": 1.3046875, + "learning_rate": 8.461538461538463e-07, + "loss": 1.6237, + "step": 183500 + }, + { + "epoch": 1.166944207863526, + "grad_norm": 0.60546875, + "learning_rate": 8.410256410256411e-07, + "loss": 1.6333, + "step": 183600 + }, + { + "epoch": 1.1675797983906848, + "grad_norm": 0.8203125, + "learning_rate": 8.35897435897436e-07, + "loss": 1.6243, + "step": 183700 + }, + { + "epoch": 1.1682153889178435, + "grad_norm": 1.0703125, + "learning_rate": 8.307692307692308e-07, + "loss": 1.6183, + "step": 183800 + }, + { + "epoch": 1.1688509794450024, + "grad_norm": 0.73828125, + "learning_rate": 8.256410256410258e-07, + "loss": 1.6256, + "step": 183900 + }, + { + "epoch": 1.169486569972161, + "grad_norm": 0.99609375, + "learning_rate": 8.205128205128206e-07, + "loss": 1.6199, + "step": 184000 + }, + { + "epoch": 1.17012216049932, + "grad_norm": 1.3359375, + "learning_rate": 8.153846153846154e-07, + "loss": 1.622, + "step": 184100 + }, + { + "epoch": 1.1707577510264786, + "grad_norm": 1.234375, + "learning_rate": 8.102564102564104e-07, + "loss": 1.6335, + "step": 184200 + }, + { + "epoch": 1.1713933415536375, + "grad_norm": 0.71875, + "learning_rate": 8.051282051282052e-07, + "loss": 1.6162, + "step": 184300 + }, + { + "epoch": 1.1720289320807962, + "grad_norm": 1.203125, + "learning_rate": 8.000000000000001e-07, + "loss": 1.6326, + "step": 184400 + }, + { + "epoch": 1.172664522607955, + "grad_norm": 1.28125, + "learning_rate": 7.948717948717949e-07, + "loss": 1.6317, + "step": 184500 + }, + { + "epoch": 1.173300113135114, + "grad_norm": 1.234375, + "learning_rate": 7.897435897435899e-07, + "loss": 1.626, + "step": 184600 + }, + { + "epoch": 1.1739357036622726, + "grad_norm": 1.1171875, + "learning_rate": 7.846153846153847e-07, + "loss": 1.6298, + "step": 184700 + }, + { + "epoch": 1.1745712941894313, + "grad_norm": 1.0703125, + "learning_rate": 7.794871794871796e-07, + "loss": 1.6062, + "step": 184800 + }, + { + "epoch": 1.1752068847165902, + "grad_norm": 1.0625, + "learning_rate": 7.743589743589744e-07, + "loss": 1.6121, + "step": 184900 + }, + { + "epoch": 1.175842475243749, + "grad_norm": 0.6015625, + "learning_rate": 7.692307692307694e-07, + "loss": 1.6162, + "step": 185000 + }, + { + "epoch": 1.1764780657709077, + "grad_norm": 1.234375, + "learning_rate": 7.641025641025642e-07, + "loss": 1.6225, + "step": 185100 + }, + { + "epoch": 1.1771136562980664, + "grad_norm": 0.94140625, + "learning_rate": 7.58974358974359e-07, + "loss": 1.6196, + "step": 185200 + }, + { + "epoch": 1.1777492468252253, + "grad_norm": 0.8125, + "learning_rate": 7.538461538461538e-07, + "loss": 1.6323, + "step": 185300 + }, + { + "epoch": 1.1783848373523842, + "grad_norm": 1.03125, + "learning_rate": 7.487179487179488e-07, + "loss": 1.6396, + "step": 185400 + }, + { + "epoch": 1.1790204278795429, + "grad_norm": 0.9375, + "learning_rate": 7.435897435897436e-07, + "loss": 1.6313, + "step": 185500 + }, + { + "epoch": 1.1796560184067018, + "grad_norm": 0.75, + "learning_rate": 7.384615384615385e-07, + "loss": 1.6339, + "step": 185600 + }, + { + "epoch": 1.1802916089338604, + "grad_norm": 0.76953125, + "learning_rate": 7.333333333333334e-07, + "loss": 1.6106, + "step": 185700 + }, + { + "epoch": 1.1809271994610193, + "grad_norm": 0.66796875, + "learning_rate": 7.282051282051282e-07, + "loss": 1.6374, + "step": 185800 + }, + { + "epoch": 1.181562789988178, + "grad_norm": 0.94921875, + "learning_rate": 7.230769230769231e-07, + "loss": 1.6183, + "step": 185900 + }, + { + "epoch": 1.1821983805153369, + "grad_norm": 1.0703125, + "learning_rate": 7.179487179487179e-07, + "loss": 1.6314, + "step": 186000 + }, + { + "epoch": 1.1828339710424955, + "grad_norm": 0.9140625, + "learning_rate": 7.128205128205129e-07, + "loss": 1.6269, + "step": 186100 + }, + { + "epoch": 1.1834695615696544, + "grad_norm": 1.2890625, + "learning_rate": 7.076923076923077e-07, + "loss": 1.6146, + "step": 186200 + }, + { + "epoch": 1.184105152096813, + "grad_norm": 1.09375, + "learning_rate": 7.025641025641026e-07, + "loss": 1.6253, + "step": 186300 + }, + { + "epoch": 1.184740742623972, + "grad_norm": 0.65625, + "learning_rate": 6.974358974358974e-07, + "loss": 1.6206, + "step": 186400 + }, + { + "epoch": 1.1853763331511307, + "grad_norm": 1.3125, + "learning_rate": 6.923076923076924e-07, + "loss": 1.6378, + "step": 186500 + }, + { + "epoch": 1.1860119236782896, + "grad_norm": 0.89453125, + "learning_rate": 6.871794871794872e-07, + "loss": 1.6195, + "step": 186600 + }, + { + "epoch": 1.1866475142054482, + "grad_norm": 1.140625, + "learning_rate": 6.820512820512821e-07, + "loss": 1.6105, + "step": 186700 + }, + { + "epoch": 1.1872831047326071, + "grad_norm": 0.99609375, + "learning_rate": 6.769230769230769e-07, + "loss": 1.634, + "step": 186800 + }, + { + "epoch": 1.1879186952597658, + "grad_norm": 0.86328125, + "learning_rate": 6.717948717948719e-07, + "loss": 1.6297, + "step": 186900 + }, + { + "epoch": 1.1885542857869247, + "grad_norm": 0.765625, + "learning_rate": 6.666666666666667e-07, + "loss": 1.6124, + "step": 187000 + }, + { + "epoch": 1.1891898763140833, + "grad_norm": 0.734375, + "learning_rate": 6.615384615384616e-07, + "loss": 1.6296, + "step": 187100 + }, + { + "epoch": 1.1898254668412422, + "grad_norm": 1.0234375, + "learning_rate": 6.564102564102565e-07, + "loss": 1.6216, + "step": 187200 + }, + { + "epoch": 1.190461057368401, + "grad_norm": 1.0234375, + "learning_rate": 6.512820512820513e-07, + "loss": 1.6228, + "step": 187300 + }, + { + "epoch": 1.1910966478955598, + "grad_norm": 1.2890625, + "learning_rate": 6.461538461538462e-07, + "loss": 1.6217, + "step": 187400 + }, + { + "epoch": 1.1917322384227185, + "grad_norm": 1.0859375, + "learning_rate": 6.41025641025641e-07, + "loss": 1.6235, + "step": 187500 + }, + { + "epoch": 1.1923678289498774, + "grad_norm": 1.046875, + "learning_rate": 6.35897435897436e-07, + "loss": 1.6218, + "step": 187600 + }, + { + "epoch": 1.193003419477036, + "grad_norm": 0.984375, + "learning_rate": 6.307692307692308e-07, + "loss": 1.6405, + "step": 187700 + }, + { + "epoch": 1.193639010004195, + "grad_norm": 1.2890625, + "learning_rate": 6.256410256410257e-07, + "loss": 1.6174, + "step": 187800 + }, + { + "epoch": 1.1942746005313536, + "grad_norm": 0.50390625, + "learning_rate": 6.205128205128206e-07, + "loss": 1.6174, + "step": 187900 + }, + { + "epoch": 1.1949101910585125, + "grad_norm": 0.69921875, + "learning_rate": 6.153846153846155e-07, + "loss": 1.6336, + "step": 188000 + }, + { + "epoch": 1.1955457815856712, + "grad_norm": 0.84765625, + "learning_rate": 6.102564102564103e-07, + "loss": 1.6303, + "step": 188100 + }, + { + "epoch": 1.19618137211283, + "grad_norm": 1.09375, + "learning_rate": 6.051282051282052e-07, + "loss": 1.629, + "step": 188200 + }, + { + "epoch": 1.1968169626399887, + "grad_norm": 0.5859375, + "learning_rate": 6.000000000000001e-07, + "loss": 1.6121, + "step": 188300 + }, + { + "epoch": 1.1974525531671476, + "grad_norm": 1.0703125, + "learning_rate": 5.948717948717949e-07, + "loss": 1.6173, + "step": 188400 + }, + { + "epoch": 1.1980881436943065, + "grad_norm": 0.69140625, + "learning_rate": 5.897435897435898e-07, + "loss": 1.6073, + "step": 188500 + }, + { + "epoch": 1.1987237342214652, + "grad_norm": 0.85546875, + "learning_rate": 5.846153846153847e-07, + "loss": 1.6165, + "step": 188600 + }, + { + "epoch": 1.1993593247486238, + "grad_norm": 1.1171875, + "learning_rate": 5.794871794871796e-07, + "loss": 1.6199, + "step": 188700 + }, + { + "epoch": 1.1999949152757827, + "grad_norm": 0.8046875, + "learning_rate": 5.743589743589744e-07, + "loss": 1.6232, + "step": 188800 + }, + { + "epoch": 1.2006305058029416, + "grad_norm": 0.86328125, + "learning_rate": 5.692307692307693e-07, + "loss": 1.6187, + "step": 188900 + }, + { + "epoch": 1.2012660963301003, + "grad_norm": 0.99609375, + "learning_rate": 5.641025641025642e-07, + "loss": 1.6207, + "step": 189000 + }, + { + "epoch": 1.201901686857259, + "grad_norm": 0.8125, + "learning_rate": 5.589743589743591e-07, + "loss": 1.613, + "step": 189100 + }, + { + "epoch": 1.2025372773844178, + "grad_norm": 0.89453125, + "learning_rate": 5.53846153846154e-07, + "loss": 1.6118, + "step": 189200 + }, + { + "epoch": 1.2031728679115767, + "grad_norm": 0.68359375, + "learning_rate": 5.487179487179488e-07, + "loss": 1.6395, + "step": 189300 + }, + { + "epoch": 1.2038084584387354, + "grad_norm": 0.82421875, + "learning_rate": 5.435897435897437e-07, + "loss": 1.6283, + "step": 189400 + }, + { + "epoch": 1.2044440489658943, + "grad_norm": 1.1953125, + "learning_rate": 5.384615384615386e-07, + "loss": 1.639, + "step": 189500 + }, + { + "epoch": 1.205079639493053, + "grad_norm": 1.1328125, + "learning_rate": 5.333333333333335e-07, + "loss": 1.6224, + "step": 189600 + }, + { + "epoch": 1.2057152300202119, + "grad_norm": 1.109375, + "learning_rate": 5.282051282051282e-07, + "loss": 1.6401, + "step": 189700 + }, + { + "epoch": 1.2063508205473705, + "grad_norm": 0.7421875, + "learning_rate": 5.230769230769231e-07, + "loss": 1.6279, + "step": 189800 + }, + { + "epoch": 1.2069864110745294, + "grad_norm": 1.125, + "learning_rate": 5.179487179487179e-07, + "loss": 1.6324, + "step": 189900 + }, + { + "epoch": 1.207622001601688, + "grad_norm": 0.76171875, + "learning_rate": 5.128205128205128e-07, + "loss": 1.6146, + "step": 190000 + }, + { + "epoch": 1.208257592128847, + "grad_norm": 1.0390625, + "learning_rate": 5.076923076923077e-07, + "loss": 1.6171, + "step": 190100 + }, + { + "epoch": 1.2088931826560056, + "grad_norm": 1.0078125, + "learning_rate": 5.025641025641026e-07, + "loss": 1.6087, + "step": 190200 + }, + { + "epoch": 1.2095287731831645, + "grad_norm": 0.8984375, + "learning_rate": 4.974358974358974e-07, + "loss": 1.6004, + "step": 190300 + }, + { + "epoch": 1.2101643637103232, + "grad_norm": 1.2109375, + "learning_rate": 4.923076923076923e-07, + "loss": 1.6331, + "step": 190400 + }, + { + "epoch": 1.210799954237482, + "grad_norm": 0.91015625, + "learning_rate": 4.871794871794872e-07, + "loss": 1.6412, + "step": 190500 + }, + { + "epoch": 1.2114355447646408, + "grad_norm": 0.66015625, + "learning_rate": 4.820512820512821e-07, + "loss": 1.6085, + "step": 190600 + }, + { + "epoch": 1.2120711352917997, + "grad_norm": 0.65234375, + "learning_rate": 4.76923076923077e-07, + "loss": 1.6188, + "step": 190700 + }, + { + "epoch": 1.2127067258189583, + "grad_norm": 0.85546875, + "learning_rate": 4.7179487179487187e-07, + "loss": 1.6264, + "step": 190800 + }, + { + "epoch": 1.2133423163461172, + "grad_norm": 0.7421875, + "learning_rate": 4.666666666666667e-07, + "loss": 1.6233, + "step": 190900 + }, + { + "epoch": 1.213977906873276, + "grad_norm": 1.0390625, + "learning_rate": 4.615384615384616e-07, + "loss": 1.6342, + "step": 191000 + }, + { + "epoch": 1.2146134974004348, + "grad_norm": 0.6484375, + "learning_rate": 4.5641025641025646e-07, + "loss": 1.6317, + "step": 191100 + }, + { + "epoch": 1.2152490879275935, + "grad_norm": 0.8828125, + "learning_rate": 4.5128205128205136e-07, + "loss": 1.6204, + "step": 191200 + }, + { + "epoch": 1.2158846784547523, + "grad_norm": 1.015625, + "learning_rate": 4.4615384615384615e-07, + "loss": 1.6211, + "step": 191300 + }, + { + "epoch": 1.216520268981911, + "grad_norm": 1.15625, + "learning_rate": 4.4102564102564105e-07, + "loss": 1.6256, + "step": 191400 + }, + { + "epoch": 1.21715585950907, + "grad_norm": 0.94921875, + "learning_rate": 4.358974358974359e-07, + "loss": 1.6148, + "step": 191500 + }, + { + "epoch": 1.2177914500362286, + "grad_norm": 0.7265625, + "learning_rate": 4.307692307692308e-07, + "loss": 1.6202, + "step": 191600 + }, + { + "epoch": 1.2184270405633875, + "grad_norm": 0.87890625, + "learning_rate": 4.2564102564102564e-07, + "loss": 1.627, + "step": 191700 + }, + { + "epoch": 1.2190626310905461, + "grad_norm": 0.890625, + "learning_rate": 4.2051282051282054e-07, + "loss": 1.6217, + "step": 191800 + }, + { + "epoch": 1.219698221617705, + "grad_norm": 1.0703125, + "learning_rate": 4.153846153846154e-07, + "loss": 1.6366, + "step": 191900 + }, + { + "epoch": 1.220333812144864, + "grad_norm": 1.0078125, + "learning_rate": 4.102564102564103e-07, + "loss": 1.6388, + "step": 192000 + }, + { + "epoch": 1.2209694026720226, + "grad_norm": 1.3984375, + "learning_rate": 4.051282051282052e-07, + "loss": 1.6081, + "step": 192100 + }, + { + "epoch": 1.2216049931991813, + "grad_norm": 0.90625, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.6327, + "step": 192200 + }, + { + "epoch": 1.2222405837263401, + "grad_norm": 1.0390625, + "learning_rate": 3.9487179487179493e-07, + "loss": 1.6144, + "step": 192300 + }, + { + "epoch": 1.222876174253499, + "grad_norm": 0.8984375, + "learning_rate": 3.897435897435898e-07, + "loss": 1.6078, + "step": 192400 + }, + { + "epoch": 1.2235117647806577, + "grad_norm": 1.1171875, + "learning_rate": 3.846153846153847e-07, + "loss": 1.635, + "step": 192500 + }, + { + "epoch": 1.2241473553078164, + "grad_norm": 0.79296875, + "learning_rate": 3.794871794871795e-07, + "loss": 1.6114, + "step": 192600 + }, + { + "epoch": 1.2247829458349753, + "grad_norm": 0.9296875, + "learning_rate": 3.743589743589744e-07, + "loss": 1.629, + "step": 192700 + }, + { + "epoch": 1.2254185363621342, + "grad_norm": 0.9921875, + "learning_rate": 3.6923076923076927e-07, + "loss": 1.6034, + "step": 192800 + }, + { + "epoch": 1.2260541268892928, + "grad_norm": 1.296875, + "learning_rate": 3.641025641025641e-07, + "loss": 1.6224, + "step": 192900 + }, + { + "epoch": 1.2266897174164517, + "grad_norm": 0.9765625, + "learning_rate": 3.5897435897435896e-07, + "loss": 1.6244, + "step": 193000 + }, + { + "epoch": 1.2273253079436104, + "grad_norm": 1.078125, + "learning_rate": 3.5384615384615386e-07, + "loss": 1.6263, + "step": 193100 + }, + { + "epoch": 1.2279608984707693, + "grad_norm": 1.09375, + "learning_rate": 3.487179487179487e-07, + "loss": 1.6212, + "step": 193200 + }, + { + "epoch": 1.228596488997928, + "grad_norm": 1.1328125, + "learning_rate": 3.435897435897436e-07, + "loss": 1.6289, + "step": 193300 + }, + { + "epoch": 1.2292320795250868, + "grad_norm": 0.671875, + "learning_rate": 3.3846153846153845e-07, + "loss": 1.6209, + "step": 193400 + }, + { + "epoch": 1.2298676700522455, + "grad_norm": 0.6328125, + "learning_rate": 3.3333333333333335e-07, + "loss": 1.6283, + "step": 193500 + }, + { + "epoch": 1.2305032605794044, + "grad_norm": 0.875, + "learning_rate": 3.2820512820512825e-07, + "loss": 1.6371, + "step": 193600 + }, + { + "epoch": 1.231138851106563, + "grad_norm": 0.66015625, + "learning_rate": 3.230769230769231e-07, + "loss": 1.6415, + "step": 193700 + }, + { + "epoch": 1.231774441633722, + "grad_norm": 0.8828125, + "learning_rate": 3.17948717948718e-07, + "loss": 1.6124, + "step": 193800 + }, + { + "epoch": 1.2324100321608806, + "grad_norm": 1.046875, + "learning_rate": 3.1282051282051284e-07, + "loss": 1.6265, + "step": 193900 + }, + { + "epoch": 1.2330456226880395, + "grad_norm": 0.95703125, + "learning_rate": 3.0769230769230774e-07, + "loss": 1.6312, + "step": 194000 + }, + { + "epoch": 1.2336812132151982, + "grad_norm": 0.98828125, + "learning_rate": 3.025641025641026e-07, + "loss": 1.6242, + "step": 194100 + }, + { + "epoch": 1.234316803742357, + "grad_norm": 0.84375, + "learning_rate": 2.9743589743589744e-07, + "loss": 1.6233, + "step": 194200 + }, + { + "epoch": 1.2349523942695158, + "grad_norm": 0.61328125, + "learning_rate": 2.9230769230769234e-07, + "loss": 1.6164, + "step": 194300 + }, + { + "epoch": 1.2355879847966746, + "grad_norm": 1.21875, + "learning_rate": 2.871794871794872e-07, + "loss": 1.6302, + "step": 194400 + }, + { + "epoch": 1.2362235753238333, + "grad_norm": 0.6953125, + "learning_rate": 2.820512820512821e-07, + "loss": 1.6237, + "step": 194500 + }, + { + "epoch": 1.2368591658509922, + "grad_norm": 1.1328125, + "learning_rate": 2.76923076923077e-07, + "loss": 1.6299, + "step": 194600 + }, + { + "epoch": 1.2374947563781509, + "grad_norm": 1.1953125, + "learning_rate": 2.717948717948718e-07, + "loss": 1.6201, + "step": 194700 + }, + { + "epoch": 1.2381303469053098, + "grad_norm": 0.94921875, + "learning_rate": 2.666666666666667e-07, + "loss": 1.6312, + "step": 194800 + }, + { + "epoch": 1.2387659374324684, + "grad_norm": 0.76953125, + "learning_rate": 2.6153846153846157e-07, + "loss": 1.616, + "step": 194900 + }, + { + "epoch": 1.2394015279596273, + "grad_norm": 0.82421875, + "learning_rate": 2.564102564102564e-07, + "loss": 1.6243, + "step": 195000 + }, + { + "epoch": 1.240037118486786, + "grad_norm": 0.85546875, + "learning_rate": 2.512820512820513e-07, + "loss": 1.6187, + "step": 195100 + }, + { + "epoch": 1.2406727090139449, + "grad_norm": 1.1953125, + "learning_rate": 2.4615384615384616e-07, + "loss": 1.6108, + "step": 195200 + }, + { + "epoch": 1.2413082995411036, + "grad_norm": 0.88671875, + "learning_rate": 2.4102564102564106e-07, + "loss": 1.6293, + "step": 195300 + }, + { + "epoch": 1.2419438900682624, + "grad_norm": 1.1640625, + "learning_rate": 2.3589743589743593e-07, + "loss": 1.624, + "step": 195400 + }, + { + "epoch": 1.2425794805954211, + "grad_norm": 0.9140625, + "learning_rate": 2.307692307692308e-07, + "loss": 1.6333, + "step": 195500 + }, + { + "epoch": 1.24321507112258, + "grad_norm": 0.875, + "learning_rate": 2.2564102564102568e-07, + "loss": 1.6186, + "step": 195600 + }, + { + "epoch": 1.2438506616497387, + "grad_norm": 0.88671875, + "learning_rate": 2.2051282051282053e-07, + "loss": 1.6238, + "step": 195700 + }, + { + "epoch": 1.2444862521768976, + "grad_norm": 0.9765625, + "learning_rate": 2.153846153846154e-07, + "loss": 1.6251, + "step": 195800 + }, + { + "epoch": 1.2451218427040565, + "grad_norm": 0.9375, + "learning_rate": 2.1025641025641027e-07, + "loss": 1.6266, + "step": 195900 + }, + { + "epoch": 1.2457574332312151, + "grad_norm": 1.265625, + "learning_rate": 2.0512820512820514e-07, + "loss": 1.629, + "step": 196000 + }, + { + "epoch": 1.2463930237583738, + "grad_norm": 0.64453125, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.6201, + "step": 196100 + }, + { + "epoch": 1.2470286142855327, + "grad_norm": 1.2265625, + "learning_rate": 1.948717948717949e-07, + "loss": 1.6353, + "step": 196200 + }, + { + "epoch": 1.2476642048126916, + "grad_norm": 0.84375, + "learning_rate": 1.8974358974358976e-07, + "loss": 1.6252, + "step": 196300 + }, + { + "epoch": 1.2482997953398502, + "grad_norm": 0.8046875, + "learning_rate": 1.8461538461538464e-07, + "loss": 1.6206, + "step": 196400 + }, + { + "epoch": 1.248935385867009, + "grad_norm": 0.91796875, + "learning_rate": 1.7948717948717948e-07, + "loss": 1.628, + "step": 196500 + }, + { + "epoch": 1.2495709763941678, + "grad_norm": 0.9375, + "learning_rate": 1.7435897435897435e-07, + "loss": 1.6288, + "step": 196600 + }, + { + "epoch": 1.2502065669213267, + "grad_norm": 0.9609375, + "learning_rate": 1.6923076923076923e-07, + "loss": 1.634, + "step": 196700 + }, + { + "epoch": 1.2508421574484854, + "grad_norm": 0.74609375, + "learning_rate": 1.6410256410256413e-07, + "loss": 1.6227, + "step": 196800 + }, + { + "epoch": 1.251477747975644, + "grad_norm": 0.9609375, + "learning_rate": 1.58974358974359e-07, + "loss": 1.6262, + "step": 196900 + }, + { + "epoch": 1.252113338502803, + "grad_norm": 0.72265625, + "learning_rate": 1.5384615384615387e-07, + "loss": 1.6395, + "step": 197000 + }, + { + "epoch": 1.2527489290299618, + "grad_norm": 0.8515625, + "learning_rate": 1.4871794871794872e-07, + "loss": 1.6084, + "step": 197100 + }, + { + "epoch": 1.2533845195571205, + "grad_norm": 0.8671875, + "learning_rate": 1.435897435897436e-07, + "loss": 1.6321, + "step": 197200 + }, + { + "epoch": 1.2540201100842794, + "grad_norm": 0.70703125, + "learning_rate": 1.384615384615385e-07, + "loss": 1.6181, + "step": 197300 + }, + { + "epoch": 1.254655700611438, + "grad_norm": 0.78125, + "learning_rate": 1.3333333333333336e-07, + "loss": 1.627, + "step": 197400 + }, + { + "epoch": 1.255291291138597, + "grad_norm": 0.6953125, + "learning_rate": 1.282051282051282e-07, + "loss": 1.6061, + "step": 197500 + }, + { + "epoch": 1.2559268816657556, + "grad_norm": 0.9296875, + "learning_rate": 1.2307692307692308e-07, + "loss": 1.629, + "step": 197600 + }, + { + "epoch": 1.2565624721929145, + "grad_norm": 0.98828125, + "learning_rate": 1.1794871794871797e-07, + "loss": 1.6315, + "step": 197700 + }, + { + "epoch": 1.2571980627200732, + "grad_norm": 0.88671875, + "learning_rate": 1.1282051282051284e-07, + "loss": 1.6194, + "step": 197800 + }, + { + "epoch": 1.257833653247232, + "grad_norm": 0.8515625, + "learning_rate": 1.076923076923077e-07, + "loss": 1.6216, + "step": 197900 + }, + { + "epoch": 1.2584692437743907, + "grad_norm": 0.8125, + "learning_rate": 1.0256410256410257e-07, + "loss": 1.6253, + "step": 198000 + }, + { + "epoch": 1.2591048343015496, + "grad_norm": 1.0234375, + "learning_rate": 9.743589743589745e-08, + "loss": 1.6114, + "step": 198100 + }, + { + "epoch": 1.2597404248287083, + "grad_norm": 0.9765625, + "learning_rate": 9.230769230769232e-08, + "loss": 1.6078, + "step": 198200 + }, + { + "epoch": 1.2603760153558672, + "grad_norm": 0.87109375, + "learning_rate": 8.717948717948718e-08, + "loss": 1.6376, + "step": 198300 + }, + { + "epoch": 1.2610116058830259, + "grad_norm": 0.8828125, + "learning_rate": 8.205128205128206e-08, + "loss": 1.6233, + "step": 198400 + }, + { + "epoch": 1.2616471964101847, + "grad_norm": 0.98828125, + "learning_rate": 7.692307692307694e-08, + "loss": 1.6106, + "step": 198500 + }, + { + "epoch": 1.2622827869373434, + "grad_norm": 0.9375, + "learning_rate": 7.17948717948718e-08, + "loss": 1.6388, + "step": 198600 + }, + { + "epoch": 1.2629183774645023, + "grad_norm": 2.09375, + "learning_rate": 6.666666666666668e-08, + "loss": 1.6184, + "step": 198700 + }, + { + "epoch": 1.263553967991661, + "grad_norm": 1.2734375, + "learning_rate": 6.153846153846154e-08, + "loss": 1.616, + "step": 198800 + }, + { + "epoch": 1.2641895585188199, + "grad_norm": 1.1171875, + "learning_rate": 5.641025641025642e-08, + "loss": 1.6208, + "step": 198900 + }, + { + "epoch": 1.2648251490459785, + "grad_norm": 1.0390625, + "learning_rate": 5.1282051282051286e-08, + "loss": 1.606, + "step": 199000 + }, + { + "epoch": 1.2654607395731374, + "grad_norm": 1.015625, + "learning_rate": 4.615384615384616e-08, + "loss": 1.6127, + "step": 199100 + }, + { + "epoch": 1.266096330100296, + "grad_norm": 0.89453125, + "learning_rate": 4.102564102564103e-08, + "loss": 1.6287, + "step": 199200 + }, + { + "epoch": 1.266731920627455, + "grad_norm": 1.2421875, + "learning_rate": 3.58974358974359e-08, + "loss": 1.6168, + "step": 199300 + }, + { + "epoch": 1.2673675111546139, + "grad_norm": 0.93359375, + "learning_rate": 3.076923076923077e-08, + "loss": 1.6341, + "step": 199400 + }, + { + "epoch": 1.2680031016817725, + "grad_norm": 0.8671875, + "learning_rate": 2.5641025641025643e-08, + "loss": 1.6117, + "step": 199500 + }, + { + "epoch": 1.2686386922089312, + "grad_norm": 0.9609375, + "learning_rate": 2.0512820512820516e-08, + "loss": 1.6181, + "step": 199600 + }, + { + "epoch": 1.26927428273609, + "grad_norm": 0.68359375, + "learning_rate": 1.5384615384615385e-08, + "loss": 1.624, + "step": 199700 + }, + { + "epoch": 1.269909873263249, + "grad_norm": 0.9921875, + "learning_rate": 1.0256410256410258e-08, + "loss": 1.6318, + "step": 199800 + }, + { + "epoch": 1.2705454637904077, + "grad_norm": 0.953125, + "learning_rate": 5.128205128205129e-09, + "loss": 1.6126, + "step": 199900 + }, + { + "epoch": 1.2711810543175663, + "grad_norm": 0.9609375, + "learning_rate": 0.0, + "loss": 1.6342, + "step": 200000 + } + ], + "logging_steps": 100, + "max_steps": 200000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.369174972818627e+19, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}