diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.051794127633839504, + "epoch": 0.06363278537871711, "eval_steps": 73, - "global_step": 7665, + "global_step": 9417, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -54510,6 +54510,12462 @@ "eval_samples_per_second": 1.952, "eval_steps_per_second": 1.085, "step": 7665 + }, + { + "epoch": 0.05, + "grad_norm": 4.858198007581066, + "learning_rate": 1.9991746281223257e-06, + "loss": 1.3773, + "step": 7666 + }, + { + "epoch": 0.05, + "grad_norm": 4.418985838139268, + "learning_rate": 1.9991744125235314e-06, + "loss": 1.3106, + "step": 7667 + }, + { + "epoch": 0.05, + "grad_norm": 4.318416281379778, + "learning_rate": 1.999174196896593e-06, + "loss": 1.4869, + "step": 7668 + }, + { + "epoch": 0.05, + "grad_norm": 4.694071144513556, + "learning_rate": 1.9991739812415118e-06, + "loss": 1.5267, + "step": 7669 + }, + { + "epoch": 0.05, + "grad_norm": 4.373950245690968, + "learning_rate": 1.999173765558287e-06, + "loss": 1.4851, + "step": 7670 + }, + { + "epoch": 0.05, + "grad_norm": 4.324197333779506, + "learning_rate": 1.9991735498469185e-06, + "loss": 1.3096, + "step": 7671 + }, + { + "epoch": 0.05, + "grad_norm": 4.394592318974435, + "learning_rate": 1.999173334107407e-06, + "loss": 1.3644, + "step": 7672 + }, + { + "epoch": 0.05, + "grad_norm": 4.266670772743948, + "learning_rate": 1.999173118339752e-06, + "loss": 1.362, + "step": 7673 + }, + { + "epoch": 0.05, + "grad_norm": 4.295955296296668, + "learning_rate": 1.9991729025439536e-06, + "loss": 1.3735, + "step": 7674 + }, + { + "epoch": 0.05, + "grad_norm": 4.246048670721659, + "learning_rate": 1.999172686720012e-06, + "loss": 1.449, + "step": 7675 + }, + { + "epoch": 0.05, + "grad_norm": 4.638944960089106, + "learning_rate": 1.999172470867927e-06, + "loss": 1.2955, + "step": 7676 + }, + { + "epoch": 0.05, + "grad_norm": 5.580028912051613, + "learning_rate": 1.9991722549876982e-06, + "loss": 1.2753, + "step": 7677 + }, + { + "epoch": 0.05, + "grad_norm": 4.138273856542915, + "learning_rate": 1.9991720390793265e-06, + "loss": 1.3861, + "step": 7678 + }, + { + "epoch": 0.05, + "grad_norm": 4.470127672789156, + "learning_rate": 1.9991718231428113e-06, + "loss": 1.4956, + "step": 7679 + }, + { + "epoch": 0.05, + "grad_norm": 8.008273460136488, + "learning_rate": 1.9991716071781525e-06, + "loss": 1.3993, + "step": 7680 + }, + { + "epoch": 0.05, + "grad_norm": 4.878399604988985, + "learning_rate": 1.9991713911853507e-06, + "loss": 1.4606, + "step": 7681 + }, + { + "epoch": 0.05, + "grad_norm": 4.407855914055984, + "learning_rate": 1.9991711751644057e-06, + "loss": 1.3583, + "step": 7682 + }, + { + "epoch": 0.05, + "grad_norm": 4.7127898730579885, + "learning_rate": 1.999170959115317e-06, + "loss": 1.4571, + "step": 7683 + }, + { + "epoch": 0.05, + "grad_norm": 5.769010678931985, + "learning_rate": 1.9991707430380853e-06, + "loss": 1.387, + "step": 7684 + }, + { + "epoch": 0.05, + "grad_norm": 4.755844513192177, + "learning_rate": 1.9991705269327098e-06, + "loss": 1.5078, + "step": 7685 + }, + { + "epoch": 0.05, + "grad_norm": 4.318945303055633, + "learning_rate": 1.999170310799191e-06, + "loss": 1.3274, + "step": 7686 + }, + { + "epoch": 0.05, + "grad_norm": 4.161086412670242, + "learning_rate": 1.9991700946375295e-06, + "loss": 1.3265, + "step": 7687 + }, + { + "epoch": 0.05, + "grad_norm": 4.6455919428608885, + "learning_rate": 1.9991698784477243e-06, + "loss": 1.3427, + "step": 7688 + }, + { + "epoch": 0.05, + "grad_norm": 4.898295652476637, + "learning_rate": 1.999169662229776e-06, + "loss": 1.3903, + "step": 7689 + }, + { + "epoch": 0.05, + "grad_norm": 5.028746890185826, + "learning_rate": 1.9991694459836842e-06, + "loss": 1.4847, + "step": 7690 + }, + { + "epoch": 0.05, + "grad_norm": 4.555505230137718, + "learning_rate": 1.9991692297094493e-06, + "loss": 1.3457, + "step": 7691 + }, + { + "epoch": 0.05, + "grad_norm": 5.253081476633083, + "learning_rate": 1.9991690134070713e-06, + "loss": 1.3342, + "step": 7692 + }, + { + "epoch": 0.05, + "grad_norm": 4.225732921026402, + "learning_rate": 1.99916879707655e-06, + "loss": 1.292, + "step": 7693 + }, + { + "epoch": 0.05, + "grad_norm": 4.265843493123243, + "learning_rate": 1.9991685807178848e-06, + "loss": 1.3243, + "step": 7694 + }, + { + "epoch": 0.05, + "grad_norm": 4.624620250875206, + "learning_rate": 1.9991683643310767e-06, + "loss": 1.1394, + "step": 7695 + }, + { + "epoch": 0.05, + "grad_norm": 4.509476668068783, + "learning_rate": 1.9991681479161255e-06, + "loss": 1.3454, + "step": 7696 + }, + { + "epoch": 0.05, + "grad_norm": 4.790735666943298, + "learning_rate": 1.999167931473031e-06, + "loss": 1.5233, + "step": 7697 + }, + { + "epoch": 0.05, + "grad_norm": 4.199591650612746, + "learning_rate": 1.9991677150017933e-06, + "loss": 1.321, + "step": 7698 + }, + { + "epoch": 0.05, + "grad_norm": 5.17592961582678, + "learning_rate": 1.999167498502412e-06, + "loss": 1.3803, + "step": 7699 + }, + { + "epoch": 0.05, + "grad_norm": 4.6320550743298705, + "learning_rate": 1.9991672819748876e-06, + "loss": 1.373, + "step": 7700 + }, + { + "epoch": 0.05, + "grad_norm": 5.043437303463971, + "learning_rate": 1.9991670654192204e-06, + "loss": 1.3333, + "step": 7701 + }, + { + "epoch": 0.05, + "grad_norm": 4.209274659703086, + "learning_rate": 1.99916684883541e-06, + "loss": 1.319, + "step": 7702 + }, + { + "epoch": 0.05, + "grad_norm": 5.210886527687919, + "learning_rate": 1.999166632223456e-06, + "loss": 1.4947, + "step": 7703 + }, + { + "epoch": 0.05, + "grad_norm": 4.3022353892124965, + "learning_rate": 1.999166415583359e-06, + "loss": 1.3958, + "step": 7704 + }, + { + "epoch": 0.05, + "grad_norm": 5.048515012407663, + "learning_rate": 1.9991661989151186e-06, + "loss": 1.2334, + "step": 7705 + }, + { + "epoch": 0.05, + "grad_norm": 4.616532323992355, + "learning_rate": 1.999165982218735e-06, + "loss": 1.3724, + "step": 7706 + }, + { + "epoch": 0.05, + "grad_norm": 5.2642419668717135, + "learning_rate": 1.9991657654942086e-06, + "loss": 1.3066, + "step": 7707 + }, + { + "epoch": 0.05, + "grad_norm": 4.384429344311389, + "learning_rate": 1.999165548741539e-06, + "loss": 1.3015, + "step": 7708 + }, + { + "epoch": 0.05, + "grad_norm": 5.102380930634415, + "learning_rate": 1.999165331960726e-06, + "loss": 1.2993, + "step": 7709 + }, + { + "epoch": 0.05, + "grad_norm": 4.539353904636387, + "learning_rate": 1.9991651151517693e-06, + "loss": 1.385, + "step": 7710 + }, + { + "epoch": 0.05, + "grad_norm": 5.352630533139624, + "learning_rate": 1.9991648983146704e-06, + "loss": 1.3978, + "step": 7711 + }, + { + "epoch": 0.05, + "grad_norm": 4.280251079924495, + "learning_rate": 1.999164681449428e-06, + "loss": 1.4009, + "step": 7712 + }, + { + "epoch": 0.05, + "grad_norm": 5.394229640539235, + "learning_rate": 1.9991644645560425e-06, + "loss": 1.3591, + "step": 7713 + }, + { + "epoch": 0.05, + "grad_norm": 4.440874286887443, + "learning_rate": 1.9991642476345135e-06, + "loss": 1.4757, + "step": 7714 + }, + { + "epoch": 0.05, + "grad_norm": 5.283008931402802, + "learning_rate": 1.999164030684842e-06, + "loss": 1.5276, + "step": 7715 + }, + { + "epoch": 0.05, + "grad_norm": 6.806909170081613, + "learning_rate": 1.9991638137070266e-06, + "loss": 1.3684, + "step": 7716 + }, + { + "epoch": 0.05, + "grad_norm": 4.289471770878402, + "learning_rate": 1.9991635967010688e-06, + "loss": 1.3916, + "step": 7717 + }, + { + "epoch": 0.05, + "grad_norm": 4.4840091204695725, + "learning_rate": 1.9991633796669674e-06, + "loss": 1.4037, + "step": 7718 + }, + { + "epoch": 0.05, + "grad_norm": 4.632919184504854, + "learning_rate": 1.999163162604723e-06, + "loss": 1.445, + "step": 7719 + }, + { + "epoch": 0.05, + "grad_norm": 4.917869072196511, + "learning_rate": 1.9991629455143354e-06, + "loss": 1.4749, + "step": 7720 + }, + { + "epoch": 0.05, + "grad_norm": 4.872851811456384, + "learning_rate": 1.999162728395805e-06, + "loss": 1.2289, + "step": 7721 + }, + { + "epoch": 0.05, + "grad_norm": 5.599025969389777, + "learning_rate": 1.9991625112491314e-06, + "loss": 1.2262, + "step": 7722 + }, + { + "epoch": 0.05, + "grad_norm": 4.527501135619346, + "learning_rate": 1.9991622940743145e-06, + "loss": 1.3792, + "step": 7723 + }, + { + "epoch": 0.05, + "grad_norm": 5.005597639086992, + "learning_rate": 1.999162076871355e-06, + "loss": 1.4674, + "step": 7724 + }, + { + "epoch": 0.05, + "grad_norm": 4.141539620779096, + "learning_rate": 1.999161859640252e-06, + "loss": 1.2431, + "step": 7725 + }, + { + "epoch": 0.05, + "grad_norm": 4.903599320285552, + "learning_rate": 1.999161642381006e-06, + "loss": 1.3699, + "step": 7726 + }, + { + "epoch": 0.05, + "grad_norm": 4.530716110763893, + "learning_rate": 1.999161425093617e-06, + "loss": 1.409, + "step": 7727 + }, + { + "epoch": 0.05, + "grad_norm": 4.419189511613821, + "learning_rate": 1.999161207778085e-06, + "loss": 1.3161, + "step": 7728 + }, + { + "epoch": 0.05, + "grad_norm": 4.517876307126391, + "learning_rate": 1.99916099043441e-06, + "loss": 1.4033, + "step": 7729 + }, + { + "epoch": 0.05, + "grad_norm": 5.046973948033741, + "learning_rate": 1.999160773062592e-06, + "loss": 1.4555, + "step": 7730 + }, + { + "epoch": 0.05, + "grad_norm": 5.20427528859649, + "learning_rate": 1.999160555662631e-06, + "loss": 1.4801, + "step": 7731 + }, + { + "epoch": 0.05, + "grad_norm": 4.529459234389931, + "learning_rate": 1.9991603382345262e-06, + "loss": 1.2787, + "step": 7732 + }, + { + "epoch": 0.05, + "grad_norm": 4.396261511256881, + "learning_rate": 1.9991601207782793e-06, + "loss": 1.3565, + "step": 7733 + }, + { + "epoch": 0.05, + "grad_norm": 4.436307213266451, + "learning_rate": 1.999159903293889e-06, + "loss": 1.4514, + "step": 7734 + }, + { + "epoch": 0.05, + "grad_norm": 4.504728249316001, + "learning_rate": 1.9991596857813557e-06, + "loss": 1.4202, + "step": 7735 + }, + { + "epoch": 0.05, + "grad_norm": 4.670095542486996, + "learning_rate": 1.9991594682406795e-06, + "loss": 1.3657, + "step": 7736 + }, + { + "epoch": 0.05, + "grad_norm": 4.698157926590413, + "learning_rate": 1.9991592506718602e-06, + "loss": 1.3066, + "step": 7737 + }, + { + "epoch": 0.05, + "grad_norm": 5.174875762511404, + "learning_rate": 1.999159033074898e-06, + "loss": 1.3919, + "step": 7738 + }, + { + "epoch": 0.05, + "eval_loss": 1.5714409351348877, + "eval_runtime": 4.603, + "eval_samples_per_second": 1.955, + "eval_steps_per_second": 1.086, + "step": 7738 + }, + { + "epoch": 0.05, + "grad_norm": 4.175707810276016, + "learning_rate": 1.9991588154497928e-06, + "loss": 1.2561, + "step": 7739 + }, + { + "epoch": 0.05, + "grad_norm": 4.93781445252905, + "learning_rate": 1.9991585977965446e-06, + "loss": 1.3439, + "step": 7740 + }, + { + "epoch": 0.05, + "grad_norm": 4.805994091804873, + "learning_rate": 1.9991583801151534e-06, + "loss": 1.3494, + "step": 7741 + }, + { + "epoch": 0.05, + "grad_norm": 4.410801166793402, + "learning_rate": 1.999158162405619e-06, + "loss": 1.4503, + "step": 7742 + }, + { + "epoch": 0.05, + "grad_norm": 4.420418755361164, + "learning_rate": 1.999157944667942e-06, + "loss": 1.388, + "step": 7743 + }, + { + "epoch": 0.05, + "grad_norm": 4.6956053703000284, + "learning_rate": 1.999157726902122e-06, + "loss": 1.3943, + "step": 7744 + }, + { + "epoch": 0.05, + "grad_norm": 4.8785853587190315, + "learning_rate": 1.999157509108159e-06, + "loss": 1.5745, + "step": 7745 + }, + { + "epoch": 0.05, + "grad_norm": 4.533105281280708, + "learning_rate": 1.9991572912860532e-06, + "loss": 1.166, + "step": 7746 + }, + { + "epoch": 0.05, + "grad_norm": 4.54046659217364, + "learning_rate": 1.9991570734358043e-06, + "loss": 1.3861, + "step": 7747 + }, + { + "epoch": 0.05, + "grad_norm": 7.421652151878216, + "learning_rate": 1.999156855557412e-06, + "loss": 1.4499, + "step": 7748 + }, + { + "epoch": 0.05, + "grad_norm": 4.698522721572353, + "learning_rate": 1.9991566376508775e-06, + "loss": 1.4939, + "step": 7749 + }, + { + "epoch": 0.05, + "grad_norm": 4.422380492771535, + "learning_rate": 1.9991564197161997e-06, + "loss": 1.3063, + "step": 7750 + }, + { + "epoch": 0.05, + "grad_norm": 4.581948781805496, + "learning_rate": 1.999156201753379e-06, + "loss": 1.3506, + "step": 7751 + }, + { + "epoch": 0.05, + "grad_norm": 4.966434544463009, + "learning_rate": 1.9991559837624156e-06, + "loss": 1.4167, + "step": 7752 + }, + { + "epoch": 0.05, + "grad_norm": 4.494576354753205, + "learning_rate": 1.9991557657433093e-06, + "loss": 1.3358, + "step": 7753 + }, + { + "epoch": 0.05, + "grad_norm": 4.443704979027001, + "learning_rate": 1.99915554769606e-06, + "loss": 1.2271, + "step": 7754 + }, + { + "epoch": 0.05, + "grad_norm": 5.05415589509564, + "learning_rate": 1.999155329620668e-06, + "loss": 1.2592, + "step": 7755 + }, + { + "epoch": 0.05, + "grad_norm": 4.658098683784834, + "learning_rate": 1.999155111517133e-06, + "loss": 1.1916, + "step": 7756 + }, + { + "epoch": 0.05, + "grad_norm": 4.146560184197253, + "learning_rate": 1.9991548933854546e-06, + "loss": 1.2936, + "step": 7757 + }, + { + "epoch": 0.05, + "grad_norm": 4.840651733689375, + "learning_rate": 1.9991546752256338e-06, + "loss": 1.5336, + "step": 7758 + }, + { + "epoch": 0.05, + "grad_norm": 4.3703138172329385, + "learning_rate": 1.9991544570376702e-06, + "loss": 1.2503, + "step": 7759 + }, + { + "epoch": 0.05, + "grad_norm": 4.485609813261699, + "learning_rate": 1.9991542388215636e-06, + "loss": 1.3837, + "step": 7760 + }, + { + "epoch": 0.05, + "grad_norm": 4.41336165208518, + "learning_rate": 1.9991540205773143e-06, + "loss": 1.3231, + "step": 7761 + }, + { + "epoch": 0.05, + "grad_norm": 4.694239628147161, + "learning_rate": 1.9991538023049223e-06, + "loss": 1.3579, + "step": 7762 + }, + { + "epoch": 0.05, + "grad_norm": 8.023323210257608, + "learning_rate": 1.9991535840043872e-06, + "loss": 1.1974, + "step": 7763 + }, + { + "epoch": 0.05, + "grad_norm": 7.428292899323734, + "learning_rate": 1.9991533656757095e-06, + "loss": 1.348, + "step": 7764 + }, + { + "epoch": 0.05, + "grad_norm": 4.619712492638562, + "learning_rate": 1.9991531473188887e-06, + "loss": 1.2491, + "step": 7765 + }, + { + "epoch": 0.05, + "grad_norm": 4.451820552376083, + "learning_rate": 1.9991529289339247e-06, + "loss": 1.33, + "step": 7766 + }, + { + "epoch": 0.05, + "grad_norm": 14.239291286538496, + "learning_rate": 1.9991527105208186e-06, + "loss": 1.4719, + "step": 7767 + }, + { + "epoch": 0.05, + "grad_norm": 4.982827409515907, + "learning_rate": 1.9991524920795693e-06, + "loss": 1.2224, + "step": 7768 + }, + { + "epoch": 0.05, + "grad_norm": 4.655043526777409, + "learning_rate": 1.9991522736101774e-06, + "loss": 1.4407, + "step": 7769 + }, + { + "epoch": 0.05, + "grad_norm": 4.651583819715488, + "learning_rate": 1.9991520551126428e-06, + "loss": 1.4398, + "step": 7770 + }, + { + "epoch": 0.05, + "grad_norm": 4.235185466684276, + "learning_rate": 1.999151836586965e-06, + "loss": 1.3399, + "step": 7771 + }, + { + "epoch": 0.05, + "grad_norm": 7.3405089471179, + "learning_rate": 1.9991516180331447e-06, + "loss": 1.3231, + "step": 7772 + }, + { + "epoch": 0.05, + "grad_norm": 4.512421989259268, + "learning_rate": 1.9991513994511817e-06, + "loss": 1.3765, + "step": 7773 + }, + { + "epoch": 0.05, + "grad_norm": 5.326834488426423, + "learning_rate": 1.9991511808410755e-06, + "loss": 1.2044, + "step": 7774 + }, + { + "epoch": 0.05, + "grad_norm": 4.253178385231695, + "learning_rate": 1.999150962202827e-06, + "loss": 1.1378, + "step": 7775 + }, + { + "epoch": 0.05, + "grad_norm": 4.852808156252643, + "learning_rate": 1.9991507435364357e-06, + "loss": 1.4702, + "step": 7776 + }, + { + "epoch": 0.05, + "grad_norm": 4.61770150323408, + "learning_rate": 1.9991505248419016e-06, + "loss": 1.3665, + "step": 7777 + }, + { + "epoch": 0.05, + "grad_norm": 4.332576443131674, + "learning_rate": 1.9991503061192243e-06, + "loss": 1.3262, + "step": 7778 + }, + { + "epoch": 0.05, + "grad_norm": 4.419350171628437, + "learning_rate": 1.999150087368405e-06, + "loss": 1.4025, + "step": 7779 + }, + { + "epoch": 0.05, + "grad_norm": 4.29821925327133, + "learning_rate": 1.9991498685894427e-06, + "loss": 1.344, + "step": 7780 + }, + { + "epoch": 0.05, + "grad_norm": 4.96186103873307, + "learning_rate": 1.999149649782337e-06, + "loss": 1.4244, + "step": 7781 + }, + { + "epoch": 0.05, + "grad_norm": 5.290680575600782, + "learning_rate": 1.9991494309470896e-06, + "loss": 1.4055, + "step": 7782 + }, + { + "epoch": 0.05, + "grad_norm": 4.171848548411604, + "learning_rate": 1.999149212083699e-06, + "loss": 1.4467, + "step": 7783 + }, + { + "epoch": 0.05, + "grad_norm": 4.483008484867138, + "learning_rate": 1.9991489931921657e-06, + "loss": 1.3366, + "step": 7784 + }, + { + "epoch": 0.05, + "grad_norm": 4.716816966629754, + "learning_rate": 1.9991487742724894e-06, + "loss": 1.4159, + "step": 7785 + }, + { + "epoch": 0.05, + "grad_norm": 4.250233570874446, + "learning_rate": 1.999148555324671e-06, + "loss": 1.3413, + "step": 7786 + }, + { + "epoch": 0.05, + "grad_norm": 4.319258666893267, + "learning_rate": 1.9991483363487095e-06, + "loss": 1.4302, + "step": 7787 + }, + { + "epoch": 0.05, + "grad_norm": 4.379931424126575, + "learning_rate": 1.9991481173446056e-06, + "loss": 1.2281, + "step": 7788 + }, + { + "epoch": 0.05, + "grad_norm": 4.4928101612259645, + "learning_rate": 1.999147898312359e-06, + "loss": 1.4116, + "step": 7789 + }, + { + "epoch": 0.05, + "grad_norm": 5.531646572971643, + "learning_rate": 1.9991476792519693e-06, + "loss": 1.2289, + "step": 7790 + }, + { + "epoch": 0.05, + "grad_norm": 4.433755615515491, + "learning_rate": 1.9991474601634374e-06, + "loss": 1.2472, + "step": 7791 + }, + { + "epoch": 0.05, + "grad_norm": 4.555845638794292, + "learning_rate": 1.9991472410467628e-06, + "loss": 1.3909, + "step": 7792 + }, + { + "epoch": 0.05, + "grad_norm": 6.5910918415917115, + "learning_rate": 1.999147021901945e-06, + "loss": 1.4906, + "step": 7793 + }, + { + "epoch": 0.05, + "grad_norm": 7.123243079842171, + "learning_rate": 1.999146802728985e-06, + "loss": 1.2805, + "step": 7794 + }, + { + "epoch": 0.05, + "grad_norm": 5.895052706646069, + "learning_rate": 1.9991465835278825e-06, + "loss": 1.5177, + "step": 7795 + }, + { + "epoch": 0.05, + "grad_norm": 7.281958515549516, + "learning_rate": 1.9991463642986372e-06, + "loss": 1.3783, + "step": 7796 + }, + { + "epoch": 0.05, + "grad_norm": 4.497646384942129, + "learning_rate": 1.9991461450412493e-06, + "loss": 1.2287, + "step": 7797 + }, + { + "epoch": 0.05, + "grad_norm": 4.3093800668051925, + "learning_rate": 1.9991459257557187e-06, + "loss": 1.3017, + "step": 7798 + }, + { + "epoch": 0.05, + "grad_norm": 4.149845365434638, + "learning_rate": 1.9991457064420458e-06, + "loss": 1.2618, + "step": 7799 + }, + { + "epoch": 0.05, + "grad_norm": 4.943966845201791, + "learning_rate": 1.99914548710023e-06, + "loss": 1.4804, + "step": 7800 + }, + { + "epoch": 0.05, + "grad_norm": 4.593668643901737, + "learning_rate": 1.999145267730271e-06, + "loss": 1.4017, + "step": 7801 + }, + { + "epoch": 0.05, + "grad_norm": 4.864193671590243, + "learning_rate": 1.9991450483321703e-06, + "loss": 1.5987, + "step": 7802 + }, + { + "epoch": 0.05, + "grad_norm": 4.858887370929525, + "learning_rate": 1.9991448289059268e-06, + "loss": 1.2643, + "step": 7803 + }, + { + "epoch": 0.05, + "grad_norm": 6.364807491989936, + "learning_rate": 1.9991446094515405e-06, + "loss": 1.3496, + "step": 7804 + }, + { + "epoch": 0.05, + "grad_norm": 4.169090337062379, + "learning_rate": 1.999144389969012e-06, + "loss": 1.3751, + "step": 7805 + }, + { + "epoch": 0.05, + "grad_norm": 4.270304158177507, + "learning_rate": 1.9991441704583405e-06, + "loss": 1.2546, + "step": 7806 + }, + { + "epoch": 0.05, + "grad_norm": 4.65265370191332, + "learning_rate": 1.9991439509195267e-06, + "loss": 1.2049, + "step": 7807 + }, + { + "epoch": 0.05, + "grad_norm": 5.638393088093555, + "learning_rate": 1.9991437313525702e-06, + "loss": 1.3842, + "step": 7808 + }, + { + "epoch": 0.05, + "grad_norm": 4.3623669323791745, + "learning_rate": 1.999143511757471e-06, + "loss": 1.3777, + "step": 7809 + }, + { + "epoch": 0.05, + "grad_norm": 5.332668611820201, + "learning_rate": 1.9991432921342297e-06, + "loss": 1.4373, + "step": 7810 + }, + { + "epoch": 0.05, + "grad_norm": 4.45144845665693, + "learning_rate": 1.9991430724828456e-06, + "loss": 1.4446, + "step": 7811 + }, + { + "epoch": 0.05, + "eval_loss": 1.5688176155090332, + "eval_runtime": 4.5803, + "eval_samples_per_second": 1.965, + "eval_steps_per_second": 1.092, + "step": 7811 + }, + { + "epoch": 0.05, + "grad_norm": 4.32891217625353, + "learning_rate": 1.999142852803319e-06, + "loss": 1.341, + "step": 7812 + }, + { + "epoch": 0.05, + "grad_norm": 4.357331814719709, + "learning_rate": 1.9991426330956495e-06, + "loss": 1.2788, + "step": 7813 + }, + { + "epoch": 0.05, + "grad_norm": 4.47179926701401, + "learning_rate": 1.999142413359838e-06, + "loss": 1.3917, + "step": 7814 + }, + { + "epoch": 0.05, + "grad_norm": 4.635258202399129, + "learning_rate": 1.9991421935958836e-06, + "loss": 1.3874, + "step": 7815 + }, + { + "epoch": 0.05, + "grad_norm": 5.188118711526927, + "learning_rate": 1.999141973803787e-06, + "loss": 1.5923, + "step": 7816 + }, + { + "epoch": 0.05, + "grad_norm": 4.170732677350809, + "learning_rate": 1.9991417539835478e-06, + "loss": 1.2578, + "step": 7817 + }, + { + "epoch": 0.05, + "grad_norm": 4.452852094348596, + "learning_rate": 1.999141534135166e-06, + "loss": 1.2733, + "step": 7818 + }, + { + "epoch": 0.05, + "grad_norm": 4.465874097236842, + "learning_rate": 1.9991413142586417e-06, + "loss": 1.3696, + "step": 7819 + }, + { + "epoch": 0.05, + "grad_norm": 4.375335811664349, + "learning_rate": 1.999141094353975e-06, + "loss": 1.2605, + "step": 7820 + }, + { + "epoch": 0.05, + "grad_norm": 4.88899512779934, + "learning_rate": 1.999140874421166e-06, + "loss": 1.4881, + "step": 7821 + }, + { + "epoch": 0.05, + "grad_norm": 6.507944758478165, + "learning_rate": 1.999140654460214e-06, + "loss": 1.3324, + "step": 7822 + }, + { + "epoch": 0.05, + "grad_norm": 5.161837898418599, + "learning_rate": 1.99914043447112e-06, + "loss": 1.2969, + "step": 7823 + }, + { + "epoch": 0.05, + "grad_norm": 5.5483531089027265, + "learning_rate": 1.9991402144538835e-06, + "loss": 1.338, + "step": 7824 + }, + { + "epoch": 0.05, + "grad_norm": 4.317890046692939, + "learning_rate": 1.999139994408504e-06, + "loss": 1.3808, + "step": 7825 + }, + { + "epoch": 0.05, + "grad_norm": 4.85742461700556, + "learning_rate": 1.9991397743349826e-06, + "loss": 1.2791, + "step": 7826 + }, + { + "epoch": 0.05, + "grad_norm": 4.251258612356232, + "learning_rate": 1.999139554233319e-06, + "loss": 1.3822, + "step": 7827 + }, + { + "epoch": 0.05, + "grad_norm": 4.433372185310212, + "learning_rate": 1.9991393341035123e-06, + "loss": 1.3458, + "step": 7828 + }, + { + "epoch": 0.05, + "grad_norm": 5.56654917233872, + "learning_rate": 1.9991391139455636e-06, + "loss": 1.3783, + "step": 7829 + }, + { + "epoch": 0.05, + "grad_norm": 5.79917965548318, + "learning_rate": 1.9991388937594726e-06, + "loss": 1.4272, + "step": 7830 + }, + { + "epoch": 0.05, + "grad_norm": 4.4833060398499835, + "learning_rate": 1.9991386735452385e-06, + "loss": 1.423, + "step": 7831 + }, + { + "epoch": 0.05, + "grad_norm": 4.423563714754524, + "learning_rate": 1.9991384533028627e-06, + "loss": 1.4966, + "step": 7832 + }, + { + "epoch": 0.05, + "grad_norm": 4.338117109069359, + "learning_rate": 1.999138233032344e-06, + "loss": 1.3705, + "step": 7833 + }, + { + "epoch": 0.05, + "grad_norm": 4.75332256939269, + "learning_rate": 1.9991380127336833e-06, + "loss": 1.3029, + "step": 7834 + }, + { + "epoch": 0.05, + "grad_norm": 3.9882628565471494, + "learning_rate": 1.99913779240688e-06, + "loss": 1.2201, + "step": 7835 + }, + { + "epoch": 0.05, + "grad_norm": 4.411243434288691, + "learning_rate": 1.999137572051934e-06, + "loss": 1.419, + "step": 7836 + }, + { + "epoch": 0.05, + "grad_norm": 4.215995318173886, + "learning_rate": 1.999137351668846e-06, + "loss": 1.3908, + "step": 7837 + }, + { + "epoch": 0.05, + "grad_norm": 4.679897615810644, + "learning_rate": 1.9991371312576155e-06, + "loss": 1.4329, + "step": 7838 + }, + { + "epoch": 0.05, + "grad_norm": 6.274822122131257, + "learning_rate": 1.999136910818243e-06, + "loss": 1.2731, + "step": 7839 + }, + { + "epoch": 0.05, + "grad_norm": 4.444364596853928, + "learning_rate": 1.9991366903507275e-06, + "loss": 1.2914, + "step": 7840 + }, + { + "epoch": 0.05, + "grad_norm": 4.375169619321004, + "learning_rate": 1.99913646985507e-06, + "loss": 1.4331, + "step": 7841 + }, + { + "epoch": 0.05, + "grad_norm": 4.667044912199263, + "learning_rate": 1.99913624933127e-06, + "loss": 1.3761, + "step": 7842 + }, + { + "epoch": 0.05, + "grad_norm": 5.038962012585465, + "learning_rate": 1.999136028779328e-06, + "loss": 1.3423, + "step": 7843 + }, + { + "epoch": 0.05, + "grad_norm": 5.001327055835487, + "learning_rate": 1.9991358081992437e-06, + "loss": 1.3895, + "step": 7844 + }, + { + "epoch": 0.05, + "grad_norm": 5.02449408586461, + "learning_rate": 1.9991355875910165e-06, + "loss": 1.2673, + "step": 7845 + }, + { + "epoch": 0.05, + "grad_norm": 4.581799067890731, + "learning_rate": 1.9991353669546474e-06, + "loss": 1.3802, + "step": 7846 + }, + { + "epoch": 0.05, + "grad_norm": 4.560127091757449, + "learning_rate": 1.9991351462901357e-06, + "loss": 1.3478, + "step": 7847 + }, + { + "epoch": 0.05, + "grad_norm": 4.761668275999001, + "learning_rate": 1.999134925597482e-06, + "loss": 1.3887, + "step": 7848 + }, + { + "epoch": 0.05, + "grad_norm": 4.451504166455566, + "learning_rate": 1.9991347048766856e-06, + "loss": 1.3024, + "step": 7849 + }, + { + "epoch": 0.05, + "grad_norm": 5.440109975467861, + "learning_rate": 1.999134484127747e-06, + "loss": 1.4257, + "step": 7850 + }, + { + "epoch": 0.05, + "grad_norm": 5.141436439016143, + "learning_rate": 1.999134263350667e-06, + "loss": 1.3099, + "step": 7851 + }, + { + "epoch": 0.05, + "grad_norm": 4.860670820133751, + "learning_rate": 1.9991340425454436e-06, + "loss": 1.3657, + "step": 7852 + }, + { + "epoch": 0.05, + "grad_norm": 4.448613419574616, + "learning_rate": 1.9991338217120784e-06, + "loss": 1.3957, + "step": 7853 + }, + { + "epoch": 0.05, + "grad_norm": 4.671247219573718, + "learning_rate": 1.9991336008505705e-06, + "loss": 1.3791, + "step": 7854 + }, + { + "epoch": 0.05, + "grad_norm": 4.959614105606823, + "learning_rate": 1.999133379960921e-06, + "loss": 1.2945, + "step": 7855 + }, + { + "epoch": 0.05, + "grad_norm": 4.468128058062564, + "learning_rate": 1.9991331590431285e-06, + "loss": 1.3073, + "step": 7856 + }, + { + "epoch": 0.05, + "grad_norm": 5.291204533016882, + "learning_rate": 1.9991329380971944e-06, + "loss": 1.3787, + "step": 7857 + }, + { + "epoch": 0.05, + "grad_norm": 4.400267620252943, + "learning_rate": 1.9991327171231176e-06, + "loss": 1.4282, + "step": 7858 + }, + { + "epoch": 0.05, + "grad_norm": 5.033168757191306, + "learning_rate": 1.9991324961208985e-06, + "loss": 1.362, + "step": 7859 + }, + { + "epoch": 0.05, + "grad_norm": 4.515822451343939, + "learning_rate": 1.9991322750905376e-06, + "loss": 1.2719, + "step": 7860 + }, + { + "epoch": 0.05, + "grad_norm": 4.9268503948497395, + "learning_rate": 1.999132054032034e-06, + "loss": 1.3558, + "step": 7861 + }, + { + "epoch": 0.05, + "grad_norm": 8.60209526657174, + "learning_rate": 1.9991318329453886e-06, + "loss": 1.3861, + "step": 7862 + }, + { + "epoch": 0.05, + "grad_norm": 4.697740151769331, + "learning_rate": 1.9991316118306006e-06, + "loss": 1.3295, + "step": 7863 + }, + { + "epoch": 0.05, + "grad_norm": 4.741101177749595, + "learning_rate": 1.9991313906876707e-06, + "loss": 1.496, + "step": 7864 + }, + { + "epoch": 0.05, + "grad_norm": 4.500865273629475, + "learning_rate": 1.9991311695165986e-06, + "loss": 1.5159, + "step": 7865 + }, + { + "epoch": 0.05, + "grad_norm": 4.083464890835658, + "learning_rate": 1.999130948317384e-06, + "loss": 1.4502, + "step": 7866 + }, + { + "epoch": 0.05, + "grad_norm": 4.420747557058192, + "learning_rate": 1.999130727090027e-06, + "loss": 1.2925, + "step": 7867 + }, + { + "epoch": 0.05, + "grad_norm": 5.135998349106828, + "learning_rate": 1.9991305058345283e-06, + "loss": 1.4852, + "step": 7868 + }, + { + "epoch": 0.05, + "grad_norm": 6.6955333591917965, + "learning_rate": 1.9991302845508876e-06, + "loss": 1.4416, + "step": 7869 + }, + { + "epoch": 0.05, + "grad_norm": 4.72760677588766, + "learning_rate": 1.9991300632391042e-06, + "loss": 1.3802, + "step": 7870 + }, + { + "epoch": 0.05, + "grad_norm": 4.420290122651671, + "learning_rate": 1.999129841899179e-06, + "loss": 1.2709, + "step": 7871 + }, + { + "epoch": 0.05, + "grad_norm": 4.333308736928675, + "learning_rate": 1.9991296205311116e-06, + "loss": 1.4473, + "step": 7872 + }, + { + "epoch": 0.05, + "grad_norm": 4.383603976083784, + "learning_rate": 1.999129399134902e-06, + "loss": 1.3902, + "step": 7873 + }, + { + "epoch": 0.05, + "grad_norm": 4.371937519098825, + "learning_rate": 1.99912917771055e-06, + "loss": 1.299, + "step": 7874 + }, + { + "epoch": 0.05, + "grad_norm": 4.700315092913012, + "learning_rate": 1.999128956258056e-06, + "loss": 1.3857, + "step": 7875 + }, + { + "epoch": 0.05, + "grad_norm": 4.189280015488588, + "learning_rate": 1.99912873477742e-06, + "loss": 1.2397, + "step": 7876 + }, + { + "epoch": 0.05, + "grad_norm": 4.220725041703095, + "learning_rate": 1.9991285132686416e-06, + "loss": 1.3956, + "step": 7877 + }, + { + "epoch": 0.05, + "grad_norm": 4.859350391913755, + "learning_rate": 1.999128291731721e-06, + "loss": 1.4487, + "step": 7878 + }, + { + "epoch": 0.05, + "grad_norm": 4.401864852624024, + "learning_rate": 1.999128070166659e-06, + "loss": 1.4258, + "step": 7879 + }, + { + "epoch": 0.05, + "grad_norm": 4.44397745741506, + "learning_rate": 1.9991278485734543e-06, + "loss": 1.3566, + "step": 7880 + }, + { + "epoch": 0.05, + "grad_norm": 4.91309823898804, + "learning_rate": 1.9991276269521075e-06, + "loss": 1.3444, + "step": 7881 + }, + { + "epoch": 0.05, + "grad_norm": 4.667527704893879, + "learning_rate": 1.9991274053026184e-06, + "loss": 1.3744, + "step": 7882 + }, + { + "epoch": 0.05, + "grad_norm": 4.309837037247037, + "learning_rate": 1.9991271836249876e-06, + "loss": 1.2796, + "step": 7883 + }, + { + "epoch": 0.05, + "grad_norm": 4.381518346415893, + "learning_rate": 1.999126961919215e-06, + "loss": 1.5014, + "step": 7884 + }, + { + "epoch": 0.05, + "eval_loss": 1.5658948421478271, + "eval_runtime": 4.6115, + "eval_samples_per_second": 1.952, + "eval_steps_per_second": 1.084, + "step": 7884 + }, + { + "epoch": 0.05, + "grad_norm": 4.283714767175808, + "learning_rate": 1.9991267401852995e-06, + "loss": 1.2823, + "step": 7885 + }, + { + "epoch": 0.05, + "grad_norm": 4.549574675248714, + "learning_rate": 1.9991265184232424e-06, + "loss": 1.24, + "step": 7886 + }, + { + "epoch": 0.05, + "grad_norm": 4.479672952540066, + "learning_rate": 1.9991262966330434e-06, + "loss": 1.3989, + "step": 7887 + }, + { + "epoch": 0.05, + "grad_norm": 10.377616324433669, + "learning_rate": 1.9991260748147017e-06, + "loss": 1.5313, + "step": 7888 + }, + { + "epoch": 0.05, + "grad_norm": 6.573317168040383, + "learning_rate": 1.9991258529682186e-06, + "loss": 1.4143, + "step": 7889 + }, + { + "epoch": 0.05, + "grad_norm": 4.8446186121205495, + "learning_rate": 1.9991256310935933e-06, + "loss": 1.5735, + "step": 7890 + }, + { + "epoch": 0.05, + "grad_norm": 4.757972434994108, + "learning_rate": 1.9991254091908257e-06, + "loss": 1.4053, + "step": 7891 + }, + { + "epoch": 0.05, + "grad_norm": 4.367465286261333, + "learning_rate": 1.9991251872599163e-06, + "loss": 1.4348, + "step": 7892 + }, + { + "epoch": 0.05, + "grad_norm": 4.550739875935156, + "learning_rate": 1.9991249653008647e-06, + "loss": 1.393, + "step": 7893 + }, + { + "epoch": 0.05, + "grad_norm": 4.936416661660528, + "learning_rate": 1.9991247433136713e-06, + "loss": 1.3581, + "step": 7894 + }, + { + "epoch": 0.05, + "grad_norm": 4.614387854754647, + "learning_rate": 1.9991245212983356e-06, + "loss": 1.3308, + "step": 7895 + }, + { + "epoch": 0.05, + "grad_norm": 4.41145194509055, + "learning_rate": 1.999124299254858e-06, + "loss": 1.3587, + "step": 7896 + }, + { + "epoch": 0.05, + "grad_norm": 5.231837605617733, + "learning_rate": 1.9991240771832387e-06, + "loss": 1.29, + "step": 7897 + }, + { + "epoch": 0.05, + "grad_norm": 4.447012506572734, + "learning_rate": 1.9991238550834767e-06, + "loss": 1.3812, + "step": 7898 + }, + { + "epoch": 0.05, + "grad_norm": 4.61486671530214, + "learning_rate": 1.9991236329555732e-06, + "loss": 1.3637, + "step": 7899 + }, + { + "epoch": 0.05, + "grad_norm": 4.420749615143057, + "learning_rate": 1.9991234107995276e-06, + "loss": 1.3119, + "step": 7900 + }, + { + "epoch": 0.05, + "grad_norm": 4.307335991095319, + "learning_rate": 1.99912318861534e-06, + "loss": 1.1966, + "step": 7901 + }, + { + "epoch": 0.05, + "grad_norm": 6.507288920172387, + "learning_rate": 1.9991229664030104e-06, + "loss": 1.3704, + "step": 7902 + }, + { + "epoch": 0.05, + "grad_norm": 4.7613792064088045, + "learning_rate": 1.999122744162539e-06, + "loss": 1.4332, + "step": 7903 + }, + { + "epoch": 0.05, + "grad_norm": 4.863190180472665, + "learning_rate": 1.9991225218939254e-06, + "loss": 1.2518, + "step": 7904 + }, + { + "epoch": 0.05, + "grad_norm": 4.493712066053234, + "learning_rate": 1.99912229959717e-06, + "loss": 1.2204, + "step": 7905 + }, + { + "epoch": 0.05, + "grad_norm": 4.375741940371645, + "learning_rate": 1.9991220772722724e-06, + "loss": 1.4047, + "step": 7906 + }, + { + "epoch": 0.05, + "grad_norm": 4.295892023382768, + "learning_rate": 1.999121854919233e-06, + "loss": 1.3851, + "step": 7907 + }, + { + "epoch": 0.05, + "grad_norm": 4.597007234903246, + "learning_rate": 1.999121632538052e-06, + "loss": 1.3999, + "step": 7908 + }, + { + "epoch": 0.05, + "grad_norm": 5.881100502163143, + "learning_rate": 1.9991214101287287e-06, + "loss": 1.3936, + "step": 7909 + }, + { + "epoch": 0.05, + "grad_norm": 4.350752514009014, + "learning_rate": 1.9991211876912635e-06, + "loss": 1.2896, + "step": 7910 + }, + { + "epoch": 0.05, + "grad_norm": 5.220851368581993, + "learning_rate": 1.9991209652256565e-06, + "loss": 1.4606, + "step": 7911 + }, + { + "epoch": 0.05, + "grad_norm": 4.796976254952187, + "learning_rate": 1.9991207427319073e-06, + "loss": 1.6258, + "step": 7912 + }, + { + "epoch": 0.05, + "grad_norm": 4.768419080865092, + "learning_rate": 1.9991205202100167e-06, + "loss": 1.3625, + "step": 7913 + }, + { + "epoch": 0.05, + "grad_norm": 4.233874005716375, + "learning_rate": 1.999120297659984e-06, + "loss": 1.3188, + "step": 7914 + }, + { + "epoch": 0.05, + "grad_norm": 4.177645778158895, + "learning_rate": 1.999120075081809e-06, + "loss": 1.2694, + "step": 7915 + }, + { + "epoch": 0.05, + "grad_norm": 4.5435422081969445, + "learning_rate": 1.9991198524754926e-06, + "loss": 1.3417, + "step": 7916 + }, + { + "epoch": 0.05, + "grad_norm": 5.641311379320699, + "learning_rate": 1.9991196298410343e-06, + "loss": 1.4612, + "step": 7917 + }, + { + "epoch": 0.05, + "grad_norm": 4.410378331233864, + "learning_rate": 1.9991194071784337e-06, + "loss": 1.2973, + "step": 7918 + }, + { + "epoch": 0.05, + "grad_norm": 4.810443179316793, + "learning_rate": 1.9991191844876917e-06, + "loss": 1.3293, + "step": 7919 + }, + { + "epoch": 0.05, + "grad_norm": 10.2524967534719, + "learning_rate": 1.999118961768808e-06, + "loss": 0.9615, + "step": 7920 + }, + { + "epoch": 0.05, + "grad_norm": 5.186281083510213, + "learning_rate": 1.999118739021782e-06, + "loss": 1.3054, + "step": 7921 + }, + { + "epoch": 0.05, + "grad_norm": 4.40307733933914, + "learning_rate": 1.999118516246614e-06, + "loss": 1.3466, + "step": 7922 + }, + { + "epoch": 0.05, + "grad_norm": 4.573887530740586, + "learning_rate": 1.9991182934433042e-06, + "loss": 1.4167, + "step": 7923 + }, + { + "epoch": 0.05, + "grad_norm": 4.336000923774887, + "learning_rate": 1.9991180706118527e-06, + "loss": 1.2728, + "step": 7924 + }, + { + "epoch": 0.05, + "grad_norm": 4.945110717150292, + "learning_rate": 1.9991178477522594e-06, + "loss": 1.4016, + "step": 7925 + }, + { + "epoch": 0.05, + "grad_norm": 5.166560502236018, + "learning_rate": 1.9991176248645246e-06, + "loss": 1.4965, + "step": 7926 + }, + { + "epoch": 0.05, + "grad_norm": 5.361818264778021, + "learning_rate": 1.9991174019486476e-06, + "loss": 1.4443, + "step": 7927 + }, + { + "epoch": 0.05, + "grad_norm": 4.7580799872674255, + "learning_rate": 1.999117179004629e-06, + "loss": 1.3874, + "step": 7928 + }, + { + "epoch": 0.05, + "grad_norm": 4.317283836336687, + "learning_rate": 1.999116956032468e-06, + "loss": 1.3588, + "step": 7929 + }, + { + "epoch": 0.05, + "grad_norm": 5.746489083705707, + "learning_rate": 1.9991167330321657e-06, + "loss": 1.5891, + "step": 7930 + }, + { + "epoch": 0.05, + "grad_norm": 4.449830229945385, + "learning_rate": 1.999116510003722e-06, + "loss": 1.3317, + "step": 7931 + }, + { + "epoch": 0.05, + "grad_norm": 5.924329193284593, + "learning_rate": 1.9991162869471358e-06, + "loss": 1.3648, + "step": 7932 + }, + { + "epoch": 0.05, + "grad_norm": 4.338714744766864, + "learning_rate": 1.999116063862408e-06, + "loss": 1.4679, + "step": 7933 + }, + { + "epoch": 0.05, + "grad_norm": 4.252926058271161, + "learning_rate": 1.9991158407495385e-06, + "loss": 1.2192, + "step": 7934 + }, + { + "epoch": 0.05, + "grad_norm": 4.681644481339857, + "learning_rate": 1.9991156176085274e-06, + "loss": 1.348, + "step": 7935 + }, + { + "epoch": 0.05, + "grad_norm": 4.811285298925873, + "learning_rate": 1.999115394439374e-06, + "loss": 1.44, + "step": 7936 + }, + { + "epoch": 0.05, + "grad_norm": 5.019647848894639, + "learning_rate": 1.999115171242079e-06, + "loss": 1.4317, + "step": 7937 + }, + { + "epoch": 0.05, + "grad_norm": 4.307822714294956, + "learning_rate": 1.999114948016643e-06, + "loss": 1.5175, + "step": 7938 + }, + { + "epoch": 0.05, + "grad_norm": 4.659200799819313, + "learning_rate": 1.9991147247630646e-06, + "loss": 1.5738, + "step": 7939 + }, + { + "epoch": 0.05, + "grad_norm": 4.323585584812524, + "learning_rate": 1.9991145014813447e-06, + "loss": 1.2841, + "step": 7940 + }, + { + "epoch": 0.05, + "grad_norm": 5.877089498727992, + "learning_rate": 1.9991142781714826e-06, + "loss": 1.2776, + "step": 7941 + }, + { + "epoch": 0.05, + "grad_norm": 4.278507062876658, + "learning_rate": 1.9991140548334796e-06, + "loss": 1.2802, + "step": 7942 + }, + { + "epoch": 0.05, + "grad_norm": 4.628730622051179, + "learning_rate": 1.9991138314673343e-06, + "loss": 1.1402, + "step": 7943 + }, + { + "epoch": 0.05, + "grad_norm": 7.448035146551721, + "learning_rate": 1.999113608073047e-06, + "loss": 1.3304, + "step": 7944 + }, + { + "epoch": 0.05, + "grad_norm": 9.517121551044305, + "learning_rate": 1.9991133846506186e-06, + "loss": 1.2432, + "step": 7945 + }, + { + "epoch": 0.05, + "grad_norm": 4.769881942858478, + "learning_rate": 1.9991131612000483e-06, + "loss": 1.5942, + "step": 7946 + }, + { + "epoch": 0.05, + "grad_norm": 4.697333913534161, + "learning_rate": 1.999112937721336e-06, + "loss": 1.2304, + "step": 7947 + }, + { + "epoch": 0.05, + "grad_norm": 5.007259309617801, + "learning_rate": 1.9991127142144825e-06, + "loss": 1.3207, + "step": 7948 + }, + { + "epoch": 0.05, + "grad_norm": 6.044983166732241, + "learning_rate": 1.999112490679487e-06, + "loss": 1.3671, + "step": 7949 + }, + { + "epoch": 0.05, + "grad_norm": 4.8740643030418465, + "learning_rate": 1.99911226711635e-06, + "loss": 1.4998, + "step": 7950 + }, + { + "epoch": 0.05, + "grad_norm": 4.497670011244053, + "learning_rate": 1.9991120435250713e-06, + "loss": 1.3662, + "step": 7951 + }, + { + "epoch": 0.05, + "grad_norm": 4.482631661738367, + "learning_rate": 1.999111819905651e-06, + "loss": 1.4118, + "step": 7952 + }, + { + "epoch": 0.05, + "grad_norm": 4.51675397584892, + "learning_rate": 1.999111596258089e-06, + "loss": 1.3852, + "step": 7953 + }, + { + "epoch": 0.05, + "grad_norm": 4.19239560401668, + "learning_rate": 1.9991113725823854e-06, + "loss": 1.1872, + "step": 7954 + }, + { + "epoch": 0.05, + "grad_norm": 4.299916242407997, + "learning_rate": 1.99911114887854e-06, + "loss": 1.3695, + "step": 7955 + }, + { + "epoch": 0.05, + "grad_norm": 4.81532161685942, + "learning_rate": 1.999110925146553e-06, + "loss": 1.3898, + "step": 7956 + }, + { + "epoch": 0.05, + "grad_norm": 4.507899544159196, + "learning_rate": 1.9991107013864243e-06, + "loss": 1.365, + "step": 7957 + }, + { + "epoch": 0.05, + "eval_loss": 1.5656800270080566, + "eval_runtime": 4.5989, + "eval_samples_per_second": 1.957, + "eval_steps_per_second": 1.087, + "step": 7957 + }, + { + "epoch": 0.05, + "grad_norm": 5.468155035828772, + "learning_rate": 1.999110477598154e-06, + "loss": 1.4633, + "step": 7958 + }, + { + "epoch": 0.05, + "grad_norm": 4.2164986604969865, + "learning_rate": 1.9991102537817422e-06, + "loss": 1.3522, + "step": 7959 + }, + { + "epoch": 0.05, + "grad_norm": 4.822052602668897, + "learning_rate": 1.9991100299371885e-06, + "loss": 1.5062, + "step": 7960 + }, + { + "epoch": 0.05, + "grad_norm": 4.428624494794631, + "learning_rate": 1.9991098060644933e-06, + "loss": 1.4166, + "step": 7961 + }, + { + "epoch": 0.05, + "grad_norm": 4.444161894170243, + "learning_rate": 1.999109582163657e-06, + "loss": 1.3916, + "step": 7962 + }, + { + "epoch": 0.05, + "grad_norm": 4.27619179863361, + "learning_rate": 1.9991093582346784e-06, + "loss": 1.2963, + "step": 7963 + }, + { + "epoch": 0.05, + "grad_norm": 5.723855075709038, + "learning_rate": 1.9991091342775587e-06, + "loss": 1.4435, + "step": 7964 + }, + { + "epoch": 0.05, + "grad_norm": 4.656683993322921, + "learning_rate": 1.999108910292297e-06, + "loss": 1.4645, + "step": 7965 + }, + { + "epoch": 0.05, + "grad_norm": 5.534865508099635, + "learning_rate": 1.999108686278894e-06, + "loss": 1.3887, + "step": 7966 + }, + { + "epoch": 0.05, + "grad_norm": 4.602022263169227, + "learning_rate": 1.9991084622373497e-06, + "loss": 1.4518, + "step": 7967 + }, + { + "epoch": 0.05, + "grad_norm": 5.004070186843401, + "learning_rate": 1.9991082381676635e-06, + "loss": 1.277, + "step": 7968 + }, + { + "epoch": 0.05, + "grad_norm": 4.458295029714002, + "learning_rate": 1.999108014069836e-06, + "loss": 1.3586, + "step": 7969 + }, + { + "epoch": 0.05, + "grad_norm": 4.511494213746791, + "learning_rate": 1.999107789943867e-06, + "loss": 1.3855, + "step": 7970 + }, + { + "epoch": 0.05, + "grad_norm": 3.955287824354757, + "learning_rate": 1.999107565789756e-06, + "loss": 1.2679, + "step": 7971 + }, + { + "epoch": 0.05, + "grad_norm": 4.182679028897944, + "learning_rate": 1.9991073416075034e-06, + "loss": 1.2102, + "step": 7972 + }, + { + "epoch": 0.05, + "grad_norm": 4.612797814274257, + "learning_rate": 1.9991071173971093e-06, + "loss": 1.0973, + "step": 7973 + }, + { + "epoch": 0.05, + "grad_norm": 4.556509974545994, + "learning_rate": 1.999106893158574e-06, + "loss": 1.4159, + "step": 7974 + }, + { + "epoch": 0.05, + "grad_norm": 4.744357186226214, + "learning_rate": 1.999106668891897e-06, + "loss": 1.4103, + "step": 7975 + }, + { + "epoch": 0.05, + "grad_norm": 5.116167986576523, + "learning_rate": 1.9991064445970787e-06, + "loss": 1.5958, + "step": 7976 + }, + { + "epoch": 0.05, + "grad_norm": 4.751589784125765, + "learning_rate": 1.999106220274119e-06, + "loss": 1.4511, + "step": 7977 + }, + { + "epoch": 0.05, + "grad_norm": 7.532163283927442, + "learning_rate": 1.9991059959230176e-06, + "loss": 1.4321, + "step": 7978 + }, + { + "epoch": 0.05, + "grad_norm": 4.218111466240033, + "learning_rate": 1.9991057715437743e-06, + "loss": 1.2408, + "step": 7979 + }, + { + "epoch": 0.05, + "grad_norm": 4.907482698351131, + "learning_rate": 1.99910554713639e-06, + "loss": 1.3935, + "step": 7980 + }, + { + "epoch": 0.05, + "grad_norm": 4.294816998411919, + "learning_rate": 1.9991053227008644e-06, + "loss": 1.2959, + "step": 7981 + }, + { + "epoch": 0.05, + "grad_norm": 4.583837137387118, + "learning_rate": 1.999105098237197e-06, + "loss": 1.2133, + "step": 7982 + }, + { + "epoch": 0.05, + "grad_norm": 4.710318921218467, + "learning_rate": 1.999104873745388e-06, + "loss": 1.2935, + "step": 7983 + }, + { + "epoch": 0.05, + "grad_norm": 5.0482056465076734, + "learning_rate": 1.9991046492254377e-06, + "loss": 1.3708, + "step": 7984 + }, + { + "epoch": 0.05, + "grad_norm": 4.492323569193223, + "learning_rate": 1.999104424677346e-06, + "loss": 1.4503, + "step": 7985 + }, + { + "epoch": 0.05, + "grad_norm": 6.54824753415298, + "learning_rate": 1.999104200101113e-06, + "loss": 1.3438, + "step": 7986 + }, + { + "epoch": 0.05, + "grad_norm": 5.502040850443269, + "learning_rate": 1.9991039754967384e-06, + "loss": 1.4879, + "step": 7987 + }, + { + "epoch": 0.05, + "grad_norm": 4.6823869156823354, + "learning_rate": 1.999103750864222e-06, + "loss": 1.3546, + "step": 7988 + }, + { + "epoch": 0.05, + "grad_norm": 4.368056132760108, + "learning_rate": 1.9991035262035644e-06, + "loss": 1.3031, + "step": 7989 + }, + { + "epoch": 0.05, + "grad_norm": 4.6645893279900275, + "learning_rate": 1.9991033015147657e-06, + "loss": 1.5497, + "step": 7990 + }, + { + "epoch": 0.05, + "grad_norm": 4.542573730753874, + "learning_rate": 1.999103076797825e-06, + "loss": 1.3482, + "step": 7991 + }, + { + "epoch": 0.05, + "grad_norm": 5.665709613631178, + "learning_rate": 1.9991028520527437e-06, + "loss": 1.2485, + "step": 7992 + }, + { + "epoch": 0.05, + "grad_norm": 4.990763824548995, + "learning_rate": 1.9991026272795204e-06, + "loss": 1.4217, + "step": 7993 + }, + { + "epoch": 0.05, + "grad_norm": 4.967378873268701, + "learning_rate": 1.9991024024781557e-06, + "loss": 1.3438, + "step": 7994 + }, + { + "epoch": 0.05, + "grad_norm": 4.3785765666543845, + "learning_rate": 1.99910217764865e-06, + "loss": 1.264, + "step": 7995 + }, + { + "epoch": 0.05, + "grad_norm": 4.184943661373519, + "learning_rate": 1.9991019527910025e-06, + "loss": 1.2377, + "step": 7996 + }, + { + "epoch": 0.05, + "grad_norm": 6.05297203094123, + "learning_rate": 1.999101727905214e-06, + "loss": 1.4009, + "step": 7997 + }, + { + "epoch": 0.05, + "grad_norm": 4.199034736763347, + "learning_rate": 1.9991015029912837e-06, + "loss": 1.2605, + "step": 7998 + }, + { + "epoch": 0.05, + "grad_norm": 4.911575981094752, + "learning_rate": 1.9991012780492124e-06, + "loss": 1.3555, + "step": 7999 + }, + { + "epoch": 0.05, + "grad_norm": 5.486350257805041, + "learning_rate": 1.9991010530789998e-06, + "loss": 1.3395, + "step": 8000 + }, + { + "epoch": 0.05, + "grad_norm": 6.805556361768766, + "learning_rate": 1.9991008280806453e-06, + "loss": 1.21, + "step": 8001 + }, + { + "epoch": 0.05, + "grad_norm": 4.78064675158563, + "learning_rate": 1.9991006030541498e-06, + "loss": 1.383, + "step": 8002 + }, + { + "epoch": 0.05, + "grad_norm": 4.3981583216140905, + "learning_rate": 1.999100377999513e-06, + "loss": 1.4246, + "step": 8003 + }, + { + "epoch": 0.05, + "grad_norm": 5.032638384032344, + "learning_rate": 1.999100152916735e-06, + "loss": 1.3025, + "step": 8004 + }, + { + "epoch": 0.05, + "grad_norm": 4.482588881582473, + "learning_rate": 1.9990999278058154e-06, + "loss": 1.3108, + "step": 8005 + }, + { + "epoch": 0.05, + "grad_norm": 4.962625365251077, + "learning_rate": 1.9990997026667547e-06, + "loss": 1.3236, + "step": 8006 + }, + { + "epoch": 0.05, + "grad_norm": 5.180179909877052, + "learning_rate": 1.9990994774995522e-06, + "loss": 1.3536, + "step": 8007 + }, + { + "epoch": 0.05, + "grad_norm": 4.802706010129838, + "learning_rate": 1.999099252304209e-06, + "loss": 1.4625, + "step": 8008 + }, + { + "epoch": 0.05, + "grad_norm": 5.110185539558003, + "learning_rate": 1.9990990270807244e-06, + "loss": 1.3365, + "step": 8009 + }, + { + "epoch": 0.05, + "grad_norm": 4.422397453228428, + "learning_rate": 1.999098801829098e-06, + "loss": 1.3289, + "step": 8010 + }, + { + "epoch": 0.05, + "grad_norm": 4.332316772085707, + "learning_rate": 1.999098576549331e-06, + "loss": 1.3325, + "step": 8011 + }, + { + "epoch": 0.05, + "grad_norm": 4.458263958095114, + "learning_rate": 1.9990983512414223e-06, + "loss": 1.2976, + "step": 8012 + }, + { + "epoch": 0.05, + "grad_norm": 4.7067448065935835, + "learning_rate": 1.9990981259053723e-06, + "loss": 1.4172, + "step": 8013 + }, + { + "epoch": 0.05, + "grad_norm": 4.782644696484535, + "learning_rate": 1.999097900541181e-06, + "loss": 1.5318, + "step": 8014 + }, + { + "epoch": 0.05, + "grad_norm": 5.839935554073177, + "learning_rate": 1.9990976751488484e-06, + "loss": 1.285, + "step": 8015 + }, + { + "epoch": 0.05, + "grad_norm": 4.659889508150192, + "learning_rate": 1.9990974497283747e-06, + "loss": 1.4118, + "step": 8016 + }, + { + "epoch": 0.05, + "grad_norm": 4.416726312842264, + "learning_rate": 1.99909722427976e-06, + "loss": 1.4156, + "step": 8017 + }, + { + "epoch": 0.05, + "grad_norm": 4.644568718442651, + "learning_rate": 1.9990969988030037e-06, + "loss": 1.3075, + "step": 8018 + }, + { + "epoch": 0.05, + "grad_norm": 4.8281507216630555, + "learning_rate": 1.999096773298106e-06, + "loss": 1.3449, + "step": 8019 + }, + { + "epoch": 0.05, + "grad_norm": 4.261762514200759, + "learning_rate": 1.9990965477650676e-06, + "loss": 1.3258, + "step": 8020 + }, + { + "epoch": 0.05, + "grad_norm": 4.683571660693128, + "learning_rate": 1.9990963222038877e-06, + "loss": 1.1711, + "step": 8021 + }, + { + "epoch": 0.05, + "grad_norm": 4.613190518882954, + "learning_rate": 1.9990960966145667e-06, + "loss": 1.45, + "step": 8022 + }, + { + "epoch": 0.05, + "grad_norm": 4.498647667229803, + "learning_rate": 1.999095870997104e-06, + "loss": 1.3781, + "step": 8023 + }, + { + "epoch": 0.05, + "grad_norm": 4.843641654557754, + "learning_rate": 1.9990956453515007e-06, + "loss": 1.3764, + "step": 8024 + }, + { + "epoch": 0.05, + "grad_norm": 7.69355776374498, + "learning_rate": 1.999095419677756e-06, + "loss": 1.3676, + "step": 8025 + }, + { + "epoch": 0.05, + "grad_norm": 4.866485525806812, + "learning_rate": 1.99909519397587e-06, + "loss": 1.3919, + "step": 8026 + }, + { + "epoch": 0.05, + "grad_norm": 4.839147512196882, + "learning_rate": 1.999094968245843e-06, + "loss": 1.417, + "step": 8027 + }, + { + "epoch": 0.05, + "grad_norm": 4.822528422916652, + "learning_rate": 1.999094742487675e-06, + "loss": 1.4444, + "step": 8028 + }, + { + "epoch": 0.05, + "grad_norm": 4.54834627752157, + "learning_rate": 1.999094516701365e-06, + "loss": 1.4076, + "step": 8029 + }, + { + "epoch": 0.05, + "grad_norm": 4.852623418740598, + "learning_rate": 1.9990942908869145e-06, + "loss": 1.3949, + "step": 8030 + }, + { + "epoch": 0.05, + "eval_loss": 1.563845157623291, + "eval_runtime": 4.6163, + "eval_samples_per_second": 1.95, + "eval_steps_per_second": 1.083, + "step": 8030 + }, + { + "epoch": 0.05, + "grad_norm": 4.770142324284321, + "learning_rate": 1.9990940650443227e-06, + "loss": 1.3439, + "step": 8031 + }, + { + "epoch": 0.05, + "grad_norm": 4.425790507467108, + "learning_rate": 1.99909383917359e-06, + "loss": 1.4299, + "step": 8032 + }, + { + "epoch": 0.05, + "grad_norm": 4.662183949613705, + "learning_rate": 1.9990936132747157e-06, + "loss": 1.3105, + "step": 8033 + }, + { + "epoch": 0.05, + "grad_norm": 4.524969193411602, + "learning_rate": 1.9990933873477e-06, + "loss": 1.2497, + "step": 8034 + }, + { + "epoch": 0.05, + "grad_norm": 4.710194129926366, + "learning_rate": 1.999093161392544e-06, + "loss": 1.5204, + "step": 8035 + }, + { + "epoch": 0.05, + "grad_norm": 4.416709929502793, + "learning_rate": 1.9990929354092465e-06, + "loss": 1.3901, + "step": 8036 + }, + { + "epoch": 0.05, + "grad_norm": 4.659000921428525, + "learning_rate": 1.9990927093978075e-06, + "loss": 1.3202, + "step": 8037 + }, + { + "epoch": 0.05, + "grad_norm": 4.837349181329828, + "learning_rate": 1.999092483358228e-06, + "loss": 1.3652, + "step": 8038 + }, + { + "epoch": 0.05, + "grad_norm": 4.490570955074692, + "learning_rate": 1.999092257290507e-06, + "loss": 1.5145, + "step": 8039 + }, + { + "epoch": 0.05, + "grad_norm": 4.672958586547291, + "learning_rate": 1.9990920311946453e-06, + "loss": 1.4583, + "step": 8040 + }, + { + "epoch": 0.05, + "grad_norm": 5.354063146841929, + "learning_rate": 1.999091805070642e-06, + "loss": 1.422, + "step": 8041 + }, + { + "epoch": 0.05, + "grad_norm": 4.199683664508161, + "learning_rate": 1.9990915789184977e-06, + "loss": 1.1644, + "step": 8042 + }, + { + "epoch": 0.05, + "grad_norm": 5.436254873694826, + "learning_rate": 1.9990913527382125e-06, + "loss": 1.3262, + "step": 8043 + }, + { + "epoch": 0.05, + "grad_norm": 4.9643955407837685, + "learning_rate": 1.999091126529786e-06, + "loss": 1.3082, + "step": 8044 + }, + { + "epoch": 0.05, + "grad_norm": 4.61008019829296, + "learning_rate": 1.9990909002932187e-06, + "loss": 1.3491, + "step": 8045 + }, + { + "epoch": 0.05, + "grad_norm": 4.3840310204567725, + "learning_rate": 1.99909067402851e-06, + "loss": 1.3867, + "step": 8046 + }, + { + "epoch": 0.05, + "grad_norm": 4.60962567518782, + "learning_rate": 1.9990904477356606e-06, + "loss": 1.3611, + "step": 8047 + }, + { + "epoch": 0.05, + "grad_norm": 4.535853222553209, + "learning_rate": 1.99909022141467e-06, + "loss": 1.3939, + "step": 8048 + }, + { + "epoch": 0.05, + "grad_norm": 4.937640684983281, + "learning_rate": 1.9990899950655386e-06, + "loss": 1.4775, + "step": 8049 + }, + { + "epoch": 0.05, + "grad_norm": 4.1642360975629025, + "learning_rate": 1.9990897686882657e-06, + "loss": 1.2731, + "step": 8050 + }, + { + "epoch": 0.05, + "grad_norm": 4.2612358008255615, + "learning_rate": 1.999089542282852e-06, + "loss": 1.1743, + "step": 8051 + }, + { + "epoch": 0.05, + "grad_norm": 4.258771985240577, + "learning_rate": 1.999089315849297e-06, + "loss": 1.3246, + "step": 8052 + }, + { + "epoch": 0.05, + "grad_norm": 4.376711858300126, + "learning_rate": 1.999089089387601e-06, + "loss": 1.4183, + "step": 8053 + }, + { + "epoch": 0.05, + "grad_norm": 5.72324447125782, + "learning_rate": 1.999088862897765e-06, + "loss": 1.281, + "step": 8054 + }, + { + "epoch": 0.05, + "grad_norm": 7.146245924107082, + "learning_rate": 1.9990886363797866e-06, + "loss": 1.2382, + "step": 8055 + }, + { + "epoch": 0.05, + "grad_norm": 5.281382016603405, + "learning_rate": 1.999088409833668e-06, + "loss": 1.3788, + "step": 8056 + }, + { + "epoch": 0.05, + "grad_norm": 4.717696954685534, + "learning_rate": 1.999088183259408e-06, + "loss": 1.196, + "step": 8057 + }, + { + "epoch": 0.05, + "grad_norm": 6.413650640572415, + "learning_rate": 1.9990879566570074e-06, + "loss": 1.4799, + "step": 8058 + }, + { + "epoch": 0.05, + "grad_norm": 4.5808792742950875, + "learning_rate": 1.9990877300264658e-06, + "loss": 1.45, + "step": 8059 + }, + { + "epoch": 0.05, + "grad_norm": 4.075199271728905, + "learning_rate": 1.9990875033677827e-06, + "loss": 1.2714, + "step": 8060 + }, + { + "epoch": 0.05, + "grad_norm": 4.8411634561770445, + "learning_rate": 1.999087276680959e-06, + "loss": 1.4637, + "step": 8061 + }, + { + "epoch": 0.05, + "grad_norm": 4.730347196004556, + "learning_rate": 1.999087049965994e-06, + "loss": 1.462, + "step": 8062 + }, + { + "epoch": 0.05, + "grad_norm": 5.45352777292585, + "learning_rate": 1.9990868232228885e-06, + "loss": 1.4368, + "step": 8063 + }, + { + "epoch": 0.05, + "grad_norm": 4.95024326032512, + "learning_rate": 1.999086596451642e-06, + "loss": 1.2837, + "step": 8064 + }, + { + "epoch": 0.05, + "grad_norm": 4.546712513743431, + "learning_rate": 1.999086369652255e-06, + "loss": 1.326, + "step": 8065 + }, + { + "epoch": 0.05, + "grad_norm": 4.762441048351948, + "learning_rate": 1.999086142824726e-06, + "loss": 1.3847, + "step": 8066 + }, + { + "epoch": 0.05, + "grad_norm": 4.086265084186629, + "learning_rate": 1.9990859159690564e-06, + "loss": 1.1633, + "step": 8067 + }, + { + "epoch": 0.05, + "grad_norm": 6.362827333991816, + "learning_rate": 1.999085689085246e-06, + "loss": 1.2907, + "step": 8068 + }, + { + "epoch": 0.05, + "grad_norm": 4.84438097168186, + "learning_rate": 1.999085462173295e-06, + "loss": 1.3589, + "step": 8069 + }, + { + "epoch": 0.05, + "grad_norm": 5.430526178904168, + "learning_rate": 1.9990852352332025e-06, + "loss": 1.5309, + "step": 8070 + }, + { + "epoch": 0.05, + "grad_norm": 4.199522070568892, + "learning_rate": 1.9990850082649696e-06, + "loss": 1.3193, + "step": 8071 + }, + { + "epoch": 0.05, + "grad_norm": 4.32869786099321, + "learning_rate": 1.9990847812685956e-06, + "loss": 1.2633, + "step": 8072 + }, + { + "epoch": 0.05, + "grad_norm": 5.751583966232769, + "learning_rate": 1.9990845542440807e-06, + "loss": 1.3492, + "step": 8073 + }, + { + "epoch": 0.05, + "grad_norm": 5.942820926087466, + "learning_rate": 1.999084327191425e-06, + "loss": 1.5252, + "step": 8074 + }, + { + "epoch": 0.05, + "grad_norm": 4.591915314043756, + "learning_rate": 1.999084100110628e-06, + "loss": 1.3366, + "step": 8075 + }, + { + "epoch": 0.05, + "grad_norm": 4.778276917265565, + "learning_rate": 1.9990838730016906e-06, + "loss": 1.2202, + "step": 8076 + }, + { + "epoch": 0.05, + "grad_norm": 5.517214795496894, + "learning_rate": 1.999083645864612e-06, + "loss": 1.4901, + "step": 8077 + }, + { + "epoch": 0.05, + "grad_norm": 4.597442320894966, + "learning_rate": 1.999083418699393e-06, + "loss": 1.3603, + "step": 8078 + }, + { + "epoch": 0.05, + "grad_norm": 5.3166984015696706, + "learning_rate": 1.999083191506033e-06, + "loss": 1.3266, + "step": 8079 + }, + { + "epoch": 0.05, + "grad_norm": 4.346967686875101, + "learning_rate": 1.9990829642845316e-06, + "loss": 1.3003, + "step": 8080 + }, + { + "epoch": 0.05, + "grad_norm": 7.410128572251315, + "learning_rate": 1.99908273703489e-06, + "loss": 1.2521, + "step": 8081 + }, + { + "epoch": 0.05, + "grad_norm": 7.1329079467163226, + "learning_rate": 1.9990825097571073e-06, + "loss": 1.4368, + "step": 8082 + }, + { + "epoch": 0.05, + "grad_norm": 4.634889547626244, + "learning_rate": 1.999082282451184e-06, + "loss": 1.2245, + "step": 8083 + }, + { + "epoch": 0.05, + "grad_norm": 4.839212670748647, + "learning_rate": 1.999082055117119e-06, + "loss": 1.3027, + "step": 8084 + }, + { + "epoch": 0.05, + "grad_norm": 4.150063099936581, + "learning_rate": 1.999081827754914e-06, + "loss": 1.3169, + "step": 8085 + }, + { + "epoch": 0.05, + "grad_norm": 4.554343401129474, + "learning_rate": 1.9990816003645683e-06, + "loss": 1.3174, + "step": 8086 + }, + { + "epoch": 0.05, + "grad_norm": 4.9142787613994585, + "learning_rate": 1.9990813729460814e-06, + "loss": 1.5393, + "step": 8087 + }, + { + "epoch": 0.05, + "grad_norm": 5.055839235927033, + "learning_rate": 1.999081145499454e-06, + "loss": 1.3263, + "step": 8088 + }, + { + "epoch": 0.05, + "grad_norm": 4.193998611837276, + "learning_rate": 1.9990809180246856e-06, + "loss": 1.391, + "step": 8089 + }, + { + "epoch": 0.05, + "grad_norm": 4.874541138813153, + "learning_rate": 1.999080690521776e-06, + "loss": 1.5614, + "step": 8090 + }, + { + "epoch": 0.05, + "grad_norm": 4.20299120814966, + "learning_rate": 1.9990804629907267e-06, + "loss": 1.2315, + "step": 8091 + }, + { + "epoch": 0.05, + "grad_norm": 4.345438405753769, + "learning_rate": 1.999080235431536e-06, + "loss": 1.2839, + "step": 8092 + }, + { + "epoch": 0.05, + "grad_norm": 4.4432134053918, + "learning_rate": 1.9990800078442043e-06, + "loss": 1.3814, + "step": 8093 + }, + { + "epoch": 0.05, + "grad_norm": 4.733003354675106, + "learning_rate": 1.9990797802287323e-06, + "loss": 1.4173, + "step": 8094 + }, + { + "epoch": 0.05, + "grad_norm": 4.810909281814115, + "learning_rate": 1.9990795525851193e-06, + "loss": 1.5015, + "step": 8095 + }, + { + "epoch": 0.05, + "grad_norm": 4.936191269032214, + "learning_rate": 1.9990793249133654e-06, + "loss": 1.2872, + "step": 8096 + }, + { + "epoch": 0.05, + "grad_norm": 4.955941674963165, + "learning_rate": 1.999079097213471e-06, + "loss": 1.1613, + "step": 8097 + }, + { + "epoch": 0.05, + "grad_norm": 4.7887043356105945, + "learning_rate": 1.999078869485436e-06, + "loss": 1.4362, + "step": 8098 + }, + { + "epoch": 0.05, + "grad_norm": 4.838216520686419, + "learning_rate": 1.9990786417292598e-06, + "loss": 1.2953, + "step": 8099 + }, + { + "epoch": 0.05, + "grad_norm": 4.341617675005489, + "learning_rate": 1.999078413944943e-06, + "loss": 1.4635, + "step": 8100 + }, + { + "epoch": 0.05, + "grad_norm": 7.002144125987926, + "learning_rate": 1.9990781861324856e-06, + "loss": 1.3311, + "step": 8101 + }, + { + "epoch": 0.05, + "grad_norm": 6.251194460531906, + "learning_rate": 1.999077958291888e-06, + "loss": 1.5165, + "step": 8102 + }, + { + "epoch": 0.05, + "grad_norm": 4.8632412765528255, + "learning_rate": 1.999077730423149e-06, + "loss": 1.3268, + "step": 8103 + }, + { + "epoch": 0.05, + "eval_loss": 1.5651912689208984, + "eval_runtime": 4.5788, + "eval_samples_per_second": 1.966, + "eval_steps_per_second": 1.092, + "step": 8103 + }, + { + "epoch": 0.05, + "grad_norm": 5.008887883460614, + "learning_rate": 1.9990775025262696e-06, + "loss": 1.1079, + "step": 8104 + }, + { + "epoch": 0.05, + "grad_norm": 4.350921997029759, + "learning_rate": 1.9990772746012494e-06, + "loss": 1.2946, + "step": 8105 + }, + { + "epoch": 0.05, + "grad_norm": 6.089920056338072, + "learning_rate": 1.999077046648088e-06, + "loss": 1.513, + "step": 8106 + }, + { + "epoch": 0.05, + "grad_norm": 4.9552021225881475, + "learning_rate": 1.999076818666787e-06, + "loss": 1.0746, + "step": 8107 + }, + { + "epoch": 0.05, + "grad_norm": 4.505434682306873, + "learning_rate": 1.9990765906573446e-06, + "loss": 1.4548, + "step": 8108 + }, + { + "epoch": 0.05, + "grad_norm": 4.160285517656663, + "learning_rate": 1.999076362619762e-06, + "loss": 1.2986, + "step": 8109 + }, + { + "epoch": 0.05, + "grad_norm": 4.645818342890148, + "learning_rate": 1.999076134554038e-06, + "loss": 1.4311, + "step": 8110 + }, + { + "epoch": 0.05, + "grad_norm": 4.421401785116704, + "learning_rate": 1.999075906460174e-06, + "loss": 1.3205, + "step": 8111 + }, + { + "epoch": 0.05, + "grad_norm": 3.9282532997582704, + "learning_rate": 1.999075678338169e-06, + "loss": 1.1389, + "step": 8112 + }, + { + "epoch": 0.05, + "grad_norm": 4.554066141171839, + "learning_rate": 1.9990754501880237e-06, + "loss": 1.3809, + "step": 8113 + }, + { + "epoch": 0.05, + "grad_norm": 4.322508516203889, + "learning_rate": 1.9990752220097377e-06, + "loss": 1.3827, + "step": 8114 + }, + { + "epoch": 0.05, + "grad_norm": 5.111361288098748, + "learning_rate": 1.999074993803311e-06, + "loss": 1.5507, + "step": 8115 + }, + { + "epoch": 0.05, + "grad_norm": 4.6004343376874495, + "learning_rate": 1.9990747655687435e-06, + "loss": 1.3917, + "step": 8116 + }, + { + "epoch": 0.05, + "grad_norm": 4.425102652583931, + "learning_rate": 1.999074537306036e-06, + "loss": 1.3274, + "step": 8117 + }, + { + "epoch": 0.05, + "grad_norm": 4.092531236559646, + "learning_rate": 1.9990743090151872e-06, + "loss": 1.2099, + "step": 8118 + }, + { + "epoch": 0.05, + "grad_norm": 4.9732572140275995, + "learning_rate": 1.999074080696198e-06, + "loss": 1.4398, + "step": 8119 + }, + { + "epoch": 0.05, + "grad_norm": 5.302127272681002, + "learning_rate": 1.9990738523490683e-06, + "loss": 1.3473, + "step": 8120 + }, + { + "epoch": 0.05, + "grad_norm": 4.454496335329512, + "learning_rate": 1.999073623973798e-06, + "loss": 1.2049, + "step": 8121 + }, + { + "epoch": 0.05, + "grad_norm": 5.092550662199784, + "learning_rate": 1.9990733955703867e-06, + "loss": 1.3565, + "step": 8122 + }, + { + "epoch": 0.05, + "grad_norm": 5.8762127998745886, + "learning_rate": 1.9990731671388353e-06, + "loss": 1.2918, + "step": 8123 + }, + { + "epoch": 0.05, + "grad_norm": 4.92831374647037, + "learning_rate": 1.9990729386791433e-06, + "loss": 1.3664, + "step": 8124 + }, + { + "epoch": 0.05, + "grad_norm": 4.269137439168466, + "learning_rate": 1.999072710191311e-06, + "loss": 1.2979, + "step": 8125 + }, + { + "epoch": 0.05, + "grad_norm": 4.342377066117419, + "learning_rate": 1.9990724816753377e-06, + "loss": 1.273, + "step": 8126 + }, + { + "epoch": 0.05, + "grad_norm": 4.192875267629217, + "learning_rate": 1.9990722531312237e-06, + "loss": 1.2949, + "step": 8127 + }, + { + "epoch": 0.05, + "grad_norm": 4.634802307585735, + "learning_rate": 1.9990720245589695e-06, + "loss": 1.3945, + "step": 8128 + }, + { + "epoch": 0.05, + "grad_norm": 4.469188523963776, + "learning_rate": 1.999071795958575e-06, + "loss": 1.33, + "step": 8129 + }, + { + "epoch": 0.05, + "grad_norm": 4.239769048194955, + "learning_rate": 1.9990715673300395e-06, + "loss": 1.3014, + "step": 8130 + }, + { + "epoch": 0.05, + "grad_norm": 4.920258791962778, + "learning_rate": 1.9990713386733637e-06, + "loss": 1.4194, + "step": 8131 + }, + { + "epoch": 0.05, + "grad_norm": 5.289521187272178, + "learning_rate": 1.9990711099885473e-06, + "loss": 1.4002, + "step": 8132 + }, + { + "epoch": 0.05, + "grad_norm": 5.62927614473075, + "learning_rate": 1.9990708812755904e-06, + "loss": 1.4542, + "step": 8133 + }, + { + "epoch": 0.05, + "grad_norm": 4.871332993893018, + "learning_rate": 1.999070652534493e-06, + "loss": 1.3714, + "step": 8134 + }, + { + "epoch": 0.05, + "grad_norm": 4.422696119803318, + "learning_rate": 1.999070423765255e-06, + "loss": 1.2183, + "step": 8135 + }, + { + "epoch": 0.05, + "grad_norm": 5.9151491999559225, + "learning_rate": 1.9990701949678767e-06, + "loss": 1.5087, + "step": 8136 + }, + { + "epoch": 0.05, + "grad_norm": 4.5374246993029335, + "learning_rate": 1.9990699661423576e-06, + "loss": 1.1925, + "step": 8137 + }, + { + "epoch": 0.05, + "grad_norm": 5.337201794890096, + "learning_rate": 1.9990697372886983e-06, + "loss": 1.342, + "step": 8138 + }, + { + "epoch": 0.05, + "grad_norm": 6.630343140609443, + "learning_rate": 1.9990695084068985e-06, + "loss": 1.1705, + "step": 8139 + }, + { + "epoch": 0.06, + "grad_norm": 4.69765815208526, + "learning_rate": 1.9990692794969586e-06, + "loss": 1.1733, + "step": 8140 + }, + { + "epoch": 0.06, + "grad_norm": 4.345402075168154, + "learning_rate": 1.9990690505588772e-06, + "loss": 1.3698, + "step": 8141 + }, + { + "epoch": 0.06, + "grad_norm": 4.631343672968385, + "learning_rate": 1.999068821592656e-06, + "loss": 1.257, + "step": 8142 + }, + { + "epoch": 0.06, + "grad_norm": 4.442509378135107, + "learning_rate": 1.9990685925982946e-06, + "loss": 1.4532, + "step": 8143 + }, + { + "epoch": 0.06, + "grad_norm": 4.687434316828365, + "learning_rate": 1.999068363575793e-06, + "loss": 1.316, + "step": 8144 + }, + { + "epoch": 0.06, + "grad_norm": 4.462368073655987, + "learning_rate": 1.99906813452515e-06, + "loss": 1.2797, + "step": 8145 + }, + { + "epoch": 0.06, + "grad_norm": 4.5130780002857245, + "learning_rate": 1.9990679054463673e-06, + "loss": 1.3643, + "step": 8146 + }, + { + "epoch": 0.06, + "grad_norm": 4.828893305048497, + "learning_rate": 1.999067676339444e-06, + "loss": 1.2414, + "step": 8147 + }, + { + "epoch": 0.06, + "grad_norm": 6.034146510706116, + "learning_rate": 1.9990674472043804e-06, + "loss": 1.3613, + "step": 8148 + }, + { + "epoch": 0.06, + "grad_norm": 4.297186875229896, + "learning_rate": 1.999067218041176e-06, + "loss": 1.2414, + "step": 8149 + }, + { + "epoch": 0.06, + "grad_norm": 4.7763642568178195, + "learning_rate": 1.9990669888498313e-06, + "loss": 1.3164, + "step": 8150 + }, + { + "epoch": 0.06, + "grad_norm": 5.062451573791261, + "learning_rate": 1.9990667596303465e-06, + "loss": 1.5195, + "step": 8151 + }, + { + "epoch": 0.06, + "grad_norm": 4.612083552367059, + "learning_rate": 1.9990665303827212e-06, + "loss": 1.3243, + "step": 8152 + }, + { + "epoch": 0.06, + "grad_norm": 4.483491492990833, + "learning_rate": 1.9990663011069554e-06, + "loss": 1.3377, + "step": 8153 + }, + { + "epoch": 0.06, + "grad_norm": 4.964050117142661, + "learning_rate": 1.9990660718030494e-06, + "loss": 1.3485, + "step": 8154 + }, + { + "epoch": 0.06, + "grad_norm": 5.014820366858035, + "learning_rate": 1.999065842471003e-06, + "loss": 1.2653, + "step": 8155 + }, + { + "epoch": 0.06, + "grad_norm": 4.754692728145271, + "learning_rate": 1.9990656131108158e-06, + "loss": 1.4429, + "step": 8156 + }, + { + "epoch": 0.06, + "grad_norm": 8.519424652866649, + "learning_rate": 1.9990653837224886e-06, + "loss": 1.402, + "step": 8157 + }, + { + "epoch": 0.06, + "grad_norm": 4.540035419705552, + "learning_rate": 1.9990651543060212e-06, + "loss": 1.2572, + "step": 8158 + }, + { + "epoch": 0.06, + "grad_norm": 6.619455235040363, + "learning_rate": 1.9990649248614133e-06, + "loss": 1.4367, + "step": 8159 + }, + { + "epoch": 0.06, + "grad_norm": 4.71675450608144, + "learning_rate": 1.999064695388665e-06, + "loss": 1.4176, + "step": 8160 + }, + { + "epoch": 0.06, + "grad_norm": 7.993286865850736, + "learning_rate": 1.9990644658877763e-06, + "loss": 1.2892, + "step": 8161 + }, + { + "epoch": 0.06, + "grad_norm": 5.6147445009177295, + "learning_rate": 1.9990642363587476e-06, + "loss": 1.5793, + "step": 8162 + }, + { + "epoch": 0.06, + "grad_norm": 5.238736997453921, + "learning_rate": 1.9990640068015783e-06, + "loss": 1.4, + "step": 8163 + }, + { + "epoch": 0.06, + "grad_norm": 6.103905343401169, + "learning_rate": 1.999063777216269e-06, + "loss": 1.5593, + "step": 8164 + }, + { + "epoch": 0.06, + "grad_norm": 4.584271636284578, + "learning_rate": 1.999063547602819e-06, + "loss": 1.3605, + "step": 8165 + }, + { + "epoch": 0.06, + "grad_norm": 4.658753577897113, + "learning_rate": 1.999063317961229e-06, + "loss": 1.4471, + "step": 8166 + }, + { + "epoch": 0.06, + "grad_norm": 4.447134445371129, + "learning_rate": 1.9990630882914983e-06, + "loss": 1.261, + "step": 8167 + }, + { + "epoch": 0.06, + "grad_norm": 4.620107268350806, + "learning_rate": 1.999062858593628e-06, + "loss": 1.3733, + "step": 8168 + }, + { + "epoch": 0.06, + "grad_norm": 4.403263421456244, + "learning_rate": 1.9990626288676167e-06, + "loss": 1.2944, + "step": 8169 + }, + { + "epoch": 0.06, + "grad_norm": 4.654332866115498, + "learning_rate": 1.9990623991134657e-06, + "loss": 1.311, + "step": 8170 + }, + { + "epoch": 0.06, + "grad_norm": 4.717059859979367, + "learning_rate": 1.999062169331174e-06, + "loss": 1.3374, + "step": 8171 + }, + { + "epoch": 0.06, + "grad_norm": 4.582485380958364, + "learning_rate": 1.9990619395207425e-06, + "loss": 1.2183, + "step": 8172 + }, + { + "epoch": 0.06, + "grad_norm": 4.330372017051996, + "learning_rate": 1.9990617096821707e-06, + "loss": 1.324, + "step": 8173 + }, + { + "epoch": 0.06, + "grad_norm": 5.303122848386714, + "learning_rate": 1.9990614798154583e-06, + "loss": 1.3615, + "step": 8174 + }, + { + "epoch": 0.06, + "grad_norm": 4.593865282301448, + "learning_rate": 1.999061249920606e-06, + "loss": 1.4621, + "step": 8175 + }, + { + "epoch": 0.06, + "grad_norm": 4.706563442228462, + "learning_rate": 1.999061019997613e-06, + "loss": 1.3489, + "step": 8176 + }, + { + "epoch": 0.06, + "eval_loss": 1.5680981874465942, + "eval_runtime": 4.6026, + "eval_samples_per_second": 1.955, + "eval_steps_per_second": 1.086, + "step": 8176 + }, + { + "epoch": 0.06, + "grad_norm": 5.004139685310967, + "learning_rate": 1.99906079004648e-06, + "loss": 1.4981, + "step": 8177 + }, + { + "epoch": 0.06, + "grad_norm": 4.3104376332742405, + "learning_rate": 1.9990605600672067e-06, + "loss": 1.3347, + "step": 8178 + }, + { + "epoch": 0.06, + "grad_norm": 4.80355136067286, + "learning_rate": 1.9990603300597937e-06, + "loss": 1.3438, + "step": 8179 + }, + { + "epoch": 0.06, + "grad_norm": 5.289810607765796, + "learning_rate": 1.99906010002424e-06, + "loss": 1.4534, + "step": 8180 + }, + { + "epoch": 0.06, + "grad_norm": 6.165244859201323, + "learning_rate": 1.999059869960546e-06, + "loss": 1.4254, + "step": 8181 + }, + { + "epoch": 0.06, + "grad_norm": 5.194961481009174, + "learning_rate": 1.999059639868712e-06, + "loss": 1.3516, + "step": 8182 + }, + { + "epoch": 0.06, + "grad_norm": 4.484051437901652, + "learning_rate": 1.9990594097487382e-06, + "loss": 1.5406, + "step": 8183 + }, + { + "epoch": 0.06, + "grad_norm": 7.2052409589842314, + "learning_rate": 1.9990591796006238e-06, + "loss": 1.5056, + "step": 8184 + }, + { + "epoch": 0.06, + "grad_norm": 4.527318203756598, + "learning_rate": 1.999058949424369e-06, + "loss": 1.5218, + "step": 8185 + }, + { + "epoch": 0.06, + "grad_norm": 5.462347342839494, + "learning_rate": 1.9990587192199744e-06, + "loss": 1.4686, + "step": 8186 + }, + { + "epoch": 0.06, + "grad_norm": 4.342169819394465, + "learning_rate": 1.9990584889874395e-06, + "loss": 1.3446, + "step": 8187 + }, + { + "epoch": 0.06, + "grad_norm": 4.3229422682438114, + "learning_rate": 1.9990582587267645e-06, + "loss": 1.2816, + "step": 8188 + }, + { + "epoch": 0.06, + "grad_norm": 5.256379348200149, + "learning_rate": 1.9990580284379493e-06, + "loss": 1.2799, + "step": 8189 + }, + { + "epoch": 0.06, + "grad_norm": 4.416028606240512, + "learning_rate": 1.999057798120994e-06, + "loss": 1.271, + "step": 8190 + }, + { + "epoch": 0.06, + "grad_norm": 4.350518838122808, + "learning_rate": 1.9990575677758987e-06, + "loss": 1.3903, + "step": 8191 + }, + { + "epoch": 0.06, + "grad_norm": 4.423070548389722, + "learning_rate": 1.999057337402663e-06, + "loss": 1.3614, + "step": 8192 + }, + { + "epoch": 0.06, + "grad_norm": 4.665553755458738, + "learning_rate": 1.9990571070012875e-06, + "loss": 1.3726, + "step": 8193 + }, + { + "epoch": 0.06, + "grad_norm": 4.109205964016318, + "learning_rate": 1.9990568765717717e-06, + "loss": 1.2828, + "step": 8194 + }, + { + "epoch": 0.06, + "grad_norm": 5.287821304594056, + "learning_rate": 1.999056646114116e-06, + "loss": 1.5666, + "step": 8195 + }, + { + "epoch": 0.06, + "grad_norm": 4.529713016551012, + "learning_rate": 1.9990564156283197e-06, + "loss": 1.3478, + "step": 8196 + }, + { + "epoch": 0.06, + "grad_norm": 5.784976617486848, + "learning_rate": 1.9990561851143836e-06, + "loss": 1.4471, + "step": 8197 + }, + { + "epoch": 0.06, + "grad_norm": 4.454400798451111, + "learning_rate": 1.9990559545723077e-06, + "loss": 1.2595, + "step": 8198 + }, + { + "epoch": 0.06, + "grad_norm": 5.707120385263027, + "learning_rate": 1.9990557240020913e-06, + "loss": 1.2314, + "step": 8199 + }, + { + "epoch": 0.06, + "grad_norm": 4.633407491887176, + "learning_rate": 1.9990554934037347e-06, + "loss": 1.3608, + "step": 8200 + }, + { + "epoch": 0.06, + "grad_norm": 5.260836740350598, + "learning_rate": 1.9990552627772385e-06, + "loss": 1.3762, + "step": 8201 + }, + { + "epoch": 0.06, + "grad_norm": 4.784082329367463, + "learning_rate": 1.999055032122602e-06, + "loss": 1.5616, + "step": 8202 + }, + { + "epoch": 0.06, + "grad_norm": 4.253611588958306, + "learning_rate": 1.9990548014398256e-06, + "loss": 1.3073, + "step": 8203 + }, + { + "epoch": 0.06, + "grad_norm": 5.1595440466349896, + "learning_rate": 1.999054570728909e-06, + "loss": 1.1739, + "step": 8204 + }, + { + "epoch": 0.06, + "grad_norm": 5.295805369092527, + "learning_rate": 1.999054339989852e-06, + "loss": 1.4336, + "step": 8205 + }, + { + "epoch": 0.06, + "grad_norm": 4.507015401056458, + "learning_rate": 1.9990541092226557e-06, + "loss": 1.2707, + "step": 8206 + }, + { + "epoch": 0.06, + "grad_norm": 4.439768181495246, + "learning_rate": 1.9990538784273186e-06, + "loss": 1.4005, + "step": 8207 + }, + { + "epoch": 0.06, + "grad_norm": 5.466976867515769, + "learning_rate": 1.9990536476038423e-06, + "loss": 1.349, + "step": 8208 + }, + { + "epoch": 0.06, + "grad_norm": 6.636619163511564, + "learning_rate": 1.9990534167522255e-06, + "loss": 1.3271, + "step": 8209 + }, + { + "epoch": 0.06, + "grad_norm": 4.3467939491734615, + "learning_rate": 1.9990531858724685e-06, + "loss": 1.3297, + "step": 8210 + }, + { + "epoch": 0.06, + "grad_norm": 4.581184592576977, + "learning_rate": 1.999052954964572e-06, + "loss": 1.2271, + "step": 8211 + }, + { + "epoch": 0.06, + "grad_norm": 5.144152459181064, + "learning_rate": 1.999052724028535e-06, + "loss": 1.3265, + "step": 8212 + }, + { + "epoch": 0.06, + "grad_norm": 4.153932906417823, + "learning_rate": 1.9990524930643584e-06, + "loss": 1.2782, + "step": 8213 + }, + { + "epoch": 0.06, + "grad_norm": 5.105434301335586, + "learning_rate": 1.9990522620720417e-06, + "loss": 1.4297, + "step": 8214 + }, + { + "epoch": 0.06, + "grad_norm": 6.270416839900056, + "learning_rate": 1.999052031051585e-06, + "loss": 1.2907, + "step": 8215 + }, + { + "epoch": 0.06, + "grad_norm": 4.905599374416815, + "learning_rate": 1.9990518000029885e-06, + "loss": 1.3438, + "step": 8216 + }, + { + "epoch": 0.06, + "grad_norm": 4.670361292521902, + "learning_rate": 1.9990515689262514e-06, + "loss": 1.4586, + "step": 8217 + }, + { + "epoch": 0.06, + "grad_norm": 4.352520738279014, + "learning_rate": 1.999051337821375e-06, + "loss": 1.3927, + "step": 8218 + }, + { + "epoch": 0.06, + "grad_norm": 4.777592430544689, + "learning_rate": 1.9990511066883586e-06, + "loss": 1.4646, + "step": 8219 + }, + { + "epoch": 0.06, + "grad_norm": 4.982101908623657, + "learning_rate": 1.9990508755272016e-06, + "loss": 1.569, + "step": 8220 + }, + { + "epoch": 0.06, + "grad_norm": 4.088346705840035, + "learning_rate": 1.9990506443379053e-06, + "loss": 1.1569, + "step": 8221 + }, + { + "epoch": 0.06, + "grad_norm": 5.438399638784706, + "learning_rate": 1.999050413120469e-06, + "loss": 1.3463, + "step": 8222 + }, + { + "epoch": 0.06, + "grad_norm": 4.770971061912871, + "learning_rate": 1.9990501818748928e-06, + "loss": 1.4733, + "step": 8223 + }, + { + "epoch": 0.06, + "grad_norm": 4.643701043309167, + "learning_rate": 1.9990499506011765e-06, + "loss": 1.3408, + "step": 8224 + }, + { + "epoch": 0.06, + "grad_norm": 5.605134780579443, + "learning_rate": 1.99904971929932e-06, + "loss": 1.3933, + "step": 8225 + }, + { + "epoch": 0.06, + "grad_norm": 5.212881130818967, + "learning_rate": 1.999049487969324e-06, + "loss": 1.3868, + "step": 8226 + }, + { + "epoch": 0.06, + "grad_norm": 4.379932623306622, + "learning_rate": 1.9990492566111883e-06, + "loss": 1.286, + "step": 8227 + }, + { + "epoch": 0.06, + "grad_norm": 4.302565049798104, + "learning_rate": 1.9990490252249123e-06, + "loss": 1.4654, + "step": 8228 + }, + { + "epoch": 0.06, + "grad_norm": 5.230311565955441, + "learning_rate": 1.9990487938104967e-06, + "loss": 1.5078, + "step": 8229 + }, + { + "epoch": 0.06, + "grad_norm": 4.607159448600373, + "learning_rate": 1.999048562367941e-06, + "loss": 1.4102, + "step": 8230 + }, + { + "epoch": 0.06, + "grad_norm": 4.576174547189744, + "learning_rate": 1.9990483308972455e-06, + "loss": 1.3934, + "step": 8231 + }, + { + "epoch": 0.06, + "grad_norm": 4.518751709418994, + "learning_rate": 1.99904809939841e-06, + "loss": 1.4263, + "step": 8232 + }, + { + "epoch": 0.06, + "grad_norm": 4.350862240947364, + "learning_rate": 1.999047867871435e-06, + "loss": 1.31, + "step": 8233 + }, + { + "epoch": 0.06, + "grad_norm": 5.695325429858104, + "learning_rate": 1.9990476363163196e-06, + "loss": 1.3402, + "step": 8234 + }, + { + "epoch": 0.06, + "grad_norm": 4.299995288856469, + "learning_rate": 1.999047404733065e-06, + "loss": 1.3675, + "step": 8235 + }, + { + "epoch": 0.06, + "grad_norm": 5.25777156337636, + "learning_rate": 1.99904717312167e-06, + "loss": 1.4539, + "step": 8236 + }, + { + "epoch": 0.06, + "grad_norm": 4.582696943694878, + "learning_rate": 1.9990469414821356e-06, + "loss": 1.2846, + "step": 8237 + }, + { + "epoch": 0.06, + "grad_norm": 4.343873557143485, + "learning_rate": 1.999046709814461e-06, + "loss": 1.2103, + "step": 8238 + }, + { + "epoch": 0.06, + "grad_norm": 4.810541817048231, + "learning_rate": 1.9990464781186465e-06, + "loss": 1.389, + "step": 8239 + }, + { + "epoch": 0.06, + "grad_norm": 4.948976099055948, + "learning_rate": 1.9990462463946925e-06, + "loss": 1.4962, + "step": 8240 + }, + { + "epoch": 0.06, + "grad_norm": 5.296171927722328, + "learning_rate": 1.999046014642599e-06, + "loss": 1.4292, + "step": 8241 + }, + { + "epoch": 0.06, + "grad_norm": 5.172457818378872, + "learning_rate": 1.999045782862365e-06, + "loss": 1.4609, + "step": 8242 + }, + { + "epoch": 0.06, + "grad_norm": 7.848331689769885, + "learning_rate": 1.9990455510539916e-06, + "loss": 1.2819, + "step": 8243 + }, + { + "epoch": 0.06, + "grad_norm": 4.603434152628308, + "learning_rate": 1.9990453192174783e-06, + "loss": 1.3733, + "step": 8244 + }, + { + "epoch": 0.06, + "grad_norm": 4.06967132791478, + "learning_rate": 1.9990450873528252e-06, + "loss": 1.242, + "step": 8245 + }, + { + "epoch": 0.06, + "grad_norm": 4.409995740321932, + "learning_rate": 1.9990448554600325e-06, + "loss": 1.3252, + "step": 8246 + }, + { + "epoch": 0.06, + "grad_norm": 4.738005509137268, + "learning_rate": 1.9990446235390997e-06, + "loss": 1.561, + "step": 8247 + }, + { + "epoch": 0.06, + "grad_norm": 4.490052438206556, + "learning_rate": 1.9990443915900275e-06, + "loss": 1.4185, + "step": 8248 + }, + { + "epoch": 0.06, + "grad_norm": 5.363836131497134, + "learning_rate": 1.9990441596128152e-06, + "loss": 1.3816, + "step": 8249 + }, + { + "epoch": 0.06, + "eval_loss": 1.5642991065979004, + "eval_runtime": 4.5979, + "eval_samples_per_second": 1.957, + "eval_steps_per_second": 1.087, + "step": 8249 + }, + { + "epoch": 0.06, + "grad_norm": 4.724668924954548, + "learning_rate": 1.9990439276074637e-06, + "loss": 1.4598, + "step": 8250 + }, + { + "epoch": 0.06, + "grad_norm": 4.716430565015037, + "learning_rate": 1.9990436955739716e-06, + "loss": 1.3109, + "step": 8251 + }, + { + "epoch": 0.06, + "grad_norm": 5.888975945354197, + "learning_rate": 1.9990434635123402e-06, + "loss": 1.5034, + "step": 8252 + }, + { + "epoch": 0.06, + "grad_norm": 4.326722261213911, + "learning_rate": 1.9990432314225696e-06, + "loss": 1.4211, + "step": 8253 + }, + { + "epoch": 0.06, + "grad_norm": 5.637418593708245, + "learning_rate": 1.9990429993046583e-06, + "loss": 1.4174, + "step": 8254 + }, + { + "epoch": 0.06, + "grad_norm": 4.485291731666469, + "learning_rate": 1.999042767158608e-06, + "loss": 1.4369, + "step": 8255 + }, + { + "epoch": 0.06, + "grad_norm": 4.431705944415164, + "learning_rate": 1.9990425349844176e-06, + "loss": 1.4001, + "step": 8256 + }, + { + "epoch": 0.06, + "grad_norm": 4.598067318289846, + "learning_rate": 1.9990423027820877e-06, + "loss": 1.4473, + "step": 8257 + }, + { + "epoch": 0.06, + "grad_norm": 5.06084813647273, + "learning_rate": 1.999042070551618e-06, + "loss": 1.4613, + "step": 8258 + }, + { + "epoch": 0.06, + "grad_norm": 4.823026033041981, + "learning_rate": 1.999041838293009e-06, + "loss": 1.3917, + "step": 8259 + }, + { + "epoch": 0.06, + "grad_norm": 4.5145775182747165, + "learning_rate": 1.9990416060062598e-06, + "loss": 1.3649, + "step": 8260 + }, + { + "epoch": 0.06, + "grad_norm": 4.527681517531891, + "learning_rate": 1.999041373691371e-06, + "loss": 1.4601, + "step": 8261 + }, + { + "epoch": 0.06, + "grad_norm": 4.356568913087144, + "learning_rate": 1.9990411413483426e-06, + "loss": 1.2418, + "step": 8262 + }, + { + "epoch": 0.06, + "grad_norm": 4.442988430813115, + "learning_rate": 1.9990409089771745e-06, + "loss": 1.4027, + "step": 8263 + }, + { + "epoch": 0.06, + "grad_norm": 4.568982692907251, + "learning_rate": 1.999040676577867e-06, + "loss": 1.3534, + "step": 8264 + }, + { + "epoch": 0.06, + "grad_norm": 4.795929785949326, + "learning_rate": 1.9990404441504195e-06, + "loss": 1.1863, + "step": 8265 + }, + { + "epoch": 0.06, + "grad_norm": 4.953438182013541, + "learning_rate": 1.9990402116948323e-06, + "loss": 1.4043, + "step": 8266 + }, + { + "epoch": 0.06, + "grad_norm": 4.984216225883633, + "learning_rate": 1.9990399792111057e-06, + "loss": 1.2769, + "step": 8267 + }, + { + "epoch": 0.06, + "grad_norm": 4.172049235607687, + "learning_rate": 1.9990397466992395e-06, + "loss": 1.3389, + "step": 8268 + }, + { + "epoch": 0.06, + "grad_norm": 4.5793249436205405, + "learning_rate": 1.9990395141592335e-06, + "loss": 1.2399, + "step": 8269 + }, + { + "epoch": 0.06, + "grad_norm": 5.1413365370869455, + "learning_rate": 1.999039281591088e-06, + "loss": 1.5652, + "step": 8270 + }, + { + "epoch": 0.06, + "grad_norm": 5.6265954357796195, + "learning_rate": 1.999039048994803e-06, + "loss": 1.3478, + "step": 8271 + }, + { + "epoch": 0.06, + "grad_norm": 4.64472449885365, + "learning_rate": 1.999038816370378e-06, + "loss": 1.2266, + "step": 8272 + }, + { + "epoch": 0.06, + "grad_norm": 4.576713415359496, + "learning_rate": 1.9990385837178136e-06, + "loss": 1.4215, + "step": 8273 + }, + { + "epoch": 0.06, + "grad_norm": 4.311771019591452, + "learning_rate": 1.9990383510371095e-06, + "loss": 1.3022, + "step": 8274 + }, + { + "epoch": 0.06, + "grad_norm": 5.2912808195945065, + "learning_rate": 1.999038118328266e-06, + "loss": 1.3223, + "step": 8275 + }, + { + "epoch": 0.06, + "grad_norm": 4.259185697054471, + "learning_rate": 1.999037885591283e-06, + "loss": 1.3435, + "step": 8276 + }, + { + "epoch": 0.06, + "grad_norm": 5.941675015218654, + "learning_rate": 1.9990376528261604e-06, + "loss": 1.2397, + "step": 8277 + }, + { + "epoch": 0.06, + "grad_norm": 4.350089893975519, + "learning_rate": 1.999037420032898e-06, + "loss": 1.1177, + "step": 8278 + }, + { + "epoch": 0.06, + "grad_norm": 4.867009482812644, + "learning_rate": 1.9990371872114962e-06, + "loss": 1.5145, + "step": 8279 + }, + { + "epoch": 0.06, + "grad_norm": 5.137856817486438, + "learning_rate": 1.999036954361955e-06, + "loss": 1.3328, + "step": 8280 + }, + { + "epoch": 0.06, + "grad_norm": 4.9084280239868106, + "learning_rate": 1.999036721484274e-06, + "loss": 1.4189, + "step": 8281 + }, + { + "epoch": 0.06, + "grad_norm": 4.933105580358515, + "learning_rate": 1.9990364885784533e-06, + "loss": 1.3909, + "step": 8282 + }, + { + "epoch": 0.06, + "grad_norm": 4.121523815240315, + "learning_rate": 1.9990362556444936e-06, + "loss": 1.2405, + "step": 8283 + }, + { + "epoch": 0.06, + "grad_norm": 4.678365530703249, + "learning_rate": 1.9990360226823937e-06, + "loss": 1.3501, + "step": 8284 + }, + { + "epoch": 0.06, + "grad_norm": 4.1656022022504375, + "learning_rate": 1.9990357896921546e-06, + "loss": 1.3065, + "step": 8285 + }, + { + "epoch": 0.06, + "grad_norm": 4.812306620373129, + "learning_rate": 1.9990355566737762e-06, + "loss": 1.2899, + "step": 8286 + }, + { + "epoch": 0.06, + "grad_norm": 5.088029165932999, + "learning_rate": 1.9990353236272586e-06, + "loss": 1.1263, + "step": 8287 + }, + { + "epoch": 0.06, + "grad_norm": 4.277715583582814, + "learning_rate": 1.9990350905526008e-06, + "loss": 1.3021, + "step": 8288 + }, + { + "epoch": 0.06, + "grad_norm": 5.54563260372641, + "learning_rate": 1.9990348574498037e-06, + "loss": 1.3948, + "step": 8289 + }, + { + "epoch": 0.06, + "grad_norm": 4.078209195401177, + "learning_rate": 1.9990346243188673e-06, + "loss": 1.0781, + "step": 8290 + }, + { + "epoch": 0.06, + "grad_norm": 5.005196801851178, + "learning_rate": 1.9990343911597913e-06, + "loss": 1.408, + "step": 8291 + }, + { + "epoch": 0.06, + "grad_norm": 4.053339754093099, + "learning_rate": 1.9990341579725755e-06, + "loss": 1.1854, + "step": 8292 + }, + { + "epoch": 0.06, + "grad_norm": 5.20351410089857, + "learning_rate": 1.9990339247572204e-06, + "loss": 1.4826, + "step": 8293 + }, + { + "epoch": 0.06, + "grad_norm": 4.5590791381126845, + "learning_rate": 1.999033691513726e-06, + "loss": 1.2966, + "step": 8294 + }, + { + "epoch": 0.06, + "grad_norm": 4.433799054499765, + "learning_rate": 1.999033458242092e-06, + "loss": 1.39, + "step": 8295 + }, + { + "epoch": 0.06, + "grad_norm": 5.257488976646479, + "learning_rate": 1.9990332249423187e-06, + "loss": 1.404, + "step": 8296 + }, + { + "epoch": 0.06, + "grad_norm": 5.681593272768738, + "learning_rate": 1.999032991614406e-06, + "loss": 1.3293, + "step": 8297 + }, + { + "epoch": 0.06, + "grad_norm": 4.884480931323584, + "learning_rate": 1.999032758258354e-06, + "loss": 1.477, + "step": 8298 + }, + { + "epoch": 0.06, + "grad_norm": 4.416750305037304, + "learning_rate": 1.999032524874162e-06, + "loss": 1.4386, + "step": 8299 + }, + { + "epoch": 0.06, + "grad_norm": 5.661037174326694, + "learning_rate": 1.9990322914618313e-06, + "loss": 1.2421, + "step": 8300 + }, + { + "epoch": 0.06, + "grad_norm": 4.124176017773704, + "learning_rate": 1.9990320580213608e-06, + "loss": 1.3226, + "step": 8301 + }, + { + "epoch": 0.06, + "grad_norm": 5.111246829435499, + "learning_rate": 1.9990318245527505e-06, + "loss": 1.382, + "step": 8302 + }, + { + "epoch": 0.06, + "grad_norm": 4.072916429433893, + "learning_rate": 1.9990315910560013e-06, + "loss": 1.2825, + "step": 8303 + }, + { + "epoch": 0.06, + "grad_norm": 5.759840034049702, + "learning_rate": 1.9990313575311125e-06, + "loss": 1.3343, + "step": 8304 + }, + { + "epoch": 0.06, + "grad_norm": 4.4830622737046335, + "learning_rate": 1.999031123978085e-06, + "loss": 1.4941, + "step": 8305 + }, + { + "epoch": 0.06, + "grad_norm": 6.960919643282049, + "learning_rate": 1.999030890396917e-06, + "loss": 1.3745, + "step": 8306 + }, + { + "epoch": 0.06, + "grad_norm": 4.4271138937610885, + "learning_rate": 1.9990306567876103e-06, + "loss": 1.3604, + "step": 8307 + }, + { + "epoch": 0.06, + "grad_norm": 4.40386035540864, + "learning_rate": 1.999030423150164e-06, + "loss": 1.3374, + "step": 8308 + }, + { + "epoch": 0.06, + "grad_norm": 4.26099550199566, + "learning_rate": 1.999030189484578e-06, + "loss": 1.2988, + "step": 8309 + }, + { + "epoch": 0.06, + "grad_norm": 4.172925614709736, + "learning_rate": 1.9990299557908532e-06, + "loss": 1.3681, + "step": 8310 + }, + { + "epoch": 0.06, + "grad_norm": 5.224800880866437, + "learning_rate": 1.999029722068989e-06, + "loss": 1.4005, + "step": 8311 + }, + { + "epoch": 0.06, + "grad_norm": 4.542417807291675, + "learning_rate": 1.9990294883189855e-06, + "loss": 1.4132, + "step": 8312 + }, + { + "epoch": 0.06, + "grad_norm": 4.877345293687558, + "learning_rate": 1.9990292545408423e-06, + "loss": 1.402, + "step": 8313 + }, + { + "epoch": 0.06, + "grad_norm": 5.431716513454659, + "learning_rate": 1.9990290207345597e-06, + "loss": 1.415, + "step": 8314 + }, + { + "epoch": 0.06, + "grad_norm": 4.233003724805698, + "learning_rate": 1.9990287869001384e-06, + "loss": 1.3477, + "step": 8315 + }, + { + "epoch": 0.06, + "grad_norm": 4.6576296310136325, + "learning_rate": 1.9990285530375773e-06, + "loss": 1.4549, + "step": 8316 + }, + { + "epoch": 0.06, + "grad_norm": 4.391415226549598, + "learning_rate": 1.999028319146877e-06, + "loss": 1.3342, + "step": 8317 + }, + { + "epoch": 0.06, + "grad_norm": 4.602088029418293, + "learning_rate": 1.9990280852280373e-06, + "loss": 1.2976, + "step": 8318 + }, + { + "epoch": 0.06, + "grad_norm": 4.360325546639366, + "learning_rate": 1.999027851281059e-06, + "loss": 1.3285, + "step": 8319 + }, + { + "epoch": 0.06, + "grad_norm": 4.6403728530383095, + "learning_rate": 1.9990276173059402e-06, + "loss": 1.3746, + "step": 8320 + }, + { + "epoch": 0.06, + "grad_norm": 5.270882820990297, + "learning_rate": 1.999027383302683e-06, + "loss": 1.6054, + "step": 8321 + }, + { + "epoch": 0.06, + "grad_norm": 4.302185532932528, + "learning_rate": 1.9990271492712864e-06, + "loss": 1.2901, + "step": 8322 + }, + { + "epoch": 0.06, + "eval_loss": 1.5656442642211914, + "eval_runtime": 4.5823, + "eval_samples_per_second": 1.964, + "eval_steps_per_second": 1.091, + "step": 8322 + }, + { + "epoch": 0.06, + "grad_norm": 5.020764746159491, + "learning_rate": 1.9990269152117504e-06, + "loss": 1.4232, + "step": 8323 + }, + { + "epoch": 0.06, + "grad_norm": 4.606529395708601, + "learning_rate": 1.999026681124075e-06, + "loss": 1.2384, + "step": 8324 + }, + { + "epoch": 0.06, + "grad_norm": 4.274363960052644, + "learning_rate": 1.9990264470082604e-06, + "loss": 1.188, + "step": 8325 + }, + { + "epoch": 0.06, + "grad_norm": 4.22659651250079, + "learning_rate": 1.9990262128643065e-06, + "loss": 1.3778, + "step": 8326 + }, + { + "epoch": 0.06, + "grad_norm": 4.918397701896192, + "learning_rate": 1.9990259786922134e-06, + "loss": 1.4058, + "step": 8327 + }, + { + "epoch": 0.06, + "grad_norm": 4.325221835622321, + "learning_rate": 1.9990257444919813e-06, + "loss": 1.4229, + "step": 8328 + }, + { + "epoch": 0.06, + "grad_norm": 5.57680022443474, + "learning_rate": 1.99902551026361e-06, + "loss": 1.4133, + "step": 8329 + }, + { + "epoch": 0.06, + "grad_norm": 4.814148517311039, + "learning_rate": 1.999025276007099e-06, + "loss": 1.3071, + "step": 8330 + }, + { + "epoch": 0.06, + "grad_norm": 4.3283132853618325, + "learning_rate": 1.999025041722449e-06, + "loss": 1.2699, + "step": 8331 + }, + { + "epoch": 0.06, + "grad_norm": 4.911050608047389, + "learning_rate": 1.99902480740966e-06, + "loss": 1.1585, + "step": 8332 + }, + { + "epoch": 0.06, + "grad_norm": 4.601249664935928, + "learning_rate": 1.9990245730687316e-06, + "loss": 1.5244, + "step": 8333 + }, + { + "epoch": 0.06, + "grad_norm": 5.498290410267132, + "learning_rate": 1.9990243386996643e-06, + "loss": 1.472, + "step": 8334 + }, + { + "epoch": 0.06, + "grad_norm": 4.830360884280506, + "learning_rate": 1.9990241043024573e-06, + "loss": 1.3676, + "step": 8335 + }, + { + "epoch": 0.06, + "grad_norm": 4.757159460063796, + "learning_rate": 1.9990238698771114e-06, + "loss": 1.3391, + "step": 8336 + }, + { + "epoch": 0.06, + "grad_norm": 4.360488398818196, + "learning_rate": 1.9990236354236263e-06, + "loss": 1.4066, + "step": 8337 + }, + { + "epoch": 0.06, + "grad_norm": 4.411231104148737, + "learning_rate": 1.999023400942002e-06, + "loss": 1.3872, + "step": 8338 + }, + { + "epoch": 0.06, + "grad_norm": 4.180522741794834, + "learning_rate": 1.999023166432238e-06, + "loss": 1.3609, + "step": 8339 + }, + { + "epoch": 0.06, + "grad_norm": 5.193441642868844, + "learning_rate": 1.999022931894336e-06, + "loss": 1.3488, + "step": 8340 + }, + { + "epoch": 0.06, + "grad_norm": 4.471788603914176, + "learning_rate": 1.999022697328294e-06, + "loss": 1.4602, + "step": 8341 + }, + { + "epoch": 0.06, + "grad_norm": 4.341105435630704, + "learning_rate": 1.999022462734113e-06, + "loss": 1.3936, + "step": 8342 + }, + { + "epoch": 0.06, + "grad_norm": 4.237003449736239, + "learning_rate": 1.999022228111793e-06, + "loss": 1.2631, + "step": 8343 + }, + { + "epoch": 0.06, + "grad_norm": 4.575726133702913, + "learning_rate": 1.999021993461334e-06, + "loss": 1.3168, + "step": 8344 + }, + { + "epoch": 0.06, + "grad_norm": 4.876989383729073, + "learning_rate": 1.9990217587827357e-06, + "loss": 1.4856, + "step": 8345 + }, + { + "epoch": 0.06, + "grad_norm": 4.957069769203432, + "learning_rate": 1.999021524075998e-06, + "loss": 1.2782, + "step": 8346 + }, + { + "epoch": 0.06, + "grad_norm": 4.555175055202144, + "learning_rate": 1.9990212893411216e-06, + "loss": 1.3844, + "step": 8347 + }, + { + "epoch": 0.06, + "grad_norm": 4.982702575149782, + "learning_rate": 1.9990210545781057e-06, + "loss": 1.4147, + "step": 8348 + }, + { + "epoch": 0.06, + "grad_norm": 4.511710257653521, + "learning_rate": 1.9990208197869513e-06, + "loss": 1.34, + "step": 8349 + }, + { + "epoch": 0.06, + "grad_norm": 4.579584800817105, + "learning_rate": 1.999020584967657e-06, + "loss": 1.3131, + "step": 8350 + }, + { + "epoch": 0.06, + "grad_norm": 4.3639826779258195, + "learning_rate": 1.9990203501202246e-06, + "loss": 1.3033, + "step": 8351 + }, + { + "epoch": 0.06, + "grad_norm": 4.560570971067625, + "learning_rate": 1.9990201152446524e-06, + "loss": 1.3826, + "step": 8352 + }, + { + "epoch": 0.06, + "grad_norm": 4.771442617526964, + "learning_rate": 1.9990198803409413e-06, + "loss": 1.4731, + "step": 8353 + }, + { + "epoch": 0.06, + "grad_norm": 4.421985550458615, + "learning_rate": 1.999019645409091e-06, + "loss": 1.2672, + "step": 8354 + }, + { + "epoch": 0.06, + "grad_norm": 4.262551774119744, + "learning_rate": 1.9990194104491017e-06, + "loss": 1.2955, + "step": 8355 + }, + { + "epoch": 0.06, + "grad_norm": 4.557969622544508, + "learning_rate": 1.9990191754609736e-06, + "loss": 1.377, + "step": 8356 + }, + { + "epoch": 0.06, + "grad_norm": 4.490612997548548, + "learning_rate": 1.999018940444706e-06, + "loss": 1.3569, + "step": 8357 + }, + { + "epoch": 0.06, + "grad_norm": 4.254726746385519, + "learning_rate": 1.9990187054003e-06, + "loss": 1.3191, + "step": 8358 + }, + { + "epoch": 0.06, + "grad_norm": 4.369880226717757, + "learning_rate": 1.9990184703277544e-06, + "loss": 1.3652, + "step": 8359 + }, + { + "epoch": 0.06, + "grad_norm": 4.368191459851238, + "learning_rate": 1.99901823522707e-06, + "loss": 1.3865, + "step": 8360 + }, + { + "epoch": 0.06, + "grad_norm": 4.492268895198598, + "learning_rate": 1.9990180000982464e-06, + "loss": 1.3948, + "step": 8361 + }, + { + "epoch": 0.06, + "grad_norm": 4.792306538138668, + "learning_rate": 1.9990177649412843e-06, + "loss": 1.6932, + "step": 8362 + }, + { + "epoch": 0.06, + "grad_norm": 7.839851060213818, + "learning_rate": 1.9990175297561825e-06, + "loss": 1.4006, + "step": 8363 + }, + { + "epoch": 0.06, + "grad_norm": 4.2887203007389365, + "learning_rate": 1.9990172945429423e-06, + "loss": 1.181, + "step": 8364 + }, + { + "epoch": 0.06, + "grad_norm": 4.177683976256777, + "learning_rate": 1.9990170593015628e-06, + "loss": 1.3094, + "step": 8365 + }, + { + "epoch": 0.06, + "grad_norm": 5.407259103865772, + "learning_rate": 1.9990168240320444e-06, + "loss": 1.3241, + "step": 8366 + }, + { + "epoch": 0.06, + "grad_norm": 4.351582161927502, + "learning_rate": 1.9990165887343868e-06, + "loss": 1.3269, + "step": 8367 + }, + { + "epoch": 0.06, + "grad_norm": 4.306908350756632, + "learning_rate": 1.9990163534085902e-06, + "loss": 1.3031, + "step": 8368 + }, + { + "epoch": 0.06, + "grad_norm": 6.022174789368055, + "learning_rate": 1.999016118054655e-06, + "loss": 1.2962, + "step": 8369 + }, + { + "epoch": 0.06, + "grad_norm": 5.1253312574813625, + "learning_rate": 1.9990158826725806e-06, + "loss": 1.3777, + "step": 8370 + }, + { + "epoch": 0.06, + "grad_norm": 4.955078193325565, + "learning_rate": 1.9990156472623676e-06, + "loss": 1.4517, + "step": 8371 + }, + { + "epoch": 0.06, + "grad_norm": 4.6813878429700795, + "learning_rate": 1.999015411824015e-06, + "loss": 1.3286, + "step": 8372 + }, + { + "epoch": 0.06, + "grad_norm": 4.987846314192864, + "learning_rate": 1.999015176357524e-06, + "loss": 1.4235, + "step": 8373 + }, + { + "epoch": 0.06, + "grad_norm": 4.473948143583966, + "learning_rate": 1.999014940862894e-06, + "loss": 1.3693, + "step": 8374 + }, + { + "epoch": 0.06, + "grad_norm": 4.2441833787143874, + "learning_rate": 1.999014705340125e-06, + "loss": 1.3634, + "step": 8375 + }, + { + "epoch": 0.06, + "grad_norm": 4.552970496979477, + "learning_rate": 1.9990144697892167e-06, + "loss": 1.4939, + "step": 8376 + }, + { + "epoch": 0.06, + "grad_norm": 4.797322067007558, + "learning_rate": 1.99901423421017e-06, + "loss": 1.4767, + "step": 8377 + }, + { + "epoch": 0.06, + "grad_norm": 4.587597346903606, + "learning_rate": 1.999013998602984e-06, + "loss": 1.1092, + "step": 8378 + }, + { + "epoch": 0.06, + "grad_norm": 5.534941541343506, + "learning_rate": 1.9990137629676597e-06, + "loss": 1.4859, + "step": 8379 + }, + { + "epoch": 0.06, + "grad_norm": 4.9961469123467115, + "learning_rate": 1.999013527304196e-06, + "loss": 1.4147, + "step": 8380 + }, + { + "epoch": 0.06, + "grad_norm": 4.044184154266132, + "learning_rate": 1.9990132916125936e-06, + "loss": 1.2626, + "step": 8381 + }, + { + "epoch": 0.06, + "grad_norm": 4.648917140278735, + "learning_rate": 1.999013055892852e-06, + "loss": 1.3029, + "step": 8382 + }, + { + "epoch": 0.06, + "grad_norm": 4.889605616911022, + "learning_rate": 1.999012820144972e-06, + "loss": 1.3647, + "step": 8383 + }, + { + "epoch": 0.06, + "grad_norm": 5.359942093616338, + "learning_rate": 1.999012584368953e-06, + "loss": 1.4222, + "step": 8384 + }, + { + "epoch": 0.06, + "grad_norm": 4.668293176598957, + "learning_rate": 1.9990123485647945e-06, + "loss": 1.4047, + "step": 8385 + }, + { + "epoch": 0.06, + "grad_norm": 4.536147877502993, + "learning_rate": 1.9990121127324977e-06, + "loss": 1.3475, + "step": 8386 + }, + { + "epoch": 0.06, + "grad_norm": 5.918582636107545, + "learning_rate": 1.9990118768720625e-06, + "loss": 1.3717, + "step": 8387 + }, + { + "epoch": 0.06, + "grad_norm": 4.666692418540283, + "learning_rate": 1.999011640983488e-06, + "loss": 1.3138, + "step": 8388 + }, + { + "epoch": 0.06, + "grad_norm": 4.435728448744984, + "learning_rate": 1.9990114050667746e-06, + "loss": 1.2664, + "step": 8389 + }, + { + "epoch": 0.06, + "grad_norm": 4.91773565268233, + "learning_rate": 1.9990111691219224e-06, + "loss": 1.4333, + "step": 8390 + }, + { + "epoch": 0.06, + "grad_norm": 4.372311084034018, + "learning_rate": 1.9990109331489313e-06, + "loss": 1.4715, + "step": 8391 + }, + { + "epoch": 0.06, + "grad_norm": 4.855966195750034, + "learning_rate": 1.9990106971478013e-06, + "loss": 1.2862, + "step": 8392 + }, + { + "epoch": 0.06, + "grad_norm": 4.197613641758737, + "learning_rate": 1.999010461118533e-06, + "loss": 1.3328, + "step": 8393 + }, + { + "epoch": 0.06, + "grad_norm": 5.473355860344675, + "learning_rate": 1.9990102250611253e-06, + "loss": 1.359, + "step": 8394 + }, + { + "epoch": 0.06, + "grad_norm": 4.92330000920753, + "learning_rate": 1.999009988975579e-06, + "loss": 1.6145, + "step": 8395 + }, + { + "epoch": 0.06, + "eval_loss": 1.567657232284546, + "eval_runtime": 4.5964, + "eval_samples_per_second": 1.958, + "eval_steps_per_second": 1.088, + "step": 8395 + }, + { + "epoch": 0.06, + "grad_norm": 4.5701049314995625, + "learning_rate": 1.999009752861894e-06, + "loss": 1.2682, + "step": 8396 + }, + { + "epoch": 0.06, + "grad_norm": 4.215243480806859, + "learning_rate": 1.9990095167200704e-06, + "loss": 1.3341, + "step": 8397 + }, + { + "epoch": 0.06, + "grad_norm": 4.713273711413565, + "learning_rate": 1.999009280550108e-06, + "loss": 1.3942, + "step": 8398 + }, + { + "epoch": 0.06, + "grad_norm": 5.712640789155021, + "learning_rate": 1.9990090443520066e-06, + "loss": 1.2203, + "step": 8399 + }, + { + "epoch": 0.06, + "grad_norm": 4.596277722691164, + "learning_rate": 1.9990088081257666e-06, + "loss": 1.2669, + "step": 8400 + }, + { + "epoch": 0.06, + "grad_norm": 4.9488258276807935, + "learning_rate": 1.999008571871388e-06, + "loss": 1.3826, + "step": 8401 + }, + { + "epoch": 0.06, + "grad_norm": 4.613795635684511, + "learning_rate": 1.99900833558887e-06, + "loss": 1.218, + "step": 8402 + }, + { + "epoch": 0.06, + "grad_norm": 5.030838257239036, + "learning_rate": 1.999008099278214e-06, + "loss": 1.3821, + "step": 8403 + }, + { + "epoch": 0.06, + "grad_norm": 5.571990186711726, + "learning_rate": 1.9990078629394186e-06, + "loss": 1.6666, + "step": 8404 + }, + { + "epoch": 0.06, + "grad_norm": 5.171066589706494, + "learning_rate": 1.9990076265724848e-06, + "loss": 1.3002, + "step": 8405 + }, + { + "epoch": 0.06, + "grad_norm": 4.543127661532873, + "learning_rate": 1.999007390177412e-06, + "loss": 1.1585, + "step": 8406 + }, + { + "epoch": 0.06, + "grad_norm": 5.327490998974423, + "learning_rate": 1.999007153754201e-06, + "loss": 1.3534, + "step": 8407 + }, + { + "epoch": 0.06, + "grad_norm": 4.435630789880913, + "learning_rate": 1.9990069173028514e-06, + "loss": 1.3469, + "step": 8408 + }, + { + "epoch": 0.06, + "grad_norm": 6.643697238326478, + "learning_rate": 1.9990066808233625e-06, + "loss": 1.3808, + "step": 8409 + }, + { + "epoch": 0.06, + "grad_norm": 5.180431362370178, + "learning_rate": 1.9990064443157353e-06, + "loss": 1.4784, + "step": 8410 + }, + { + "epoch": 0.06, + "grad_norm": 4.199888607186224, + "learning_rate": 1.999006207779969e-06, + "loss": 1.2871, + "step": 8411 + }, + { + "epoch": 0.06, + "grad_norm": 4.234292284844202, + "learning_rate": 1.9990059712160646e-06, + "loss": 1.2079, + "step": 8412 + }, + { + "epoch": 0.06, + "grad_norm": 4.62723876031503, + "learning_rate": 1.999005734624021e-06, + "loss": 1.3377, + "step": 8413 + }, + { + "epoch": 0.06, + "grad_norm": 5.010857816758854, + "learning_rate": 1.999005498003839e-06, + "loss": 1.2954, + "step": 8414 + }, + { + "epoch": 0.06, + "grad_norm": 5.993205867312916, + "learning_rate": 1.9990052613555185e-06, + "loss": 1.2182, + "step": 8415 + }, + { + "epoch": 0.06, + "grad_norm": 7.339850469839931, + "learning_rate": 1.999005024679059e-06, + "loss": 1.4024, + "step": 8416 + }, + { + "epoch": 0.06, + "grad_norm": 4.207679844091889, + "learning_rate": 1.999004787974461e-06, + "loss": 1.3022, + "step": 8417 + }, + { + "epoch": 0.06, + "grad_norm": 4.438127125335588, + "learning_rate": 1.9990045512417245e-06, + "loss": 1.2749, + "step": 8418 + }, + { + "epoch": 0.06, + "grad_norm": 4.84464536686767, + "learning_rate": 1.999004314480849e-06, + "loss": 1.3476, + "step": 8419 + }, + { + "epoch": 0.06, + "grad_norm": 5.003419099622179, + "learning_rate": 1.999004077691835e-06, + "loss": 1.2251, + "step": 8420 + }, + { + "epoch": 0.06, + "grad_norm": 4.68382568667382, + "learning_rate": 1.9990038408746824e-06, + "loss": 1.3134, + "step": 8421 + }, + { + "epoch": 0.06, + "grad_norm": 5.846668941153372, + "learning_rate": 1.9990036040293914e-06, + "loss": 1.4848, + "step": 8422 + }, + { + "epoch": 0.06, + "grad_norm": 72.22770611690466, + "learning_rate": 1.9990033671559615e-06, + "loss": 1.6378, + "step": 8423 + }, + { + "epoch": 0.06, + "grad_norm": 4.913751231042928, + "learning_rate": 1.999003130254393e-06, + "loss": 1.3046, + "step": 8424 + }, + { + "epoch": 0.06, + "grad_norm": 5.563361558333302, + "learning_rate": 1.999002893324686e-06, + "loss": 1.2962, + "step": 8425 + }, + { + "epoch": 0.06, + "grad_norm": 4.621955668736098, + "learning_rate": 1.999002656366841e-06, + "loss": 1.3498, + "step": 8426 + }, + { + "epoch": 0.06, + "grad_norm": 4.335433039296151, + "learning_rate": 1.9990024193808568e-06, + "loss": 1.3384, + "step": 8427 + }, + { + "epoch": 0.06, + "grad_norm": 4.27659742870904, + "learning_rate": 1.999002182366734e-06, + "loss": 1.2137, + "step": 8428 + }, + { + "epoch": 0.06, + "grad_norm": 4.317254755188132, + "learning_rate": 1.9990019453244725e-06, + "loss": 1.3333, + "step": 8429 + }, + { + "epoch": 0.06, + "grad_norm": 4.588194714823662, + "learning_rate": 1.9990017082540727e-06, + "loss": 1.2952, + "step": 8430 + }, + { + "epoch": 0.06, + "grad_norm": 4.536797481229204, + "learning_rate": 1.9990014711555345e-06, + "loss": 1.3067, + "step": 8431 + }, + { + "epoch": 0.06, + "grad_norm": 4.458787242357463, + "learning_rate": 1.9990012340288574e-06, + "loss": 1.408, + "step": 8432 + }, + { + "epoch": 0.06, + "grad_norm": 5.765351765905371, + "learning_rate": 1.999000996874042e-06, + "loss": 1.1525, + "step": 8433 + }, + { + "epoch": 0.06, + "grad_norm": 8.862566168460543, + "learning_rate": 1.999000759691088e-06, + "loss": 1.473, + "step": 8434 + }, + { + "epoch": 0.06, + "grad_norm": 4.150802771803171, + "learning_rate": 1.999000522479995e-06, + "loss": 1.2269, + "step": 8435 + }, + { + "epoch": 0.06, + "grad_norm": 4.356398610103301, + "learning_rate": 1.999000285240764e-06, + "loss": 1.3826, + "step": 8436 + }, + { + "epoch": 0.06, + "grad_norm": 5.2467688215660395, + "learning_rate": 1.9990000479733947e-06, + "loss": 1.5328, + "step": 8437 + }, + { + "epoch": 0.06, + "grad_norm": 4.468282929336187, + "learning_rate": 1.9989998106778866e-06, + "loss": 1.2144, + "step": 8438 + }, + { + "epoch": 0.06, + "grad_norm": 4.361593505439121, + "learning_rate": 1.99899957335424e-06, + "loss": 1.3896, + "step": 8439 + }, + { + "epoch": 0.06, + "grad_norm": 5.53120283296376, + "learning_rate": 1.9989993360024547e-06, + "loss": 1.5312, + "step": 8440 + }, + { + "epoch": 0.06, + "grad_norm": 4.868631773734439, + "learning_rate": 1.9989990986225313e-06, + "loss": 1.2824, + "step": 8441 + }, + { + "epoch": 0.06, + "grad_norm": 4.514616463882857, + "learning_rate": 1.998998861214469e-06, + "loss": 1.3558, + "step": 8442 + }, + { + "epoch": 0.06, + "grad_norm": 4.427873595952543, + "learning_rate": 1.9989986237782687e-06, + "loss": 1.1876, + "step": 8443 + }, + { + "epoch": 0.06, + "grad_norm": 5.060044500434067, + "learning_rate": 1.9989983863139296e-06, + "loss": 1.3369, + "step": 8444 + }, + { + "epoch": 0.06, + "grad_norm": 6.61085970566859, + "learning_rate": 1.998998148821452e-06, + "loss": 1.4488, + "step": 8445 + }, + { + "epoch": 0.06, + "grad_norm": 4.429709658186737, + "learning_rate": 1.998997911300836e-06, + "loss": 1.4065, + "step": 8446 + }, + { + "epoch": 0.06, + "grad_norm": 4.455764795481005, + "learning_rate": 1.9989976737520816e-06, + "loss": 1.3559, + "step": 8447 + }, + { + "epoch": 0.06, + "grad_norm": 4.567685574616305, + "learning_rate": 1.9989974361751887e-06, + "loss": 1.4541, + "step": 8448 + }, + { + "epoch": 0.06, + "grad_norm": 5.869228089840253, + "learning_rate": 1.998997198570158e-06, + "loss": 1.3385, + "step": 8449 + }, + { + "epoch": 0.06, + "grad_norm": 4.325802290156743, + "learning_rate": 1.998996960936988e-06, + "loss": 1.2911, + "step": 8450 + }, + { + "epoch": 0.06, + "grad_norm": 4.317738952887137, + "learning_rate": 1.99899672327568e-06, + "loss": 1.207, + "step": 8451 + }, + { + "epoch": 0.06, + "grad_norm": 5.996256516985147, + "learning_rate": 1.9989964855862333e-06, + "loss": 1.4679, + "step": 8452 + }, + { + "epoch": 0.06, + "grad_norm": 8.550414431812067, + "learning_rate": 1.9989962478686483e-06, + "loss": 1.3527, + "step": 8453 + }, + { + "epoch": 0.06, + "grad_norm": 4.997985580091321, + "learning_rate": 1.998996010122925e-06, + "loss": 1.3554, + "step": 8454 + }, + { + "epoch": 0.06, + "grad_norm": 4.416346521208863, + "learning_rate": 1.9989957723490633e-06, + "loss": 1.22, + "step": 8455 + }, + { + "epoch": 0.06, + "grad_norm": 5.12896451783654, + "learning_rate": 1.9989955345470634e-06, + "loss": 1.1522, + "step": 8456 + }, + { + "epoch": 0.06, + "grad_norm": 4.00349993452822, + "learning_rate": 1.9989952967169246e-06, + "loss": 1.2234, + "step": 8457 + }, + { + "epoch": 0.06, + "grad_norm": 5.317189165747824, + "learning_rate": 1.998995058858648e-06, + "loss": 1.5081, + "step": 8458 + }, + { + "epoch": 0.06, + "grad_norm": 4.330171577519852, + "learning_rate": 1.9989948209722326e-06, + "loss": 1.1255, + "step": 8459 + }, + { + "epoch": 0.06, + "grad_norm": 4.655935757224283, + "learning_rate": 1.998994583057679e-06, + "loss": 1.2203, + "step": 8460 + }, + { + "epoch": 0.06, + "grad_norm": 4.726772061267374, + "learning_rate": 1.9989943451149872e-06, + "loss": 1.3104, + "step": 8461 + }, + { + "epoch": 0.06, + "grad_norm": 6.558257677936896, + "learning_rate": 1.9989941071441567e-06, + "loss": 1.3681, + "step": 8462 + }, + { + "epoch": 0.06, + "grad_norm": 4.606013238089367, + "learning_rate": 1.998993869145188e-06, + "loss": 1.334, + "step": 8463 + }, + { + "epoch": 0.06, + "grad_norm": 4.5024219623394774, + "learning_rate": 1.998993631118081e-06, + "loss": 1.514, + "step": 8464 + }, + { + "epoch": 0.06, + "grad_norm": 4.537845678210268, + "learning_rate": 1.998993393062836e-06, + "loss": 1.3453, + "step": 8465 + }, + { + "epoch": 0.06, + "grad_norm": 4.947351067739524, + "learning_rate": 1.9989931549794523e-06, + "loss": 1.3304, + "step": 8466 + }, + { + "epoch": 0.06, + "grad_norm": 4.6459693351653994, + "learning_rate": 1.99899291686793e-06, + "loss": 1.381, + "step": 8467 + }, + { + "epoch": 0.06, + "grad_norm": 5.210441146279967, + "learning_rate": 1.99899267872827e-06, + "loss": 1.4571, + "step": 8468 + }, + { + "epoch": 0.06, + "eval_loss": 1.5647797584533691, + "eval_runtime": 4.6033, + "eval_samples_per_second": 1.955, + "eval_steps_per_second": 1.086, + "step": 8468 + }, + { + "epoch": 0.06, + "grad_norm": 4.6295052178558675, + "learning_rate": 1.9989924405604715e-06, + "loss": 1.3832, + "step": 8469 + }, + { + "epoch": 0.06, + "grad_norm": 4.476411600889492, + "learning_rate": 1.9989922023645347e-06, + "loss": 1.2713, + "step": 8470 + }, + { + "epoch": 0.06, + "grad_norm": 4.444776633143054, + "learning_rate": 1.9989919641404596e-06, + "loss": 1.4442, + "step": 8471 + }, + { + "epoch": 0.06, + "grad_norm": 4.3045006734689135, + "learning_rate": 1.998991725888246e-06, + "loss": 1.3287, + "step": 8472 + }, + { + "epoch": 0.06, + "grad_norm": 4.949909163837501, + "learning_rate": 1.9989914876078948e-06, + "loss": 1.4534, + "step": 8473 + }, + { + "epoch": 0.06, + "grad_norm": 4.587667424988372, + "learning_rate": 1.9989912492994047e-06, + "loss": 1.2381, + "step": 8474 + }, + { + "epoch": 0.06, + "grad_norm": 4.733889509541536, + "learning_rate": 1.9989910109627762e-06, + "loss": 1.3181, + "step": 8475 + }, + { + "epoch": 0.06, + "grad_norm": 4.495264978351784, + "learning_rate": 1.99899077259801e-06, + "loss": 1.3402, + "step": 8476 + }, + { + "epoch": 0.06, + "grad_norm": 4.967971490452831, + "learning_rate": 1.998990534205105e-06, + "loss": 1.2031, + "step": 8477 + }, + { + "epoch": 0.06, + "grad_norm": 4.494943007961482, + "learning_rate": 1.9989902957840623e-06, + "loss": 1.4279, + "step": 8478 + }, + { + "epoch": 0.06, + "grad_norm": 4.199579469091465, + "learning_rate": 1.998990057334881e-06, + "loss": 1.4212, + "step": 8479 + }, + { + "epoch": 0.06, + "grad_norm": 4.757657046397719, + "learning_rate": 1.9989898188575615e-06, + "loss": 1.2369, + "step": 8480 + }, + { + "epoch": 0.06, + "grad_norm": 5.7576924981494, + "learning_rate": 1.998989580352104e-06, + "loss": 1.4905, + "step": 8481 + }, + { + "epoch": 0.06, + "grad_norm": 5.049433518561161, + "learning_rate": 1.998989341818508e-06, + "loss": 1.4028, + "step": 8482 + }, + { + "epoch": 0.06, + "grad_norm": 4.6262923165133145, + "learning_rate": 1.998989103256774e-06, + "loss": 1.3761, + "step": 8483 + }, + { + "epoch": 0.06, + "grad_norm": 4.655467986349239, + "learning_rate": 1.9989888646669017e-06, + "loss": 1.553, + "step": 8484 + }, + { + "epoch": 0.06, + "grad_norm": 4.986639700257691, + "learning_rate": 1.9989886260488915e-06, + "loss": 1.4301, + "step": 8485 + }, + { + "epoch": 0.06, + "grad_norm": 5.2966895098218885, + "learning_rate": 1.9989883874027427e-06, + "loss": 1.53, + "step": 8486 + }, + { + "epoch": 0.06, + "grad_norm": 4.5444484427768295, + "learning_rate": 1.998988148728456e-06, + "loss": 1.3442, + "step": 8487 + }, + { + "epoch": 0.06, + "grad_norm": 5.008691070223799, + "learning_rate": 1.998987910026031e-06, + "loss": 1.3694, + "step": 8488 + }, + { + "epoch": 0.06, + "grad_norm": 4.448071289407071, + "learning_rate": 1.9989876712954676e-06, + "loss": 1.3987, + "step": 8489 + }, + { + "epoch": 0.06, + "grad_norm": 4.73707677272594, + "learning_rate": 1.9989874325367665e-06, + "loss": 1.2986, + "step": 8490 + }, + { + "epoch": 0.06, + "grad_norm": 4.439899938494986, + "learning_rate": 1.998987193749927e-06, + "loss": 1.2453, + "step": 8491 + }, + { + "epoch": 0.06, + "grad_norm": 4.431748355424933, + "learning_rate": 1.998986954934949e-06, + "loss": 1.289, + "step": 8492 + }, + { + "epoch": 0.06, + "grad_norm": 4.300642657485987, + "learning_rate": 1.9989867160918335e-06, + "loss": 1.249, + "step": 8493 + }, + { + "epoch": 0.06, + "grad_norm": 5.6811034368925135, + "learning_rate": 1.99898647722058e-06, + "loss": 1.1791, + "step": 8494 + }, + { + "epoch": 0.06, + "grad_norm": 4.26550025735318, + "learning_rate": 1.9989862383211874e-06, + "loss": 1.2816, + "step": 8495 + }, + { + "epoch": 0.06, + "grad_norm": 4.53708574090178, + "learning_rate": 1.9989859993936577e-06, + "loss": 1.1602, + "step": 8496 + }, + { + "epoch": 0.06, + "grad_norm": 4.742744124422893, + "learning_rate": 1.998985760437989e-06, + "loss": 1.395, + "step": 8497 + }, + { + "epoch": 0.06, + "grad_norm": 4.4690881576926325, + "learning_rate": 1.998985521454183e-06, + "loss": 1.353, + "step": 8498 + }, + { + "epoch": 0.06, + "grad_norm": 4.2509720499426775, + "learning_rate": 1.9989852824422385e-06, + "loss": 1.4282, + "step": 8499 + }, + { + "epoch": 0.06, + "grad_norm": 4.257946868797118, + "learning_rate": 1.998985043402156e-06, + "loss": 1.4332, + "step": 8500 + }, + { + "epoch": 0.06, + "grad_norm": 4.435618489918155, + "learning_rate": 1.9989848043339353e-06, + "loss": 1.3996, + "step": 8501 + }, + { + "epoch": 0.06, + "grad_norm": 4.331520898794051, + "learning_rate": 1.9989845652375767e-06, + "loss": 1.217, + "step": 8502 + }, + { + "epoch": 0.06, + "grad_norm": 5.3208047617569925, + "learning_rate": 1.9989843261130797e-06, + "loss": 1.4482, + "step": 8503 + }, + { + "epoch": 0.06, + "grad_norm": 4.213078074750517, + "learning_rate": 1.998984086960445e-06, + "loss": 1.3417, + "step": 8504 + }, + { + "epoch": 0.06, + "grad_norm": 4.277358132055113, + "learning_rate": 1.998983847779672e-06, + "loss": 1.2931, + "step": 8505 + }, + { + "epoch": 0.06, + "grad_norm": 4.771436607458458, + "learning_rate": 1.9989836085707614e-06, + "loss": 1.3588, + "step": 8506 + }, + { + "epoch": 0.06, + "grad_norm": 4.872825456159983, + "learning_rate": 1.998983369333712e-06, + "loss": 1.3651, + "step": 8507 + }, + { + "epoch": 0.06, + "grad_norm": 6.177634031680093, + "learning_rate": 1.9989831300685252e-06, + "loss": 1.3515, + "step": 8508 + }, + { + "epoch": 0.06, + "grad_norm": 4.793797796284552, + "learning_rate": 1.9989828907752e-06, + "loss": 1.3324, + "step": 8509 + }, + { + "epoch": 0.06, + "grad_norm": 5.195979347895909, + "learning_rate": 1.998982651453737e-06, + "loss": 1.4251, + "step": 8510 + }, + { + "epoch": 0.06, + "grad_norm": 4.199487295698571, + "learning_rate": 1.998982412104136e-06, + "loss": 1.4004, + "step": 8511 + }, + { + "epoch": 0.06, + "grad_norm": 5.073184223910384, + "learning_rate": 1.9989821727263964e-06, + "loss": 1.369, + "step": 8512 + }, + { + "epoch": 0.06, + "grad_norm": 4.8153718244069665, + "learning_rate": 1.9989819333205196e-06, + "loss": 1.4269, + "step": 8513 + }, + { + "epoch": 0.06, + "grad_norm": 5.342580593413908, + "learning_rate": 1.9989816938865045e-06, + "loss": 1.3005, + "step": 8514 + }, + { + "epoch": 0.06, + "grad_norm": 4.495641755331039, + "learning_rate": 1.998981454424351e-06, + "loss": 1.4519, + "step": 8515 + }, + { + "epoch": 0.06, + "grad_norm": 4.858941609644203, + "learning_rate": 1.99898121493406e-06, + "loss": 1.3741, + "step": 8516 + }, + { + "epoch": 0.06, + "grad_norm": 4.470121672802636, + "learning_rate": 1.998980975415631e-06, + "loss": 1.281, + "step": 8517 + }, + { + "epoch": 0.06, + "grad_norm": 4.609799640590294, + "learning_rate": 1.9989807358690638e-06, + "loss": 1.5039, + "step": 8518 + }, + { + "epoch": 0.06, + "grad_norm": 4.182923009388997, + "learning_rate": 1.998980496294359e-06, + "loss": 1.1797, + "step": 8519 + }, + { + "epoch": 0.06, + "grad_norm": 4.372301603110184, + "learning_rate": 1.998980256691516e-06, + "loss": 1.4187, + "step": 8520 + }, + { + "epoch": 0.06, + "grad_norm": 4.772789349020545, + "learning_rate": 1.998980017060535e-06, + "loss": 1.4531, + "step": 8521 + }, + { + "epoch": 0.06, + "grad_norm": 4.849981144748239, + "learning_rate": 1.998979777401416e-06, + "loss": 1.4215, + "step": 8522 + }, + { + "epoch": 0.06, + "grad_norm": 4.43961261312786, + "learning_rate": 1.9989795377141594e-06, + "loss": 1.4048, + "step": 8523 + }, + { + "epoch": 0.06, + "grad_norm": 4.959397781374239, + "learning_rate": 1.9989792979987645e-06, + "loss": 1.4702, + "step": 8524 + }, + { + "epoch": 0.06, + "grad_norm": 6.017486661827444, + "learning_rate": 1.998979058255232e-06, + "loss": 1.4678, + "step": 8525 + }, + { + "epoch": 0.06, + "grad_norm": 4.336367582601711, + "learning_rate": 1.9989788184835612e-06, + "loss": 1.4022, + "step": 8526 + }, + { + "epoch": 0.06, + "grad_norm": 5.056854763137082, + "learning_rate": 1.998978578683753e-06, + "loss": 1.3941, + "step": 8527 + }, + { + "epoch": 0.06, + "grad_norm": 5.071298356844417, + "learning_rate": 1.9989783388558063e-06, + "loss": 1.2104, + "step": 8528 + }, + { + "epoch": 0.06, + "grad_norm": 5.56109902802452, + "learning_rate": 1.9989780989997223e-06, + "loss": 1.3429, + "step": 8529 + }, + { + "epoch": 0.06, + "grad_norm": 4.272698035148218, + "learning_rate": 1.9989778591155e-06, + "loss": 1.4256, + "step": 8530 + }, + { + "epoch": 0.06, + "grad_norm": 4.4819311694642066, + "learning_rate": 1.9989776192031397e-06, + "loss": 1.3249, + "step": 8531 + }, + { + "epoch": 0.06, + "grad_norm": 4.77110257315929, + "learning_rate": 1.9989773792626417e-06, + "loss": 1.3022, + "step": 8532 + }, + { + "epoch": 0.06, + "grad_norm": 5.6644398684298025, + "learning_rate": 1.998977139294006e-06, + "loss": 1.2276, + "step": 8533 + }, + { + "epoch": 0.06, + "grad_norm": 5.143495642461218, + "learning_rate": 1.9989768992972323e-06, + "loss": 1.6171, + "step": 8534 + }, + { + "epoch": 0.06, + "grad_norm": 4.747245216439308, + "learning_rate": 1.998976659272321e-06, + "loss": 1.2689, + "step": 8535 + }, + { + "epoch": 0.06, + "grad_norm": 5.049445485742834, + "learning_rate": 1.9989764192192714e-06, + "loss": 1.4048, + "step": 8536 + }, + { + "epoch": 0.06, + "grad_norm": 5.059810980609118, + "learning_rate": 1.998976179138084e-06, + "loss": 1.3626, + "step": 8537 + }, + { + "epoch": 0.06, + "grad_norm": 4.626880381269344, + "learning_rate": 1.9989759390287592e-06, + "loss": 1.3461, + "step": 8538 + }, + { + "epoch": 0.06, + "grad_norm": 4.278100266976135, + "learning_rate": 1.9989756988912963e-06, + "loss": 1.404, + "step": 8539 + }, + { + "epoch": 0.06, + "grad_norm": 4.259013662464501, + "learning_rate": 1.9989754587256954e-06, + "loss": 1.2941, + "step": 8540 + }, + { + "epoch": 0.06, + "grad_norm": 4.8646773905077945, + "learning_rate": 1.998975218531957e-06, + "loss": 1.3921, + "step": 8541 + }, + { + "epoch": 0.06, + "eval_loss": 1.5622339248657227, + "eval_runtime": 4.5857, + "eval_samples_per_second": 1.963, + "eval_steps_per_second": 1.09, + "step": 8541 + }, + { + "epoch": 0.06, + "grad_norm": 4.560308396326247, + "learning_rate": 1.9989749783100805e-06, + "loss": 1.2731, + "step": 8542 + }, + { + "epoch": 0.06, + "grad_norm": 4.444471595101668, + "learning_rate": 1.9989747380600664e-06, + "loss": 1.4093, + "step": 8543 + }, + { + "epoch": 0.06, + "grad_norm": 4.67091987159255, + "learning_rate": 1.9989744977819147e-06, + "loss": 1.4359, + "step": 8544 + }, + { + "epoch": 0.06, + "grad_norm": 4.205426977104215, + "learning_rate": 1.9989742574756246e-06, + "loss": 1.2703, + "step": 8545 + }, + { + "epoch": 0.06, + "grad_norm": 4.532299284853898, + "learning_rate": 1.9989740171411973e-06, + "loss": 1.233, + "step": 8546 + }, + { + "epoch": 0.06, + "grad_norm": 4.053406104112797, + "learning_rate": 1.998973776778632e-06, + "loss": 1.1172, + "step": 8547 + }, + { + "epoch": 0.06, + "grad_norm": 4.904905801012794, + "learning_rate": 1.9989735363879288e-06, + "loss": 1.4605, + "step": 8548 + }, + { + "epoch": 0.06, + "grad_norm": 4.886231215167016, + "learning_rate": 1.9989732959690883e-06, + "loss": 1.5226, + "step": 8549 + }, + { + "epoch": 0.06, + "grad_norm": 4.361488596414847, + "learning_rate": 1.9989730555221094e-06, + "loss": 1.2426, + "step": 8550 + }, + { + "epoch": 0.06, + "grad_norm": 4.443425779548887, + "learning_rate": 1.998972815046993e-06, + "loss": 1.4173, + "step": 8551 + }, + { + "epoch": 0.06, + "grad_norm": 5.1952922918194, + "learning_rate": 1.9989725745437393e-06, + "loss": 1.2579, + "step": 8552 + }, + { + "epoch": 0.06, + "grad_norm": 4.3948464438093415, + "learning_rate": 1.9989723340123473e-06, + "loss": 1.4079, + "step": 8553 + }, + { + "epoch": 0.06, + "grad_norm": 4.261899456293534, + "learning_rate": 1.9989720934528176e-06, + "loss": 1.3644, + "step": 8554 + }, + { + "epoch": 0.06, + "grad_norm": 4.506906797424179, + "learning_rate": 1.9989718528651504e-06, + "loss": 1.3937, + "step": 8555 + }, + { + "epoch": 0.06, + "grad_norm": 5.6651610636706735, + "learning_rate": 1.9989716122493455e-06, + "loss": 1.4507, + "step": 8556 + }, + { + "epoch": 0.06, + "grad_norm": 4.243794223588025, + "learning_rate": 1.9989713716054027e-06, + "loss": 1.353, + "step": 8557 + }, + { + "epoch": 0.06, + "grad_norm": 4.589656190547693, + "learning_rate": 1.9989711309333227e-06, + "loss": 1.312, + "step": 8558 + }, + { + "epoch": 0.06, + "grad_norm": 7.183830005026017, + "learning_rate": 1.9989708902331042e-06, + "loss": 1.2717, + "step": 8559 + }, + { + "epoch": 0.06, + "grad_norm": 4.570281138766998, + "learning_rate": 1.9989706495047486e-06, + "loss": 1.5105, + "step": 8560 + }, + { + "epoch": 0.06, + "grad_norm": 4.984154098685025, + "learning_rate": 1.9989704087482554e-06, + "loss": 1.406, + "step": 8561 + }, + { + "epoch": 0.06, + "grad_norm": 4.6692178026189355, + "learning_rate": 1.998970167963624e-06, + "loss": 1.5083, + "step": 8562 + }, + { + "epoch": 0.06, + "grad_norm": 4.354950180037289, + "learning_rate": 1.9989699271508554e-06, + "loss": 1.2876, + "step": 8563 + }, + { + "epoch": 0.06, + "grad_norm": 5.692769757742883, + "learning_rate": 1.998969686309949e-06, + "loss": 1.3777, + "step": 8564 + }, + { + "epoch": 0.06, + "grad_norm": 10.344158078038417, + "learning_rate": 1.9989694454409046e-06, + "loss": 1.523, + "step": 8565 + }, + { + "epoch": 0.06, + "grad_norm": 5.157777099201219, + "learning_rate": 1.998969204543723e-06, + "loss": 1.4596, + "step": 8566 + }, + { + "epoch": 0.06, + "grad_norm": 5.54091234629367, + "learning_rate": 1.9989689636184035e-06, + "loss": 1.321, + "step": 8567 + }, + { + "epoch": 0.06, + "grad_norm": 4.575061344334274, + "learning_rate": 1.9989687226649468e-06, + "loss": 1.4686, + "step": 8568 + }, + { + "epoch": 0.06, + "grad_norm": 4.892550476958275, + "learning_rate": 1.9989684816833525e-06, + "loss": 1.3026, + "step": 8569 + }, + { + "epoch": 0.06, + "grad_norm": 4.828909312102762, + "learning_rate": 1.9989682406736197e-06, + "loss": 1.4125, + "step": 8570 + }, + { + "epoch": 0.06, + "grad_norm": 7.45745693202822, + "learning_rate": 1.9989679996357502e-06, + "loss": 1.3367, + "step": 8571 + }, + { + "epoch": 0.06, + "grad_norm": 4.4022759781226775, + "learning_rate": 1.9989677585697423e-06, + "loss": 1.2662, + "step": 8572 + }, + { + "epoch": 0.06, + "grad_norm": 4.340356873136581, + "learning_rate": 1.998967517475597e-06, + "loss": 1.2306, + "step": 8573 + }, + { + "epoch": 0.06, + "grad_norm": 5.419841733654, + "learning_rate": 1.998967276353315e-06, + "loss": 1.3005, + "step": 8574 + }, + { + "epoch": 0.06, + "grad_norm": 4.832158476472871, + "learning_rate": 1.9989670352028947e-06, + "loss": 1.3494, + "step": 8575 + }, + { + "epoch": 0.06, + "grad_norm": 4.430984219423789, + "learning_rate": 1.998966794024337e-06, + "loss": 1.2929, + "step": 8576 + }, + { + "epoch": 0.06, + "grad_norm": 4.248625008130523, + "learning_rate": 1.9989665528176414e-06, + "loss": 1.3362, + "step": 8577 + }, + { + "epoch": 0.06, + "grad_norm": 5.614643944877151, + "learning_rate": 1.9989663115828083e-06, + "loss": 1.3905, + "step": 8578 + }, + { + "epoch": 0.06, + "grad_norm": 6.63337513444417, + "learning_rate": 1.9989660703198377e-06, + "loss": 1.2821, + "step": 8579 + }, + { + "epoch": 0.06, + "grad_norm": 4.526822811437598, + "learning_rate": 1.9989658290287295e-06, + "loss": 1.4322, + "step": 8580 + }, + { + "epoch": 0.06, + "grad_norm": 4.8388842303922655, + "learning_rate": 1.998965587709484e-06, + "loss": 1.4267, + "step": 8581 + }, + { + "epoch": 0.06, + "grad_norm": 4.323232523098396, + "learning_rate": 1.998965346362101e-06, + "loss": 1.3074, + "step": 8582 + }, + { + "epoch": 0.06, + "grad_norm": 4.417307975487444, + "learning_rate": 1.9989651049865802e-06, + "loss": 1.3595, + "step": 8583 + }, + { + "epoch": 0.06, + "grad_norm": 4.3269613111556255, + "learning_rate": 1.998964863582922e-06, + "loss": 1.2635, + "step": 8584 + }, + { + "epoch": 0.06, + "grad_norm": 4.335371860108603, + "learning_rate": 1.9989646221511264e-06, + "loss": 1.3606, + "step": 8585 + }, + { + "epoch": 0.06, + "grad_norm": 4.359495829838069, + "learning_rate": 1.998964380691193e-06, + "loss": 1.3434, + "step": 8586 + }, + { + "epoch": 0.06, + "grad_norm": 4.44770043890702, + "learning_rate": 1.998964139203122e-06, + "loss": 1.4353, + "step": 8587 + }, + { + "epoch": 0.06, + "grad_norm": 4.915849793456516, + "learning_rate": 1.998963897686914e-06, + "loss": 1.4704, + "step": 8588 + }, + { + "epoch": 0.06, + "grad_norm": 4.30994049417916, + "learning_rate": 1.9989636561425685e-06, + "loss": 1.2936, + "step": 8589 + }, + { + "epoch": 0.06, + "grad_norm": 4.575542425437791, + "learning_rate": 1.9989634145700853e-06, + "loss": 1.2776, + "step": 8590 + }, + { + "epoch": 0.06, + "grad_norm": 4.611122011833436, + "learning_rate": 1.9989631729694644e-06, + "loss": 1.4858, + "step": 8591 + }, + { + "epoch": 0.06, + "grad_norm": 4.243257737791913, + "learning_rate": 1.9989629313407064e-06, + "loss": 1.3865, + "step": 8592 + }, + { + "epoch": 0.06, + "grad_norm": 4.216379044773727, + "learning_rate": 1.998962689683811e-06, + "loss": 1.3655, + "step": 8593 + }, + { + "epoch": 0.06, + "grad_norm": 4.777388875462107, + "learning_rate": 1.9989624479987777e-06, + "loss": 1.3467, + "step": 8594 + }, + { + "epoch": 0.06, + "grad_norm": 4.439491769251264, + "learning_rate": 1.998962206285607e-06, + "loss": 1.2773, + "step": 8595 + }, + { + "epoch": 0.06, + "grad_norm": 4.566005422118384, + "learning_rate": 1.998961964544299e-06, + "loss": 1.3942, + "step": 8596 + }, + { + "epoch": 0.06, + "grad_norm": 4.833735778742291, + "learning_rate": 1.9989617227748536e-06, + "loss": 1.4937, + "step": 8597 + }, + { + "epoch": 0.06, + "grad_norm": 4.460196615545827, + "learning_rate": 1.998961480977271e-06, + "loss": 1.3117, + "step": 8598 + }, + { + "epoch": 0.06, + "grad_norm": 4.8489372465448275, + "learning_rate": 1.9989612391515507e-06, + "loss": 1.5303, + "step": 8599 + }, + { + "epoch": 0.06, + "grad_norm": 4.515736253131246, + "learning_rate": 1.9989609972976932e-06, + "loss": 1.2889, + "step": 8600 + }, + { + "epoch": 0.06, + "grad_norm": 6.851979426866184, + "learning_rate": 1.998960755415698e-06, + "loss": 1.5018, + "step": 8601 + }, + { + "epoch": 0.06, + "grad_norm": 4.320321043670359, + "learning_rate": 1.9989605135055652e-06, + "loss": 1.3428, + "step": 8602 + }, + { + "epoch": 0.06, + "grad_norm": 4.067919097885855, + "learning_rate": 1.9989602715672955e-06, + "loss": 1.3363, + "step": 8603 + }, + { + "epoch": 0.06, + "grad_norm": 4.5896400589841635, + "learning_rate": 1.998960029600888e-06, + "loss": 1.2651, + "step": 8604 + }, + { + "epoch": 0.06, + "grad_norm": 4.521195068481925, + "learning_rate": 1.9989597876063436e-06, + "loss": 1.394, + "step": 8605 + }, + { + "epoch": 0.06, + "grad_norm": 4.333137968003755, + "learning_rate": 1.9989595455836615e-06, + "loss": 1.38, + "step": 8606 + }, + { + "epoch": 0.06, + "grad_norm": 4.2335118225035835, + "learning_rate": 1.998959303532842e-06, + "loss": 1.2555, + "step": 8607 + }, + { + "epoch": 0.06, + "grad_norm": 4.179939185913571, + "learning_rate": 1.9989590614538854e-06, + "loss": 1.3689, + "step": 8608 + }, + { + "epoch": 0.06, + "grad_norm": 8.37935699734852, + "learning_rate": 1.998958819346791e-06, + "loss": 1.4963, + "step": 8609 + }, + { + "epoch": 0.06, + "grad_norm": 5.168749470757185, + "learning_rate": 1.99895857721156e-06, + "loss": 1.4045, + "step": 8610 + }, + { + "epoch": 0.06, + "grad_norm": 4.515727329681631, + "learning_rate": 1.998958335048191e-06, + "loss": 1.274, + "step": 8611 + }, + { + "epoch": 0.06, + "grad_norm": 4.586023389036626, + "learning_rate": 1.9989580928566847e-06, + "loss": 1.4079, + "step": 8612 + }, + { + "epoch": 0.06, + "grad_norm": 4.306414951992419, + "learning_rate": 1.9989578506370417e-06, + "loss": 1.4126, + "step": 8613 + }, + { + "epoch": 0.06, + "grad_norm": 4.822983272936278, + "learning_rate": 1.9989576083892606e-06, + "loss": 1.4169, + "step": 8614 + }, + { + "epoch": 0.06, + "eval_loss": 1.561651587486267, + "eval_runtime": 4.6098, + "eval_samples_per_second": 1.952, + "eval_steps_per_second": 1.085, + "step": 8614 + }, + { + "epoch": 0.06, + "grad_norm": 5.404938213549534, + "learning_rate": 1.9989573661133427e-06, + "loss": 1.48, + "step": 8615 + }, + { + "epoch": 0.06, + "grad_norm": 4.621790196337391, + "learning_rate": 1.9989571238092873e-06, + "loss": 1.283, + "step": 8616 + }, + { + "epoch": 0.06, + "grad_norm": 4.443049372904137, + "learning_rate": 1.9989568814770947e-06, + "loss": 1.3574, + "step": 8617 + }, + { + "epoch": 0.06, + "grad_norm": 4.801069293612943, + "learning_rate": 1.9989566391167645e-06, + "loss": 1.3953, + "step": 8618 + }, + { + "epoch": 0.06, + "grad_norm": 4.90808002632212, + "learning_rate": 1.9989563967282976e-06, + "loss": 1.5248, + "step": 8619 + }, + { + "epoch": 0.06, + "grad_norm": 4.513745233629364, + "learning_rate": 1.9989561543116927e-06, + "loss": 1.4299, + "step": 8620 + }, + { + "epoch": 0.06, + "grad_norm": 4.625267465917626, + "learning_rate": 1.9989559118669506e-06, + "loss": 1.3427, + "step": 8621 + }, + { + "epoch": 0.06, + "grad_norm": 4.596777102150783, + "learning_rate": 1.9989556693940718e-06, + "loss": 1.4271, + "step": 8622 + }, + { + "epoch": 0.06, + "grad_norm": 4.487247482653133, + "learning_rate": 1.9989554268930553e-06, + "loss": 1.2861, + "step": 8623 + }, + { + "epoch": 0.06, + "grad_norm": 5.184012572654904, + "learning_rate": 1.9989551843639018e-06, + "loss": 1.5925, + "step": 8624 + }, + { + "epoch": 0.06, + "grad_norm": 4.9471120778181055, + "learning_rate": 1.998954941806611e-06, + "loss": 1.5653, + "step": 8625 + }, + { + "epoch": 0.06, + "grad_norm": 4.994694318313504, + "learning_rate": 1.998954699221183e-06, + "loss": 1.4042, + "step": 8626 + }, + { + "epoch": 0.06, + "grad_norm": 5.666546802417481, + "learning_rate": 1.9989544566076176e-06, + "loss": 1.5053, + "step": 8627 + }, + { + "epoch": 0.06, + "grad_norm": 4.576464304521912, + "learning_rate": 1.998954213965915e-06, + "loss": 1.3794, + "step": 8628 + }, + { + "epoch": 0.06, + "grad_norm": 6.8434106582639345, + "learning_rate": 1.998953971296075e-06, + "loss": 1.524, + "step": 8629 + }, + { + "epoch": 0.06, + "grad_norm": 4.458196304928374, + "learning_rate": 1.9989537285980986e-06, + "loss": 1.363, + "step": 8630 + }, + { + "epoch": 0.06, + "grad_norm": 5.076340426993319, + "learning_rate": 1.998953485871984e-06, + "loss": 1.4671, + "step": 8631 + }, + { + "epoch": 0.06, + "grad_norm": 4.999092485124452, + "learning_rate": 1.9989532431177327e-06, + "loss": 1.4206, + "step": 8632 + }, + { + "epoch": 0.06, + "grad_norm": 4.135494016866375, + "learning_rate": 1.998953000335344e-06, + "loss": 1.2825, + "step": 8633 + }, + { + "epoch": 0.06, + "grad_norm": 4.289975665728462, + "learning_rate": 1.9989527575248185e-06, + "loss": 1.3606, + "step": 8634 + }, + { + "epoch": 0.06, + "grad_norm": 4.120967424800536, + "learning_rate": 1.9989525146861553e-06, + "loss": 1.0945, + "step": 8635 + }, + { + "epoch": 0.06, + "grad_norm": 6.412239856672185, + "learning_rate": 1.9989522718193553e-06, + "loss": 1.2861, + "step": 8636 + }, + { + "epoch": 0.06, + "grad_norm": 4.124055552597445, + "learning_rate": 1.998952028924418e-06, + "loss": 1.2708, + "step": 8637 + }, + { + "epoch": 0.06, + "grad_norm": 4.2699944061669655, + "learning_rate": 1.998951786001344e-06, + "loss": 1.462, + "step": 8638 + }, + { + "epoch": 0.06, + "grad_norm": 4.808028542220539, + "learning_rate": 1.9989515430501324e-06, + "loss": 1.516, + "step": 8639 + }, + { + "epoch": 0.06, + "grad_norm": 4.530550955331592, + "learning_rate": 1.9989513000707838e-06, + "loss": 1.3129, + "step": 8640 + }, + { + "epoch": 0.06, + "grad_norm": 5.170794436694128, + "learning_rate": 1.998951057063298e-06, + "loss": 1.1989, + "step": 8641 + }, + { + "epoch": 0.06, + "grad_norm": 4.477557905506454, + "learning_rate": 1.9989508140276746e-06, + "loss": 1.3618, + "step": 8642 + }, + { + "epoch": 0.06, + "grad_norm": 4.297419670431285, + "learning_rate": 1.998950570963915e-06, + "loss": 1.3248, + "step": 8643 + }, + { + "epoch": 0.06, + "grad_norm": 4.834191699367694, + "learning_rate": 1.9989503278720176e-06, + "loss": 1.3471, + "step": 8644 + }, + { + "epoch": 0.06, + "grad_norm": 4.4287647826822845, + "learning_rate": 1.998950084751983e-06, + "loss": 1.4201, + "step": 8645 + }, + { + "epoch": 0.06, + "grad_norm": 4.186716153956602, + "learning_rate": 1.998949841603812e-06, + "loss": 1.3603, + "step": 8646 + }, + { + "epoch": 0.06, + "grad_norm": 4.490008277132269, + "learning_rate": 1.998949598427503e-06, + "loss": 1.3985, + "step": 8647 + }, + { + "epoch": 0.06, + "grad_norm": 5.426720452827193, + "learning_rate": 1.9989493552230577e-06, + "loss": 1.2834, + "step": 8648 + }, + { + "epoch": 0.06, + "grad_norm": 4.494441318763501, + "learning_rate": 1.998949111990475e-06, + "loss": 1.4092, + "step": 8649 + }, + { + "epoch": 0.06, + "grad_norm": 4.592067127642173, + "learning_rate": 1.998948868729755e-06, + "loss": 1.3878, + "step": 8650 + }, + { + "epoch": 0.06, + "grad_norm": 4.566106280331906, + "learning_rate": 1.9989486254408986e-06, + "loss": 1.3731, + "step": 8651 + }, + { + "epoch": 0.06, + "grad_norm": 4.179776470294157, + "learning_rate": 1.9989483821239044e-06, + "loss": 1.3391, + "step": 8652 + }, + { + "epoch": 0.06, + "grad_norm": 5.197549322309192, + "learning_rate": 1.9989481387787735e-06, + "loss": 1.5253, + "step": 8653 + }, + { + "epoch": 0.06, + "grad_norm": 5.67446584414418, + "learning_rate": 1.998947895405506e-06, + "loss": 1.4139, + "step": 8654 + }, + { + "epoch": 0.06, + "grad_norm": 4.054368467752239, + "learning_rate": 1.9989476520041006e-06, + "loss": 1.1647, + "step": 8655 + }, + { + "epoch": 0.06, + "grad_norm": 6.16009416378856, + "learning_rate": 1.9989474085745586e-06, + "loss": 1.6016, + "step": 8656 + }, + { + "epoch": 0.06, + "grad_norm": 4.4399785100788876, + "learning_rate": 1.9989471651168795e-06, + "loss": 1.2265, + "step": 8657 + }, + { + "epoch": 0.06, + "grad_norm": 4.580783083064213, + "learning_rate": 1.9989469216310636e-06, + "loss": 1.2891, + "step": 8658 + }, + { + "epoch": 0.06, + "grad_norm": 4.496061509764125, + "learning_rate": 1.9989466781171105e-06, + "loss": 1.3259, + "step": 8659 + }, + { + "epoch": 0.06, + "grad_norm": 4.411478429750405, + "learning_rate": 1.9989464345750203e-06, + "loss": 1.3539, + "step": 8660 + }, + { + "epoch": 0.06, + "grad_norm": 4.684656237641494, + "learning_rate": 1.9989461910047933e-06, + "loss": 1.2303, + "step": 8661 + }, + { + "epoch": 0.06, + "grad_norm": 4.460270820621148, + "learning_rate": 1.9989459474064292e-06, + "loss": 1.4033, + "step": 8662 + }, + { + "epoch": 0.06, + "grad_norm": 4.3521566859203125, + "learning_rate": 1.998945703779928e-06, + "loss": 1.3895, + "step": 8663 + }, + { + "epoch": 0.06, + "grad_norm": 4.153102238423549, + "learning_rate": 1.99894546012529e-06, + "loss": 1.2866, + "step": 8664 + }, + { + "epoch": 0.06, + "grad_norm": 4.535488259940307, + "learning_rate": 1.9989452164425147e-06, + "loss": 1.3091, + "step": 8665 + }, + { + "epoch": 0.06, + "grad_norm": 4.2177008668247735, + "learning_rate": 1.998944972731603e-06, + "loss": 1.3291, + "step": 8666 + }, + { + "epoch": 0.06, + "grad_norm": 4.394143711573607, + "learning_rate": 1.998944728992554e-06, + "loss": 1.1878, + "step": 8667 + }, + { + "epoch": 0.06, + "grad_norm": 4.317990618328454, + "learning_rate": 1.998944485225368e-06, + "loss": 1.2873, + "step": 8668 + }, + { + "epoch": 0.06, + "grad_norm": 4.860610716848633, + "learning_rate": 1.9989442414300453e-06, + "loss": 1.3647, + "step": 8669 + }, + { + "epoch": 0.06, + "grad_norm": 5.418178515526341, + "learning_rate": 1.9989439976065856e-06, + "loss": 1.343, + "step": 8670 + }, + { + "epoch": 0.06, + "grad_norm": 4.309911841757617, + "learning_rate": 1.9989437537549887e-06, + "loss": 1.3987, + "step": 8671 + }, + { + "epoch": 0.06, + "grad_norm": 6.134460312253366, + "learning_rate": 1.998943509875255e-06, + "loss": 1.1095, + "step": 8672 + }, + { + "epoch": 0.06, + "grad_norm": 5.69042170920404, + "learning_rate": 1.9989432659673846e-06, + "loss": 1.4126, + "step": 8673 + }, + { + "epoch": 0.06, + "grad_norm": 4.205104834688021, + "learning_rate": 1.998943022031377e-06, + "loss": 1.2288, + "step": 8674 + }, + { + "epoch": 0.06, + "grad_norm": 4.219213579974913, + "learning_rate": 1.998942778067233e-06, + "loss": 1.3297, + "step": 8675 + }, + { + "epoch": 0.06, + "grad_norm": 4.28436594377733, + "learning_rate": 1.9989425340749514e-06, + "loss": 1.2752, + "step": 8676 + }, + { + "epoch": 0.06, + "grad_norm": 4.792125683770973, + "learning_rate": 1.9989422900545336e-06, + "loss": 1.5153, + "step": 8677 + }, + { + "epoch": 0.06, + "grad_norm": 4.984280147366857, + "learning_rate": 1.9989420460059787e-06, + "loss": 1.4225, + "step": 8678 + }, + { + "epoch": 0.06, + "grad_norm": 4.364243265873441, + "learning_rate": 1.998941801929286e-06, + "loss": 1.4426, + "step": 8679 + }, + { + "epoch": 0.06, + "grad_norm": 5.221518211304695, + "learning_rate": 1.9989415578244577e-06, + "loss": 1.4625, + "step": 8680 + }, + { + "epoch": 0.06, + "grad_norm": 4.699562163985216, + "learning_rate": 1.998941313691492e-06, + "loss": 1.0918, + "step": 8681 + }, + { + "epoch": 0.06, + "grad_norm": 5.133053415829835, + "learning_rate": 1.99894106953039e-06, + "loss": 1.405, + "step": 8682 + }, + { + "epoch": 0.06, + "grad_norm": 5.126993969426706, + "learning_rate": 1.9989408253411504e-06, + "loss": 1.2673, + "step": 8683 + }, + { + "epoch": 0.06, + "grad_norm": 4.243153131967924, + "learning_rate": 1.998940581123774e-06, + "loss": 1.4695, + "step": 8684 + }, + { + "epoch": 0.06, + "grad_norm": 4.6581281023995285, + "learning_rate": 1.9989403368782608e-06, + "loss": 1.3283, + "step": 8685 + }, + { + "epoch": 0.06, + "grad_norm": 4.616248285747535, + "learning_rate": 1.998940092604611e-06, + "loss": 1.4178, + "step": 8686 + }, + { + "epoch": 0.06, + "grad_norm": 4.860542184260749, + "learning_rate": 1.9989398483028246e-06, + "loss": 1.4559, + "step": 8687 + }, + { + "epoch": 0.06, + "eval_loss": 1.5583451986312866, + "eval_runtime": 4.5891, + "eval_samples_per_second": 1.961, + "eval_steps_per_second": 1.09, + "step": 8687 + }, + { + "epoch": 0.06, + "grad_norm": 4.230646743272511, + "learning_rate": 1.998939603972901e-06, + "loss": 1.2765, + "step": 8688 + }, + { + "epoch": 0.06, + "grad_norm": 4.3307652735719895, + "learning_rate": 1.998939359614841e-06, + "loss": 1.3156, + "step": 8689 + }, + { + "epoch": 0.06, + "grad_norm": 5.051706932839966, + "learning_rate": 1.998939115228644e-06, + "loss": 1.3716, + "step": 8690 + }, + { + "epoch": 0.06, + "grad_norm": 4.579711634780233, + "learning_rate": 1.9989388708143097e-06, + "loss": 1.3465, + "step": 8691 + }, + { + "epoch": 0.06, + "grad_norm": 4.713771561682751, + "learning_rate": 1.998938626371839e-06, + "loss": 1.4448, + "step": 8692 + }, + { + "epoch": 0.06, + "grad_norm": 4.358999498209377, + "learning_rate": 1.998938381901232e-06, + "loss": 1.2604, + "step": 8693 + }, + { + "epoch": 0.06, + "grad_norm": 4.78337687012583, + "learning_rate": 1.998938137402488e-06, + "loss": 1.3188, + "step": 8694 + }, + { + "epoch": 0.06, + "grad_norm": 4.819149794860213, + "learning_rate": 1.9989378928756066e-06, + "loss": 1.3173, + "step": 8695 + }, + { + "epoch": 0.06, + "grad_norm": 4.237940696688464, + "learning_rate": 1.998937648320589e-06, + "loss": 1.2149, + "step": 8696 + }, + { + "epoch": 0.06, + "grad_norm": 4.381531228903312, + "learning_rate": 1.9989374037374344e-06, + "loss": 1.392, + "step": 8697 + }, + { + "epoch": 0.06, + "grad_norm": 4.47317585160481, + "learning_rate": 1.9989371591261434e-06, + "loss": 1.3614, + "step": 8698 + }, + { + "epoch": 0.06, + "grad_norm": 4.786103712286903, + "learning_rate": 1.998936914486715e-06, + "loss": 1.3327, + "step": 8699 + }, + { + "epoch": 0.06, + "grad_norm": 4.397710283266012, + "learning_rate": 1.9989366698191507e-06, + "loss": 1.3765, + "step": 8700 + }, + { + "epoch": 0.06, + "grad_norm": 4.659605117600823, + "learning_rate": 1.9989364251234495e-06, + "loss": 1.4559, + "step": 8701 + }, + { + "epoch": 0.06, + "grad_norm": 5.549783460574593, + "learning_rate": 1.998936180399611e-06, + "loss": 1.282, + "step": 8702 + }, + { + "epoch": 0.06, + "grad_norm": 4.832371672535695, + "learning_rate": 1.9989359356476364e-06, + "loss": 1.4638, + "step": 8703 + }, + { + "epoch": 0.06, + "grad_norm": 5.039478393047218, + "learning_rate": 1.998935690867525e-06, + "loss": 1.2765, + "step": 8704 + }, + { + "epoch": 0.06, + "grad_norm": 4.237934104476604, + "learning_rate": 1.9989354460592763e-06, + "loss": 1.3035, + "step": 8705 + }, + { + "epoch": 0.06, + "grad_norm": 5.597502989358775, + "learning_rate": 1.9989352012228914e-06, + "loss": 1.579, + "step": 8706 + }, + { + "epoch": 0.06, + "grad_norm": 4.34738877208113, + "learning_rate": 1.99893495635837e-06, + "loss": 1.3389, + "step": 8707 + }, + { + "epoch": 0.06, + "grad_norm": 4.9117493324578065, + "learning_rate": 1.9989347114657117e-06, + "loss": 1.422, + "step": 8708 + }, + { + "epoch": 0.06, + "grad_norm": 4.756986013116514, + "learning_rate": 1.998934466544917e-06, + "loss": 1.3718, + "step": 8709 + }, + { + "epoch": 0.06, + "grad_norm": 5.218655925139599, + "learning_rate": 1.998934221595985e-06, + "loss": 1.304, + "step": 8710 + }, + { + "epoch": 0.06, + "grad_norm": 4.391285921635442, + "learning_rate": 1.998933976618917e-06, + "loss": 1.2891, + "step": 8711 + }, + { + "epoch": 0.06, + "grad_norm": 4.167040109285794, + "learning_rate": 1.998933731613712e-06, + "loss": 1.3312, + "step": 8712 + }, + { + "epoch": 0.06, + "grad_norm": 4.522685898963279, + "learning_rate": 1.9989334865803706e-06, + "loss": 1.3341, + "step": 8713 + }, + { + "epoch": 0.06, + "grad_norm": 11.437772127217965, + "learning_rate": 1.9989332415188926e-06, + "loss": 1.5211, + "step": 8714 + }, + { + "epoch": 0.06, + "grad_norm": 5.074391210248724, + "learning_rate": 1.9989329964292774e-06, + "loss": 1.0282, + "step": 8715 + }, + { + "epoch": 0.06, + "grad_norm": 4.618355620435837, + "learning_rate": 1.9989327513115264e-06, + "loss": 1.4167, + "step": 8716 + }, + { + "epoch": 0.06, + "grad_norm": 4.466497069781705, + "learning_rate": 1.998932506165638e-06, + "loss": 1.3133, + "step": 8717 + }, + { + "epoch": 0.06, + "grad_norm": 4.611878151625593, + "learning_rate": 1.9989322609916136e-06, + "loss": 1.1923, + "step": 8718 + }, + { + "epoch": 0.06, + "grad_norm": 4.680157195069533, + "learning_rate": 1.998932015789452e-06, + "loss": 1.5067, + "step": 8719 + }, + { + "epoch": 0.06, + "grad_norm": 4.441432214504197, + "learning_rate": 1.9989317705591543e-06, + "loss": 1.31, + "step": 8720 + }, + { + "epoch": 0.06, + "grad_norm": 4.174305541652056, + "learning_rate": 1.99893152530072e-06, + "loss": 1.3555, + "step": 8721 + }, + { + "epoch": 0.06, + "grad_norm": 4.188362270519318, + "learning_rate": 1.998931280014149e-06, + "loss": 1.3384, + "step": 8722 + }, + { + "epoch": 0.06, + "grad_norm": 7.600255299744319, + "learning_rate": 1.9989310346994414e-06, + "loss": 1.215, + "step": 8723 + }, + { + "epoch": 0.06, + "grad_norm": 4.5732688620236415, + "learning_rate": 1.998930789356597e-06, + "loss": 1.4367, + "step": 8724 + }, + { + "epoch": 0.06, + "grad_norm": 4.738497424726799, + "learning_rate": 1.9989305439856164e-06, + "loss": 1.3956, + "step": 8725 + }, + { + "epoch": 0.06, + "grad_norm": 4.271996059356595, + "learning_rate": 1.998930298586499e-06, + "loss": 1.3214, + "step": 8726 + }, + { + "epoch": 0.06, + "grad_norm": 4.300865942253508, + "learning_rate": 1.9989300531592453e-06, + "loss": 1.3691, + "step": 8727 + }, + { + "epoch": 0.06, + "grad_norm": 4.272467364521415, + "learning_rate": 1.998929807703855e-06, + "loss": 1.3222, + "step": 8728 + }, + { + "epoch": 0.06, + "grad_norm": 4.258944322355307, + "learning_rate": 1.998929562220328e-06, + "loss": 1.354, + "step": 8729 + }, + { + "epoch": 0.06, + "grad_norm": 4.402519104960977, + "learning_rate": 1.998929316708665e-06, + "loss": 1.305, + "step": 8730 + }, + { + "epoch": 0.06, + "grad_norm": 4.973747776517347, + "learning_rate": 1.998929071168865e-06, + "loss": 1.5857, + "step": 8731 + }, + { + "epoch": 0.06, + "grad_norm": 4.436818796756212, + "learning_rate": 1.9989288256009283e-06, + "loss": 1.4667, + "step": 8732 + }, + { + "epoch": 0.06, + "grad_norm": 4.166641529143792, + "learning_rate": 1.9989285800048556e-06, + "loss": 1.2593, + "step": 8733 + }, + { + "epoch": 0.06, + "grad_norm": 5.143412085210361, + "learning_rate": 1.998928334380646e-06, + "loss": 1.368, + "step": 8734 + }, + { + "epoch": 0.06, + "grad_norm": 5.623616716017324, + "learning_rate": 1.9989280887283004e-06, + "loss": 1.2999, + "step": 8735 + }, + { + "epoch": 0.06, + "grad_norm": 4.929314548576958, + "learning_rate": 1.998927843047818e-06, + "loss": 1.3587, + "step": 8736 + }, + { + "epoch": 0.06, + "grad_norm": 4.596938892659514, + "learning_rate": 1.998927597339199e-06, + "loss": 1.4456, + "step": 8737 + }, + { + "epoch": 0.06, + "grad_norm": 4.692918653297475, + "learning_rate": 1.9989273516024435e-06, + "loss": 1.2818, + "step": 8738 + }, + { + "epoch": 0.06, + "grad_norm": 4.344206382567416, + "learning_rate": 1.998927105837552e-06, + "loss": 1.3174, + "step": 8739 + }, + { + "epoch": 0.06, + "grad_norm": 4.794268839973716, + "learning_rate": 1.9989268600445235e-06, + "loss": 1.3462, + "step": 8740 + }, + { + "epoch": 0.06, + "grad_norm": 5.428331293785562, + "learning_rate": 1.998926614223359e-06, + "loss": 1.342, + "step": 8741 + }, + { + "epoch": 0.06, + "grad_norm": 4.360292445286974, + "learning_rate": 1.9989263683740578e-06, + "loss": 1.4407, + "step": 8742 + }, + { + "epoch": 0.06, + "grad_norm": 4.980192833133715, + "learning_rate": 1.9989261224966202e-06, + "loss": 1.3852, + "step": 8743 + }, + { + "epoch": 0.06, + "grad_norm": 5.0662858232348285, + "learning_rate": 1.9989258765910463e-06, + "loss": 1.2963, + "step": 8744 + }, + { + "epoch": 0.06, + "grad_norm": 4.422522279617291, + "learning_rate": 1.9989256306573357e-06, + "loss": 1.2535, + "step": 8745 + }, + { + "epoch": 0.06, + "grad_norm": 4.5365366347764, + "learning_rate": 1.9989253846954892e-06, + "loss": 1.3592, + "step": 8746 + }, + { + "epoch": 0.06, + "grad_norm": 4.304538924155177, + "learning_rate": 1.998925138705506e-06, + "loss": 1.4012, + "step": 8747 + }, + { + "epoch": 0.06, + "grad_norm": 4.884147183983975, + "learning_rate": 1.9989248926873864e-06, + "loss": 1.268, + "step": 8748 + }, + { + "epoch": 0.06, + "grad_norm": 4.609824951436014, + "learning_rate": 1.9989246466411305e-06, + "loss": 1.2965, + "step": 8749 + }, + { + "epoch": 0.06, + "grad_norm": 5.480861767927759, + "learning_rate": 1.998924400566738e-06, + "loss": 1.1923, + "step": 8750 + }, + { + "epoch": 0.06, + "grad_norm": 5.195946348947488, + "learning_rate": 1.998924154464209e-06, + "loss": 1.3513, + "step": 8751 + }, + { + "epoch": 0.06, + "grad_norm": 4.545555038200293, + "learning_rate": 1.998923908333544e-06, + "loss": 1.2329, + "step": 8752 + }, + { + "epoch": 0.06, + "grad_norm": 4.556411712333217, + "learning_rate": 1.9989236621747426e-06, + "loss": 1.4029, + "step": 8753 + }, + { + "epoch": 0.06, + "grad_norm": 4.405448473310726, + "learning_rate": 1.9989234159878047e-06, + "loss": 1.279, + "step": 8754 + }, + { + "epoch": 0.06, + "grad_norm": 5.125783994932461, + "learning_rate": 1.9989231697727305e-06, + "loss": 1.3239, + "step": 8755 + }, + { + "epoch": 0.06, + "grad_norm": 4.436118542277149, + "learning_rate": 1.9989229235295195e-06, + "loss": 1.3637, + "step": 8756 + }, + { + "epoch": 0.06, + "grad_norm": 4.488235923339999, + "learning_rate": 1.9989226772581727e-06, + "loss": 1.4432, + "step": 8757 + }, + { + "epoch": 0.06, + "grad_norm": 4.501830793620562, + "learning_rate": 1.99892243095869e-06, + "loss": 1.318, + "step": 8758 + }, + { + "epoch": 0.06, + "grad_norm": 4.5392638161088685, + "learning_rate": 1.99892218463107e-06, + "loss": 1.4454, + "step": 8759 + }, + { + "epoch": 0.06, + "grad_norm": 4.9460297613034765, + "learning_rate": 1.9989219382753143e-06, + "loss": 1.5006, + "step": 8760 + }, + { + "epoch": 0.06, + "eval_loss": 1.555009126663208, + "eval_runtime": 4.5781, + "eval_samples_per_second": 1.966, + "eval_steps_per_second": 1.092, + "step": 8760 + }, + { + "epoch": 0.06, + "grad_norm": 4.839212746469922, + "learning_rate": 1.998921691891422e-06, + "loss": 1.5159, + "step": 8761 + }, + { + "epoch": 0.06, + "grad_norm": 4.682904810042825, + "learning_rate": 1.9989214454793938e-06, + "loss": 1.4336, + "step": 8762 + }, + { + "epoch": 0.06, + "grad_norm": 4.4672441639336915, + "learning_rate": 1.998921199039229e-06, + "loss": 1.3792, + "step": 8763 + }, + { + "epoch": 0.06, + "grad_norm": 4.389071968080816, + "learning_rate": 1.998920952570928e-06, + "loss": 1.3526, + "step": 8764 + }, + { + "epoch": 0.06, + "grad_norm": 4.692541912597106, + "learning_rate": 1.9989207060744906e-06, + "loss": 1.3597, + "step": 8765 + }, + { + "epoch": 0.06, + "grad_norm": 4.371014152314616, + "learning_rate": 1.998920459549917e-06, + "loss": 1.4378, + "step": 8766 + }, + { + "epoch": 0.06, + "grad_norm": 7.6450488555112, + "learning_rate": 1.9989202129972073e-06, + "loss": 1.5821, + "step": 8767 + }, + { + "epoch": 0.06, + "grad_norm": 4.389847254791679, + "learning_rate": 1.998919966416361e-06, + "loss": 1.3623, + "step": 8768 + }, + { + "epoch": 0.06, + "grad_norm": 4.310722944850334, + "learning_rate": 1.9989197198073788e-06, + "loss": 1.4095, + "step": 8769 + }, + { + "epoch": 0.06, + "grad_norm": 5.896039638219889, + "learning_rate": 1.9989194731702602e-06, + "loss": 1.3831, + "step": 8770 + }, + { + "epoch": 0.06, + "grad_norm": 5.094709366572391, + "learning_rate": 1.9989192265050054e-06, + "loss": 1.5436, + "step": 8771 + }, + { + "epoch": 0.06, + "grad_norm": 5.081231840658383, + "learning_rate": 1.998918979811614e-06, + "loss": 1.2478, + "step": 8772 + }, + { + "epoch": 0.06, + "grad_norm": 4.765541311742645, + "learning_rate": 1.9989187330900863e-06, + "loss": 1.2019, + "step": 8773 + }, + { + "epoch": 0.06, + "grad_norm": 4.507442922285264, + "learning_rate": 1.9989184863404234e-06, + "loss": 1.497, + "step": 8774 + }, + { + "epoch": 0.06, + "grad_norm": 4.650385783759415, + "learning_rate": 1.9989182395626232e-06, + "loss": 1.341, + "step": 8775 + }, + { + "epoch": 0.06, + "grad_norm": 5.193658928719714, + "learning_rate": 1.9989179927566876e-06, + "loss": 1.3883, + "step": 8776 + }, + { + "epoch": 0.06, + "grad_norm": 4.708588008961572, + "learning_rate": 1.9989177459226153e-06, + "loss": 1.2086, + "step": 8777 + }, + { + "epoch": 0.06, + "grad_norm": 4.361636087189225, + "learning_rate": 1.9989174990604067e-06, + "loss": 1.5196, + "step": 8778 + }, + { + "epoch": 0.06, + "grad_norm": 4.93887406872926, + "learning_rate": 1.9989172521700626e-06, + "loss": 1.3296, + "step": 8779 + }, + { + "epoch": 0.06, + "grad_norm": 4.308513529177304, + "learning_rate": 1.9989170052515817e-06, + "loss": 1.1928, + "step": 8780 + }, + { + "epoch": 0.06, + "grad_norm": 6.029849476676653, + "learning_rate": 1.998916758304965e-06, + "loss": 1.4123, + "step": 8781 + }, + { + "epoch": 0.06, + "grad_norm": 4.9678471569973, + "learning_rate": 1.998916511330212e-06, + "loss": 1.3926, + "step": 8782 + }, + { + "epoch": 0.06, + "grad_norm": 4.7508932027583635, + "learning_rate": 1.9989162643273225e-06, + "loss": 1.2513, + "step": 8783 + }, + { + "epoch": 0.06, + "grad_norm": 5.298068532157554, + "learning_rate": 1.9989160172962972e-06, + "loss": 1.4054, + "step": 8784 + }, + { + "epoch": 0.06, + "grad_norm": 4.358156610070261, + "learning_rate": 1.9989157702371356e-06, + "loss": 1.1538, + "step": 8785 + }, + { + "epoch": 0.06, + "grad_norm": 5.452888444017439, + "learning_rate": 1.998915523149838e-06, + "loss": 1.4679, + "step": 8786 + }, + { + "epoch": 0.06, + "grad_norm": 4.489425381541063, + "learning_rate": 1.9989152760344043e-06, + "loss": 1.2096, + "step": 8787 + }, + { + "epoch": 0.06, + "grad_norm": 4.264896254010867, + "learning_rate": 1.9989150288908346e-06, + "loss": 1.4319, + "step": 8788 + }, + { + "epoch": 0.06, + "grad_norm": 4.757954697110054, + "learning_rate": 1.9989147817191286e-06, + "loss": 1.3849, + "step": 8789 + }, + { + "epoch": 0.06, + "grad_norm": 4.209735455715297, + "learning_rate": 1.9989145345192863e-06, + "loss": 1.1999, + "step": 8790 + }, + { + "epoch": 0.06, + "grad_norm": 4.136721460927477, + "learning_rate": 1.998914287291308e-06, + "loss": 1.3417, + "step": 8791 + }, + { + "epoch": 0.06, + "grad_norm": 4.111017128544717, + "learning_rate": 1.998914040035194e-06, + "loss": 1.3239, + "step": 8792 + }, + { + "epoch": 0.06, + "grad_norm": 4.12195419282272, + "learning_rate": 1.9989137927509435e-06, + "loss": 1.2935, + "step": 8793 + }, + { + "epoch": 0.06, + "grad_norm": 4.679635603064866, + "learning_rate": 1.998913545438557e-06, + "loss": 1.2381, + "step": 8794 + }, + { + "epoch": 0.06, + "grad_norm": 4.403710047305166, + "learning_rate": 1.9989132980980345e-06, + "loss": 1.4084, + "step": 8795 + }, + { + "epoch": 0.06, + "grad_norm": 8.025200756047498, + "learning_rate": 1.998913050729376e-06, + "loss": 1.3225, + "step": 8796 + }, + { + "epoch": 0.06, + "grad_norm": 4.758372436662555, + "learning_rate": 1.9989128033325816e-06, + "loss": 1.4158, + "step": 8797 + }, + { + "epoch": 0.06, + "grad_norm": 5.640940112631746, + "learning_rate": 1.998912555907651e-06, + "loss": 1.347, + "step": 8798 + }, + { + "epoch": 0.06, + "grad_norm": 5.391049757984428, + "learning_rate": 1.9989123084545837e-06, + "loss": 1.24, + "step": 8799 + }, + { + "epoch": 0.06, + "grad_norm": 5.036625817707134, + "learning_rate": 1.998912060973381e-06, + "loss": 1.3422, + "step": 8800 + }, + { + "epoch": 0.06, + "grad_norm": 4.817900719965218, + "learning_rate": 1.9989118134640424e-06, + "loss": 1.487, + "step": 8801 + }, + { + "epoch": 0.06, + "grad_norm": 4.319819293700203, + "learning_rate": 1.9989115659265676e-06, + "loss": 1.2542, + "step": 8802 + }, + { + "epoch": 0.06, + "grad_norm": 4.991466332708066, + "learning_rate": 1.9989113183609565e-06, + "loss": 1.3914, + "step": 8803 + }, + { + "epoch": 0.06, + "grad_norm": 4.553171235603451, + "learning_rate": 1.99891107076721e-06, + "loss": 1.2282, + "step": 8804 + }, + { + "epoch": 0.06, + "grad_norm": 4.748633228554743, + "learning_rate": 1.9989108231453267e-06, + "loss": 1.4943, + "step": 8805 + }, + { + "epoch": 0.06, + "grad_norm": 4.318508498223785, + "learning_rate": 1.998910575495308e-06, + "loss": 1.2343, + "step": 8806 + }, + { + "epoch": 0.06, + "grad_norm": 4.24344702131763, + "learning_rate": 1.9989103278171533e-06, + "loss": 1.4504, + "step": 8807 + }, + { + "epoch": 0.06, + "grad_norm": 4.3328169325124675, + "learning_rate": 1.9989100801108624e-06, + "loss": 1.32, + "step": 8808 + }, + { + "epoch": 0.06, + "grad_norm": 4.666296720918735, + "learning_rate": 1.9989098323764355e-06, + "loss": 1.4708, + "step": 8809 + }, + { + "epoch": 0.06, + "grad_norm": 4.663540340753861, + "learning_rate": 1.9989095846138732e-06, + "loss": 1.2516, + "step": 8810 + }, + { + "epoch": 0.06, + "grad_norm": 4.879755583128598, + "learning_rate": 1.998909336823174e-06, + "loss": 1.213, + "step": 8811 + }, + { + "epoch": 0.06, + "grad_norm": 5.82249615089797, + "learning_rate": 1.9989090890043396e-06, + "loss": 1.5187, + "step": 8812 + }, + { + "epoch": 0.06, + "grad_norm": 4.7303714460669175, + "learning_rate": 1.998908841157369e-06, + "loss": 1.4238, + "step": 8813 + }, + { + "epoch": 0.06, + "grad_norm": 4.990304035232559, + "learning_rate": 1.9989085932822625e-06, + "loss": 1.3205, + "step": 8814 + }, + { + "epoch": 0.06, + "grad_norm": 5.467170374744285, + "learning_rate": 1.99890834537902e-06, + "loss": 1.559, + "step": 8815 + }, + { + "epoch": 0.06, + "grad_norm": 4.307507709697949, + "learning_rate": 1.9989080974476417e-06, + "loss": 1.3504, + "step": 8816 + }, + { + "epoch": 0.06, + "grad_norm": 4.733747372979935, + "learning_rate": 1.998907849488127e-06, + "loss": 1.4028, + "step": 8817 + }, + { + "epoch": 0.06, + "grad_norm": 16.527564507850006, + "learning_rate": 1.998907601500477e-06, + "loss": 1.4501, + "step": 8818 + }, + { + "epoch": 0.06, + "grad_norm": 4.40096857873657, + "learning_rate": 1.998907353484691e-06, + "loss": 1.2448, + "step": 8819 + }, + { + "epoch": 0.06, + "grad_norm": 4.459333126206982, + "learning_rate": 1.9989071054407683e-06, + "loss": 1.3535, + "step": 8820 + }, + { + "epoch": 0.06, + "grad_norm": 4.506137806984103, + "learning_rate": 1.9989068573687107e-06, + "loss": 1.3735, + "step": 8821 + }, + { + "epoch": 0.06, + "grad_norm": 4.474364100939082, + "learning_rate": 1.998906609268517e-06, + "loss": 1.4221, + "step": 8822 + }, + { + "epoch": 0.06, + "grad_norm": 4.735553303601279, + "learning_rate": 1.998906361140187e-06, + "loss": 1.3305, + "step": 8823 + }, + { + "epoch": 0.06, + "grad_norm": 5.127362221019991, + "learning_rate": 1.998906112983722e-06, + "loss": 1.3965, + "step": 8824 + }, + { + "epoch": 0.06, + "grad_norm": 4.9303345751613, + "learning_rate": 1.9989058647991202e-06, + "loss": 1.3227, + "step": 8825 + }, + { + "epoch": 0.06, + "grad_norm": 4.783778776675495, + "learning_rate": 1.9989056165863828e-06, + "loss": 1.4039, + "step": 8826 + }, + { + "epoch": 0.06, + "grad_norm": 4.70280304045451, + "learning_rate": 1.99890536834551e-06, + "loss": 1.1958, + "step": 8827 + }, + { + "epoch": 0.06, + "grad_norm": 4.7621594928258695, + "learning_rate": 1.9989051200765006e-06, + "loss": 1.5909, + "step": 8828 + }, + { + "epoch": 0.06, + "grad_norm": 8.10383376704116, + "learning_rate": 1.998904871779356e-06, + "loss": 1.4304, + "step": 8829 + }, + { + "epoch": 0.06, + "grad_norm": 4.2341673557291815, + "learning_rate": 1.9989046234540753e-06, + "loss": 1.0553, + "step": 8830 + }, + { + "epoch": 0.06, + "grad_norm": 4.526921310757564, + "learning_rate": 1.9989043751006588e-06, + "loss": 1.2812, + "step": 8831 + }, + { + "epoch": 0.06, + "grad_norm": 4.335854028565122, + "learning_rate": 1.9989041267191064e-06, + "loss": 1.4074, + "step": 8832 + }, + { + "epoch": 0.06, + "grad_norm": 4.726335316129505, + "learning_rate": 1.9989038783094185e-06, + "loss": 1.45, + "step": 8833 + }, + { + "epoch": 0.06, + "eval_loss": 1.558302402496338, + "eval_runtime": 4.6017, + "eval_samples_per_second": 1.956, + "eval_steps_per_second": 1.087, + "step": 8833 + }, + { + "epoch": 0.06, + "grad_norm": 4.363316698415397, + "learning_rate": 1.9989036298715948e-06, + "loss": 1.2738, + "step": 8834 + }, + { + "epoch": 0.06, + "grad_norm": 4.699976256764717, + "learning_rate": 1.9989033814056347e-06, + "loss": 1.4648, + "step": 8835 + }, + { + "epoch": 0.06, + "grad_norm": 4.886362899403995, + "learning_rate": 1.998903132911539e-06, + "loss": 1.2614, + "step": 8836 + }, + { + "epoch": 0.06, + "grad_norm": 4.679037576520284, + "learning_rate": 1.998902884389308e-06, + "loss": 1.367, + "step": 8837 + }, + { + "epoch": 0.06, + "grad_norm": 4.450518318121184, + "learning_rate": 1.998902635838941e-06, + "loss": 1.3282, + "step": 8838 + }, + { + "epoch": 0.06, + "grad_norm": 4.103511848760463, + "learning_rate": 1.9989023872604384e-06, + "loss": 1.1773, + "step": 8839 + }, + { + "epoch": 0.06, + "grad_norm": 4.356978446538244, + "learning_rate": 1.9989021386537997e-06, + "loss": 1.3099, + "step": 8840 + }, + { + "epoch": 0.06, + "grad_norm": 5.043122752496365, + "learning_rate": 1.9989018900190255e-06, + "loss": 1.5796, + "step": 8841 + }, + { + "epoch": 0.06, + "grad_norm": 4.811375667955704, + "learning_rate": 1.9989016413561155e-06, + "loss": 1.4509, + "step": 8842 + }, + { + "epoch": 0.06, + "grad_norm": 4.525444669181647, + "learning_rate": 1.99890139266507e-06, + "loss": 1.4023, + "step": 8843 + }, + { + "epoch": 0.06, + "grad_norm": 4.279821968144671, + "learning_rate": 1.998901143945888e-06, + "loss": 1.238, + "step": 8844 + }, + { + "epoch": 0.06, + "grad_norm": 4.4510274186849825, + "learning_rate": 1.998900895198571e-06, + "loss": 1.4324, + "step": 8845 + }, + { + "epoch": 0.06, + "grad_norm": 4.191724069124076, + "learning_rate": 1.998900646423118e-06, + "loss": 1.2724, + "step": 8846 + }, + { + "epoch": 0.06, + "grad_norm": 5.175557266684348, + "learning_rate": 1.9989003976195293e-06, + "loss": 1.3548, + "step": 8847 + }, + { + "epoch": 0.06, + "grad_norm": 4.587299792626568, + "learning_rate": 1.998900148787805e-06, + "loss": 1.4213, + "step": 8848 + }, + { + "epoch": 0.06, + "grad_norm": 4.1151803926653905, + "learning_rate": 1.998899899927945e-06, + "loss": 1.3732, + "step": 8849 + }, + { + "epoch": 0.06, + "grad_norm": 4.367591021656302, + "learning_rate": 1.998899651039949e-06, + "loss": 1.3188, + "step": 8850 + }, + { + "epoch": 0.06, + "grad_norm": 4.751278274380196, + "learning_rate": 1.9988994021238178e-06, + "loss": 1.2984, + "step": 8851 + }, + { + "epoch": 0.06, + "grad_norm": 4.783135464172729, + "learning_rate": 1.998899153179551e-06, + "loss": 1.4884, + "step": 8852 + }, + { + "epoch": 0.06, + "grad_norm": 7.8611914946675405, + "learning_rate": 1.998898904207148e-06, + "loss": 1.3191, + "step": 8853 + }, + { + "epoch": 0.06, + "grad_norm": 5.906791464476312, + "learning_rate": 1.9988986552066095e-06, + "loss": 1.2687, + "step": 8854 + }, + { + "epoch": 0.06, + "grad_norm": 4.385300189761708, + "learning_rate": 1.9988984061779358e-06, + "loss": 1.245, + "step": 8855 + }, + { + "epoch": 0.06, + "grad_norm": 4.971463879300862, + "learning_rate": 1.9988981571211257e-06, + "loss": 1.1891, + "step": 8856 + }, + { + "epoch": 0.06, + "grad_norm": 4.6166773470662426, + "learning_rate": 1.9988979080361807e-06, + "loss": 1.2598, + "step": 8857 + }, + { + "epoch": 0.06, + "grad_norm": 4.724677461858467, + "learning_rate": 1.9988976589230993e-06, + "loss": 1.3926, + "step": 8858 + }, + { + "epoch": 0.06, + "grad_norm": 4.451596700122062, + "learning_rate": 1.998897409781883e-06, + "loss": 1.3847, + "step": 8859 + }, + { + "epoch": 0.06, + "grad_norm": 5.353190518247788, + "learning_rate": 1.9988971606125305e-06, + "loss": 1.2629, + "step": 8860 + }, + { + "epoch": 0.06, + "grad_norm": 4.768436682447021, + "learning_rate": 1.9988969114150427e-06, + "loss": 1.2988, + "step": 8861 + }, + { + "epoch": 0.06, + "grad_norm": 4.173315215551296, + "learning_rate": 1.9988966621894194e-06, + "loss": 1.286, + "step": 8862 + }, + { + "epoch": 0.06, + "grad_norm": 5.041523559896114, + "learning_rate": 1.9988964129356603e-06, + "loss": 1.2269, + "step": 8863 + }, + { + "epoch": 0.06, + "grad_norm": 4.325590935683932, + "learning_rate": 1.9988961636537656e-06, + "loss": 1.2875, + "step": 8864 + }, + { + "epoch": 0.06, + "grad_norm": 4.115782126940359, + "learning_rate": 1.9988959143437355e-06, + "loss": 1.3078, + "step": 8865 + }, + { + "epoch": 0.06, + "grad_norm": 4.480126941503472, + "learning_rate": 1.9988956650055695e-06, + "loss": 1.4237, + "step": 8866 + }, + { + "epoch": 0.06, + "grad_norm": 4.54970557504812, + "learning_rate": 1.9988954156392685e-06, + "loss": 1.4411, + "step": 8867 + }, + { + "epoch": 0.06, + "grad_norm": 4.278451381604128, + "learning_rate": 1.9988951662448316e-06, + "loss": 1.4004, + "step": 8868 + }, + { + "epoch": 0.06, + "grad_norm": 4.735404366057957, + "learning_rate": 1.998894916822259e-06, + "loss": 1.4554, + "step": 8869 + }, + { + "epoch": 0.06, + "grad_norm": 4.378085473898133, + "learning_rate": 1.998894667371551e-06, + "loss": 1.3601, + "step": 8870 + }, + { + "epoch": 0.06, + "grad_norm": 4.543131960179484, + "learning_rate": 1.9988944178927075e-06, + "loss": 1.2967, + "step": 8871 + }, + { + "epoch": 0.06, + "grad_norm": 4.037717908122634, + "learning_rate": 1.9988941683857283e-06, + "loss": 1.0675, + "step": 8872 + }, + { + "epoch": 0.06, + "grad_norm": 3.9960044939604873, + "learning_rate": 1.9988939188506136e-06, + "loss": 1.4066, + "step": 8873 + }, + { + "epoch": 0.06, + "grad_norm": 4.271650937109512, + "learning_rate": 1.998893669287364e-06, + "loss": 1.3711, + "step": 8874 + }, + { + "epoch": 0.06, + "grad_norm": 4.4055312781165, + "learning_rate": 1.998893419695978e-06, + "loss": 1.3049, + "step": 8875 + }, + { + "epoch": 0.06, + "grad_norm": 4.205210231980988, + "learning_rate": 1.998893170076457e-06, + "loss": 1.1131, + "step": 8876 + }, + { + "epoch": 0.06, + "grad_norm": 4.52296723687895, + "learning_rate": 1.9988929204288005e-06, + "loss": 1.3961, + "step": 8877 + }, + { + "epoch": 0.06, + "grad_norm": 5.468240764241983, + "learning_rate": 1.998892670753008e-06, + "loss": 1.426, + "step": 8878 + }, + { + "epoch": 0.06, + "grad_norm": 4.428518120557946, + "learning_rate": 1.9988924210490804e-06, + "loss": 1.3621, + "step": 8879 + }, + { + "epoch": 0.06, + "grad_norm": 4.4233618382909645, + "learning_rate": 1.9988921713170174e-06, + "loss": 1.3834, + "step": 8880 + }, + { + "epoch": 0.06, + "grad_norm": 4.788668460219144, + "learning_rate": 1.9988919215568186e-06, + "loss": 1.3543, + "step": 8881 + }, + { + "epoch": 0.06, + "grad_norm": 5.038344659421539, + "learning_rate": 1.9988916717684846e-06, + "loss": 1.2472, + "step": 8882 + }, + { + "epoch": 0.06, + "grad_norm": 4.448190152642244, + "learning_rate": 1.9988914219520156e-06, + "loss": 1.3846, + "step": 8883 + }, + { + "epoch": 0.06, + "grad_norm": 5.015883758953264, + "learning_rate": 1.9988911721074103e-06, + "loss": 1.3266, + "step": 8884 + }, + { + "epoch": 0.06, + "grad_norm": 5.604425102482683, + "learning_rate": 1.99889092223467e-06, + "loss": 1.5366, + "step": 8885 + }, + { + "epoch": 0.06, + "grad_norm": 4.853075950448697, + "learning_rate": 1.998890672333794e-06, + "loss": 1.3602, + "step": 8886 + }, + { + "epoch": 0.06, + "grad_norm": 4.382559289220733, + "learning_rate": 1.998890422404783e-06, + "loss": 1.2622, + "step": 8887 + }, + { + "epoch": 0.06, + "grad_norm": 4.533039206904667, + "learning_rate": 1.9988901724476365e-06, + "loss": 1.2871, + "step": 8888 + }, + { + "epoch": 0.06, + "grad_norm": 4.448063347993088, + "learning_rate": 1.998889922462354e-06, + "loss": 1.3113, + "step": 8889 + }, + { + "epoch": 0.06, + "grad_norm": 4.448566612499572, + "learning_rate": 1.9988896724489366e-06, + "loss": 1.3776, + "step": 8890 + }, + { + "epoch": 0.06, + "grad_norm": 5.452867809262882, + "learning_rate": 1.998889422407384e-06, + "loss": 1.5249, + "step": 8891 + }, + { + "epoch": 0.06, + "grad_norm": 4.599794126337971, + "learning_rate": 1.9988891723376957e-06, + "loss": 1.3296, + "step": 8892 + }, + { + "epoch": 0.06, + "grad_norm": 5.05562711237251, + "learning_rate": 1.998888922239872e-06, + "loss": 1.1367, + "step": 8893 + }, + { + "epoch": 0.06, + "grad_norm": 6.0460495218161485, + "learning_rate": 1.998888672113913e-06, + "loss": 1.2502, + "step": 8894 + }, + { + "epoch": 0.06, + "grad_norm": 5.765418887979006, + "learning_rate": 1.9988884219598186e-06, + "loss": 1.3728, + "step": 8895 + }, + { + "epoch": 0.06, + "grad_norm": 4.679369204625647, + "learning_rate": 1.9988881717775885e-06, + "loss": 1.2669, + "step": 8896 + }, + { + "epoch": 0.06, + "grad_norm": 5.065217443313155, + "learning_rate": 1.998887921567224e-06, + "loss": 1.5629, + "step": 8897 + }, + { + "epoch": 0.06, + "grad_norm": 4.478375261594609, + "learning_rate": 1.9988876713287232e-06, + "loss": 1.3335, + "step": 8898 + }, + { + "epoch": 0.06, + "grad_norm": 5.407675825978634, + "learning_rate": 1.998887421062087e-06, + "loss": 1.4492, + "step": 8899 + }, + { + "epoch": 0.06, + "grad_norm": 5.200237987636907, + "learning_rate": 1.998887170767316e-06, + "loss": 1.2939, + "step": 8900 + }, + { + "epoch": 0.06, + "grad_norm": 4.532814601759366, + "learning_rate": 1.99888692044441e-06, + "loss": 1.3333, + "step": 8901 + }, + { + "epoch": 0.06, + "grad_norm": 4.465312311772244, + "learning_rate": 1.9988866700933678e-06, + "loss": 1.4924, + "step": 8902 + }, + { + "epoch": 0.06, + "grad_norm": 4.466904480519897, + "learning_rate": 1.9988864197141907e-06, + "loss": 1.3114, + "step": 8903 + }, + { + "epoch": 0.06, + "grad_norm": 4.682485277654333, + "learning_rate": 1.9988861693068785e-06, + "loss": 1.3864, + "step": 8904 + }, + { + "epoch": 0.06, + "grad_norm": 9.31636658719417, + "learning_rate": 1.9988859188714305e-06, + "loss": 1.281, + "step": 8905 + }, + { + "epoch": 0.06, + "grad_norm": 4.914615163990817, + "learning_rate": 1.9988856684078474e-06, + "loss": 1.5427, + "step": 8906 + }, + { + "epoch": 0.06, + "eval_loss": 1.5543663501739502, + "eval_runtime": 4.5787, + "eval_samples_per_second": 1.966, + "eval_steps_per_second": 1.092, + "step": 8906 + }, + { + "epoch": 0.06, + "grad_norm": 4.489190529489661, + "learning_rate": 1.998885417916129e-06, + "loss": 1.3335, + "step": 8907 + }, + { + "epoch": 0.06, + "grad_norm": 4.4726409644589085, + "learning_rate": 1.9988851673962756e-06, + "loss": 1.4104, + "step": 8908 + }, + { + "epoch": 0.06, + "grad_norm": 4.5195462156905455, + "learning_rate": 1.9988849168482865e-06, + "loss": 1.309, + "step": 8909 + }, + { + "epoch": 0.06, + "grad_norm": 4.652669844355484, + "learning_rate": 1.9988846662721624e-06, + "loss": 1.421, + "step": 8910 + }, + { + "epoch": 0.06, + "grad_norm": 4.927666429060232, + "learning_rate": 1.998884415667903e-06, + "loss": 1.248, + "step": 8911 + }, + { + "epoch": 0.06, + "grad_norm": 5.027081136036229, + "learning_rate": 1.9988841650355086e-06, + "loss": 1.2158, + "step": 8912 + }, + { + "epoch": 0.06, + "grad_norm": 4.2101052194981, + "learning_rate": 1.9988839143749784e-06, + "loss": 1.3401, + "step": 8913 + }, + { + "epoch": 0.06, + "grad_norm": 4.366447884123436, + "learning_rate": 1.9988836636863137e-06, + "loss": 1.4023, + "step": 8914 + }, + { + "epoch": 0.06, + "grad_norm": 4.603617986530109, + "learning_rate": 1.998883412969513e-06, + "loss": 1.3939, + "step": 8915 + }, + { + "epoch": 0.06, + "grad_norm": 4.781767465595025, + "learning_rate": 1.9988831622245774e-06, + "loss": 1.4682, + "step": 8916 + }, + { + "epoch": 0.06, + "grad_norm": 5.207559511450623, + "learning_rate": 1.9988829114515067e-06, + "loss": 1.2992, + "step": 8917 + }, + { + "epoch": 0.06, + "grad_norm": 10.133034512086452, + "learning_rate": 1.9988826606503005e-06, + "loss": 1.3581, + "step": 8918 + }, + { + "epoch": 0.06, + "grad_norm": 4.281864915731136, + "learning_rate": 1.9988824098209597e-06, + "loss": 1.3436, + "step": 8919 + }, + { + "epoch": 0.06, + "grad_norm": 4.361095459871603, + "learning_rate": 1.9988821589634834e-06, + "loss": 1.3927, + "step": 8920 + }, + { + "epoch": 0.06, + "grad_norm": 4.3010799083719835, + "learning_rate": 1.998881908077871e-06, + "loss": 1.3578, + "step": 8921 + }, + { + "epoch": 0.06, + "grad_norm": 4.1188084511681655, + "learning_rate": 1.998881657164125e-06, + "loss": 1.2002, + "step": 8922 + }, + { + "epoch": 0.06, + "grad_norm": 4.5025913689646835, + "learning_rate": 1.998881406222243e-06, + "loss": 1.329, + "step": 8923 + }, + { + "epoch": 0.06, + "grad_norm": 4.529673633173111, + "learning_rate": 1.9988811552522256e-06, + "loss": 1.3608, + "step": 8924 + }, + { + "epoch": 0.06, + "grad_norm": 4.569770946380013, + "learning_rate": 1.9988809042540737e-06, + "loss": 1.2989, + "step": 8925 + }, + { + "epoch": 0.06, + "grad_norm": 5.557240589904327, + "learning_rate": 1.998880653227786e-06, + "loss": 1.1284, + "step": 8926 + }, + { + "epoch": 0.06, + "grad_norm": 4.28813762250133, + "learning_rate": 1.9988804021733634e-06, + "loss": 1.396, + "step": 8927 + }, + { + "epoch": 0.06, + "grad_norm": 5.400184757435649, + "learning_rate": 1.998880151090806e-06, + "loss": 1.2865, + "step": 8928 + }, + { + "epoch": 0.06, + "grad_norm": 6.339021754186371, + "learning_rate": 1.998879899980113e-06, + "loss": 1.3962, + "step": 8929 + }, + { + "epoch": 0.06, + "grad_norm": 4.314780556483214, + "learning_rate": 1.998879648841285e-06, + "loss": 1.4211, + "step": 8930 + }, + { + "epoch": 0.06, + "grad_norm": 4.594912157470313, + "learning_rate": 1.998879397674322e-06, + "loss": 1.4905, + "step": 8931 + }, + { + "epoch": 0.06, + "grad_norm": 5.106468959389977, + "learning_rate": 1.9988791464792237e-06, + "loss": 1.5001, + "step": 8932 + }, + { + "epoch": 0.06, + "grad_norm": 4.4737293133162135, + "learning_rate": 1.9988788952559905e-06, + "loss": 1.399, + "step": 8933 + }, + { + "epoch": 0.06, + "grad_norm": 4.652228634028416, + "learning_rate": 1.998878644004622e-06, + "loss": 1.3587, + "step": 8934 + }, + { + "epoch": 0.06, + "grad_norm": 4.45502457916985, + "learning_rate": 1.9988783927251187e-06, + "loss": 1.4417, + "step": 8935 + }, + { + "epoch": 0.06, + "grad_norm": 4.445395169182832, + "learning_rate": 1.99887814141748e-06, + "loss": 1.346, + "step": 8936 + }, + { + "epoch": 0.06, + "grad_norm": 5.132407819507094, + "learning_rate": 1.998877890081706e-06, + "loss": 1.4948, + "step": 8937 + }, + { + "epoch": 0.06, + "grad_norm": 4.588242467876485, + "learning_rate": 1.9988776387177973e-06, + "loss": 1.345, + "step": 8938 + }, + { + "epoch": 0.06, + "grad_norm": 6.067830352222538, + "learning_rate": 1.998877387325754e-06, + "loss": 1.5489, + "step": 8939 + }, + { + "epoch": 0.06, + "grad_norm": 4.714850688819917, + "learning_rate": 1.998877135905575e-06, + "loss": 1.3105, + "step": 8940 + }, + { + "epoch": 0.06, + "grad_norm": 4.1786756721600335, + "learning_rate": 1.998876884457261e-06, + "loss": 1.3166, + "step": 8941 + }, + { + "epoch": 0.06, + "grad_norm": 4.459295861189716, + "learning_rate": 1.998876632980812e-06, + "loss": 1.3105, + "step": 8942 + }, + { + "epoch": 0.06, + "grad_norm": 4.2369627871224065, + "learning_rate": 1.9988763814762284e-06, + "loss": 1.0185, + "step": 8943 + }, + { + "epoch": 0.06, + "grad_norm": 4.479324234333639, + "learning_rate": 1.9988761299435093e-06, + "loss": 1.3638, + "step": 8944 + }, + { + "epoch": 0.06, + "grad_norm": 5.447393268254241, + "learning_rate": 1.998875878382655e-06, + "loss": 1.2788, + "step": 8945 + }, + { + "epoch": 0.06, + "grad_norm": 4.5828523366326595, + "learning_rate": 1.9988756267936664e-06, + "loss": 1.3339, + "step": 8946 + }, + { + "epoch": 0.06, + "grad_norm": 4.325089835344213, + "learning_rate": 1.998875375176542e-06, + "loss": 1.2877, + "step": 8947 + }, + { + "epoch": 0.06, + "grad_norm": 4.564531541461375, + "learning_rate": 1.9988751235312833e-06, + "loss": 1.2966, + "step": 8948 + }, + { + "epoch": 0.06, + "grad_norm": 4.296073373679613, + "learning_rate": 1.9988748718578894e-06, + "loss": 1.4188, + "step": 8949 + }, + { + "epoch": 0.06, + "grad_norm": 4.32396496786447, + "learning_rate": 1.9988746201563605e-06, + "loss": 1.4169, + "step": 8950 + }, + { + "epoch": 0.06, + "grad_norm": 4.138408818532786, + "learning_rate": 1.9988743684266965e-06, + "loss": 1.2979, + "step": 8951 + }, + { + "epoch": 0.06, + "grad_norm": 4.232780116774831, + "learning_rate": 1.9988741166688974e-06, + "loss": 1.2763, + "step": 8952 + }, + { + "epoch": 0.06, + "grad_norm": 4.601664517313853, + "learning_rate": 1.9988738648829638e-06, + "loss": 1.3497, + "step": 8953 + }, + { + "epoch": 0.06, + "grad_norm": 6.916078946149807, + "learning_rate": 1.9988736130688946e-06, + "loss": 1.2813, + "step": 8954 + }, + { + "epoch": 0.06, + "grad_norm": 4.5223440926608065, + "learning_rate": 1.9988733612266913e-06, + "loss": 1.3428, + "step": 8955 + }, + { + "epoch": 0.06, + "grad_norm": 4.11933876690971, + "learning_rate": 1.9988731093563525e-06, + "loss": 1.1801, + "step": 8956 + }, + { + "epoch": 0.06, + "grad_norm": 4.77892602384412, + "learning_rate": 1.9988728574578786e-06, + "loss": 1.3533, + "step": 8957 + }, + { + "epoch": 0.06, + "grad_norm": 4.491787013798658, + "learning_rate": 1.99887260553127e-06, + "loss": 1.3315, + "step": 8958 + }, + { + "epoch": 0.06, + "grad_norm": 4.463416607886815, + "learning_rate": 1.9988723535765266e-06, + "loss": 1.3055, + "step": 8959 + }, + { + "epoch": 0.06, + "grad_norm": 4.466109919275049, + "learning_rate": 1.998872101593648e-06, + "loss": 1.336, + "step": 8960 + }, + { + "epoch": 0.06, + "grad_norm": 5.08836560764422, + "learning_rate": 1.9988718495826353e-06, + "loss": 1.4302, + "step": 8961 + }, + { + "epoch": 0.06, + "grad_norm": 4.711433007796799, + "learning_rate": 1.998871597543487e-06, + "loss": 1.3982, + "step": 8962 + }, + { + "epoch": 0.06, + "grad_norm": 4.368650154835614, + "learning_rate": 1.998871345476204e-06, + "loss": 1.2549, + "step": 8963 + }, + { + "epoch": 0.06, + "grad_norm": 4.796732836900713, + "learning_rate": 1.998871093380786e-06, + "loss": 1.3864, + "step": 8964 + }, + { + "epoch": 0.06, + "grad_norm": 4.238552048880954, + "learning_rate": 1.998870841257233e-06, + "loss": 1.4041, + "step": 8965 + }, + { + "epoch": 0.06, + "grad_norm": 4.842170994219262, + "learning_rate": 1.998870589105545e-06, + "loss": 1.317, + "step": 8966 + }, + { + "epoch": 0.06, + "grad_norm": 4.452796361111205, + "learning_rate": 1.9988703369257223e-06, + "loss": 1.4161, + "step": 8967 + }, + { + "epoch": 0.06, + "grad_norm": 4.788521128772244, + "learning_rate": 1.998870084717765e-06, + "loss": 1.3019, + "step": 8968 + }, + { + "epoch": 0.06, + "grad_norm": 4.328572650147352, + "learning_rate": 1.9988698324816728e-06, + "loss": 1.2451, + "step": 8969 + }, + { + "epoch": 0.06, + "grad_norm": 10.840972499942911, + "learning_rate": 1.9988695802174454e-06, + "loss": 1.4126, + "step": 8970 + }, + { + "epoch": 0.06, + "grad_norm": 4.288973331266906, + "learning_rate": 1.9988693279250835e-06, + "loss": 1.2919, + "step": 8971 + }, + { + "epoch": 0.06, + "grad_norm": 4.059719407412008, + "learning_rate": 1.998869075604587e-06, + "loss": 1.2728, + "step": 8972 + }, + { + "epoch": 0.06, + "grad_norm": 6.451005985306049, + "learning_rate": 1.9988688232559553e-06, + "loss": 1.6012, + "step": 8973 + }, + { + "epoch": 0.06, + "grad_norm": 4.3410327220214855, + "learning_rate": 1.998868570879189e-06, + "loss": 1.2263, + "step": 8974 + }, + { + "epoch": 0.06, + "grad_norm": 3.999191136292253, + "learning_rate": 1.9988683184742877e-06, + "loss": 1.2986, + "step": 8975 + }, + { + "epoch": 0.06, + "grad_norm": 4.572950767432566, + "learning_rate": 1.998868066041252e-06, + "loss": 1.4045, + "step": 8976 + }, + { + "epoch": 0.06, + "grad_norm": 4.767952435722493, + "learning_rate": 1.998867813580081e-06, + "loss": 1.5217, + "step": 8977 + }, + { + "epoch": 0.06, + "grad_norm": 5.11152585487058, + "learning_rate": 1.9988675610907752e-06, + "loss": 1.4652, + "step": 8978 + }, + { + "epoch": 0.06, + "grad_norm": 4.518933749702347, + "learning_rate": 1.998867308573335e-06, + "loss": 1.5219, + "step": 8979 + }, + { + "epoch": 0.06, + "eval_loss": 1.5553007125854492, + "eval_runtime": 4.6042, + "eval_samples_per_second": 1.955, + "eval_steps_per_second": 1.086, + "step": 8979 + }, + { + "epoch": 0.06, + "grad_norm": 5.60210528703573, + "learning_rate": 1.9988670560277598e-06, + "loss": 1.506, + "step": 8980 + }, + { + "epoch": 0.06, + "grad_norm": 4.660456001265746, + "learning_rate": 1.99886680345405e-06, + "loss": 1.3371, + "step": 8981 + }, + { + "epoch": 0.06, + "grad_norm": 4.851341565771996, + "learning_rate": 1.998866550852205e-06, + "loss": 1.3738, + "step": 8982 + }, + { + "epoch": 0.06, + "grad_norm": 4.79938957161957, + "learning_rate": 1.998866298222226e-06, + "loss": 1.116, + "step": 8983 + }, + { + "epoch": 0.06, + "grad_norm": 4.38864501046462, + "learning_rate": 1.9988660455641117e-06, + "loss": 1.2905, + "step": 8984 + }, + { + "epoch": 0.06, + "grad_norm": 4.255559335307141, + "learning_rate": 1.998865792877863e-06, + "loss": 1.4645, + "step": 8985 + }, + { + "epoch": 0.06, + "grad_norm": 4.394914108937541, + "learning_rate": 1.998865540163479e-06, + "loss": 1.3422, + "step": 8986 + }, + { + "epoch": 0.06, + "grad_norm": 4.549130965535259, + "learning_rate": 1.9988652874209606e-06, + "loss": 1.3085, + "step": 8987 + }, + { + "epoch": 0.06, + "grad_norm": 4.313659621286637, + "learning_rate": 1.998865034650308e-06, + "loss": 1.26, + "step": 8988 + }, + { + "epoch": 0.06, + "grad_norm": 4.489065298698566, + "learning_rate": 1.99886478185152e-06, + "loss": 1.3966, + "step": 8989 + }, + { + "epoch": 0.06, + "grad_norm": 4.267065028983889, + "learning_rate": 1.9988645290245974e-06, + "loss": 1.3753, + "step": 8990 + }, + { + "epoch": 0.06, + "grad_norm": 4.947449398550815, + "learning_rate": 1.99886427616954e-06, + "loss": 1.4579, + "step": 8991 + }, + { + "epoch": 0.06, + "grad_norm": 5.558811076207193, + "learning_rate": 1.9988640232863485e-06, + "loss": 1.3795, + "step": 8992 + }, + { + "epoch": 0.06, + "grad_norm": 5.0790335795573895, + "learning_rate": 1.998863770375022e-06, + "loss": 1.4484, + "step": 8993 + }, + { + "epoch": 0.06, + "grad_norm": 5.0138216757250476, + "learning_rate": 1.998863517435561e-06, + "loss": 1.3437, + "step": 8994 + }, + { + "epoch": 0.06, + "grad_norm": 4.5056106488928895, + "learning_rate": 1.9988632644679647e-06, + "loss": 1.4137, + "step": 8995 + }, + { + "epoch": 0.06, + "grad_norm": 4.602550611027919, + "learning_rate": 1.9988630114722346e-06, + "loss": 1.4645, + "step": 8996 + }, + { + "epoch": 0.06, + "grad_norm": 4.661575315677727, + "learning_rate": 1.998862758448369e-06, + "loss": 1.3124, + "step": 8997 + }, + { + "epoch": 0.06, + "grad_norm": 4.351667010009662, + "learning_rate": 1.9988625053963694e-06, + "loss": 1.4001, + "step": 8998 + }, + { + "epoch": 0.06, + "grad_norm": 4.483493069528355, + "learning_rate": 1.9988622523162354e-06, + "loss": 1.477, + "step": 8999 + }, + { + "epoch": 0.06, + "grad_norm": 4.167612226402738, + "learning_rate": 1.998861999207966e-06, + "loss": 1.2781, + "step": 9000 + }, + { + "epoch": 0.06, + "grad_norm": 4.750547480446069, + "learning_rate": 1.9988617460715624e-06, + "loss": 1.3683, + "step": 9001 + }, + { + "epoch": 0.06, + "grad_norm": 5.292162839259224, + "learning_rate": 1.998861492907024e-06, + "loss": 1.5366, + "step": 9002 + }, + { + "epoch": 0.06, + "grad_norm": 4.609356748192954, + "learning_rate": 1.9988612397143513e-06, + "loss": 1.2481, + "step": 9003 + }, + { + "epoch": 0.06, + "grad_norm": 4.554353075504676, + "learning_rate": 1.998860986493544e-06, + "loss": 1.2943, + "step": 9004 + }, + { + "epoch": 0.06, + "grad_norm": 4.4289470785945095, + "learning_rate": 1.9988607332446017e-06, + "loss": 1.3443, + "step": 9005 + }, + { + "epoch": 0.06, + "grad_norm": 4.600551169801115, + "learning_rate": 1.998860479967525e-06, + "loss": 1.4291, + "step": 9006 + }, + { + "epoch": 0.06, + "grad_norm": 4.668299697183073, + "learning_rate": 1.9988602266623137e-06, + "loss": 1.5173, + "step": 9007 + }, + { + "epoch": 0.06, + "grad_norm": 4.231037382439115, + "learning_rate": 1.998859973328968e-06, + "loss": 1.3386, + "step": 9008 + }, + { + "epoch": 0.06, + "grad_norm": 6.297794802363379, + "learning_rate": 1.9988597199674875e-06, + "loss": 1.4288, + "step": 9009 + }, + { + "epoch": 0.06, + "grad_norm": 4.868180986069198, + "learning_rate": 1.9988594665778727e-06, + "loss": 1.2063, + "step": 9010 + }, + { + "epoch": 0.06, + "grad_norm": 5.362600955403026, + "learning_rate": 1.9988592131601233e-06, + "loss": 1.3834, + "step": 9011 + }, + { + "epoch": 0.06, + "grad_norm": 4.407803982727705, + "learning_rate": 1.9988589597142393e-06, + "loss": 1.417, + "step": 9012 + }, + { + "epoch": 0.06, + "grad_norm": 4.7563480578361075, + "learning_rate": 1.9988587062402206e-06, + "loss": 1.594, + "step": 9013 + }, + { + "epoch": 0.06, + "grad_norm": 4.646984395306063, + "learning_rate": 1.9988584527380678e-06, + "loss": 1.3546, + "step": 9014 + }, + { + "epoch": 0.06, + "grad_norm": 4.6616927620878394, + "learning_rate": 1.99885819920778e-06, + "loss": 1.4772, + "step": 9015 + }, + { + "epoch": 0.06, + "grad_norm": 4.578605251102135, + "learning_rate": 1.9988579456493577e-06, + "loss": 1.5212, + "step": 9016 + }, + { + "epoch": 0.06, + "grad_norm": 4.539410903890842, + "learning_rate": 1.9988576920628014e-06, + "loss": 1.3062, + "step": 9017 + }, + { + "epoch": 0.06, + "grad_norm": 4.06144243687616, + "learning_rate": 1.99885743844811e-06, + "loss": 1.2502, + "step": 9018 + }, + { + "epoch": 0.06, + "grad_norm": 5.440801598505531, + "learning_rate": 1.9988571848052845e-06, + "loss": 1.3371, + "step": 9019 + }, + { + "epoch": 0.06, + "grad_norm": 4.586879669813342, + "learning_rate": 1.9988569311343244e-06, + "loss": 1.3869, + "step": 9020 + }, + { + "epoch": 0.06, + "grad_norm": 5.482282733225696, + "learning_rate": 1.99885667743523e-06, + "loss": 1.404, + "step": 9021 + }, + { + "epoch": 0.06, + "grad_norm": 5.031821009158009, + "learning_rate": 1.9988564237080006e-06, + "loss": 1.6249, + "step": 9022 + }, + { + "epoch": 0.06, + "grad_norm": 4.962595085343715, + "learning_rate": 1.9988561699526374e-06, + "loss": 1.3203, + "step": 9023 + }, + { + "epoch": 0.06, + "grad_norm": 5.694113615090797, + "learning_rate": 1.9988559161691396e-06, + "loss": 1.3867, + "step": 9024 + }, + { + "epoch": 0.06, + "grad_norm": 4.742365519040978, + "learning_rate": 1.9988556623575067e-06, + "loss": 1.4539, + "step": 9025 + }, + { + "epoch": 0.06, + "grad_norm": 4.830335538176732, + "learning_rate": 1.99885540851774e-06, + "loss": 1.3798, + "step": 9026 + }, + { + "epoch": 0.06, + "grad_norm": 5.276387047185486, + "learning_rate": 1.998855154649839e-06, + "loss": 1.4987, + "step": 9027 + }, + { + "epoch": 0.06, + "grad_norm": 4.853044083121314, + "learning_rate": 1.9988549007538033e-06, + "loss": 1.3321, + "step": 9028 + }, + { + "epoch": 0.06, + "grad_norm": 4.284981764679088, + "learning_rate": 1.998854646829633e-06, + "loss": 0.9964, + "step": 9029 + }, + { + "epoch": 0.06, + "grad_norm": 4.346686675214998, + "learning_rate": 1.9988543928773285e-06, + "loss": 1.4307, + "step": 9030 + }, + { + "epoch": 0.06, + "grad_norm": 4.495526724877368, + "learning_rate": 1.9988541388968896e-06, + "loss": 1.4653, + "step": 9031 + }, + { + "epoch": 0.06, + "grad_norm": 4.239632068789658, + "learning_rate": 1.998853884888316e-06, + "loss": 1.3582, + "step": 9032 + }, + { + "epoch": 0.06, + "grad_norm": 4.98748674096309, + "learning_rate": 1.9988536308516084e-06, + "loss": 1.2501, + "step": 9033 + }, + { + "epoch": 0.06, + "grad_norm": 5.556394646926006, + "learning_rate": 1.998853376786766e-06, + "loss": 1.1305, + "step": 9034 + }, + { + "epoch": 0.06, + "grad_norm": 4.346098874865625, + "learning_rate": 1.99885312269379e-06, + "loss": 1.2558, + "step": 9035 + }, + { + "epoch": 0.06, + "grad_norm": 4.494707023554734, + "learning_rate": 1.9988528685726787e-06, + "loss": 1.3644, + "step": 9036 + }, + { + "epoch": 0.06, + "grad_norm": 6.874976902701597, + "learning_rate": 1.9988526144234338e-06, + "loss": 1.3911, + "step": 9037 + }, + { + "epoch": 0.06, + "grad_norm": 5.248752040120656, + "learning_rate": 1.998852360246054e-06, + "loss": 1.4355, + "step": 9038 + }, + { + "epoch": 0.06, + "grad_norm": 5.165077934887538, + "learning_rate": 1.99885210604054e-06, + "loss": 1.4702, + "step": 9039 + }, + { + "epoch": 0.06, + "grad_norm": 4.980124759726575, + "learning_rate": 1.9988518518068916e-06, + "loss": 1.3591, + "step": 9040 + }, + { + "epoch": 0.06, + "grad_norm": 4.67548279943575, + "learning_rate": 1.9988515975451094e-06, + "loss": 1.3922, + "step": 9041 + }, + { + "epoch": 0.06, + "grad_norm": 4.63766518016973, + "learning_rate": 1.9988513432551922e-06, + "loss": 1.4185, + "step": 9042 + }, + { + "epoch": 0.06, + "grad_norm": 4.782559994653726, + "learning_rate": 1.998851088937141e-06, + "loss": 1.3569, + "step": 9043 + }, + { + "epoch": 0.06, + "grad_norm": 4.357415285535007, + "learning_rate": 1.998850834590955e-06, + "loss": 1.3202, + "step": 9044 + }, + { + "epoch": 0.06, + "grad_norm": 4.189087610099267, + "learning_rate": 1.9988505802166354e-06, + "loss": 1.301, + "step": 9045 + }, + { + "epoch": 0.06, + "grad_norm": 4.570560161610986, + "learning_rate": 1.9988503258141814e-06, + "loss": 1.3757, + "step": 9046 + }, + { + "epoch": 0.06, + "grad_norm": 4.307296931492565, + "learning_rate": 1.9988500713835927e-06, + "loss": 1.4011, + "step": 9047 + }, + { + "epoch": 0.06, + "grad_norm": 3.9675940034991855, + "learning_rate": 1.9988498169248703e-06, + "loss": 1.1557, + "step": 9048 + }, + { + "epoch": 0.06, + "grad_norm": 4.291378655674192, + "learning_rate": 1.9988495624380133e-06, + "loss": 1.324, + "step": 9049 + }, + { + "epoch": 0.06, + "grad_norm": 8.532552952167226, + "learning_rate": 1.9988493079230216e-06, + "loss": 1.4429, + "step": 9050 + }, + { + "epoch": 0.06, + "grad_norm": 5.493875214686658, + "learning_rate": 1.998849053379896e-06, + "loss": 1.5635, + "step": 9051 + }, + { + "epoch": 0.06, + "grad_norm": 4.792197174638157, + "learning_rate": 1.9988487988086365e-06, + "loss": 1.3605, + "step": 9052 + }, + { + "epoch": 0.06, + "eval_loss": 1.5568945407867432, + "eval_runtime": 4.5932, + "eval_samples_per_second": 1.959, + "eval_steps_per_second": 1.089, + "step": 9052 + }, + { + "epoch": 0.06, + "grad_norm": 4.6190225029601875, + "learning_rate": 1.9988485442092423e-06, + "loss": 1.2421, + "step": 9053 + }, + { + "epoch": 0.06, + "grad_norm": 4.535802137716677, + "learning_rate": 1.998848289581714e-06, + "loss": 1.3614, + "step": 9054 + }, + { + "epoch": 0.06, + "grad_norm": 8.31164491897128, + "learning_rate": 1.9988480349260516e-06, + "loss": 1.2074, + "step": 9055 + }, + { + "epoch": 0.06, + "grad_norm": 4.1293617867803825, + "learning_rate": 1.9988477802422547e-06, + "loss": 1.2049, + "step": 9056 + }, + { + "epoch": 0.06, + "grad_norm": 5.348364919042677, + "learning_rate": 1.9988475255303237e-06, + "loss": 1.4955, + "step": 9057 + }, + { + "epoch": 0.06, + "grad_norm": 4.803246772346318, + "learning_rate": 1.9988472707902584e-06, + "loss": 1.3898, + "step": 9058 + }, + { + "epoch": 0.06, + "grad_norm": 5.098680525051035, + "learning_rate": 1.9988470160220594e-06, + "loss": 1.5915, + "step": 9059 + }, + { + "epoch": 0.06, + "grad_norm": 5.970280760389105, + "learning_rate": 1.9988467612257253e-06, + "loss": 1.313, + "step": 9060 + }, + { + "epoch": 0.06, + "grad_norm": 4.667421976546424, + "learning_rate": 1.9988465064012575e-06, + "loss": 1.3201, + "step": 9061 + }, + { + "epoch": 0.06, + "grad_norm": 7.401429763170494, + "learning_rate": 1.998846251548656e-06, + "loss": 1.3229, + "step": 9062 + }, + { + "epoch": 0.06, + "grad_norm": 4.785533056094324, + "learning_rate": 1.9988459966679196e-06, + "loss": 1.3631, + "step": 9063 + }, + { + "epoch": 0.06, + "grad_norm": 4.487519182889941, + "learning_rate": 1.998845741759049e-06, + "loss": 1.437, + "step": 9064 + }, + { + "epoch": 0.06, + "grad_norm": 5.13642628749504, + "learning_rate": 1.9988454868220445e-06, + "loss": 1.37, + "step": 9065 + }, + { + "epoch": 0.06, + "grad_norm": 4.9419697819098625, + "learning_rate": 1.998845231856906e-06, + "loss": 1.3161, + "step": 9066 + }, + { + "epoch": 0.06, + "grad_norm": 4.420915488648716, + "learning_rate": 1.9988449768636334e-06, + "loss": 1.373, + "step": 9067 + }, + { + "epoch": 0.06, + "grad_norm": 4.221702918453699, + "learning_rate": 1.9988447218422266e-06, + "loss": 1.2426, + "step": 9068 + }, + { + "epoch": 0.06, + "grad_norm": 4.740525474146342, + "learning_rate": 1.998844466792685e-06, + "loss": 1.2266, + "step": 9069 + }, + { + "epoch": 0.06, + "grad_norm": 5.090429617574333, + "learning_rate": 1.99884421171501e-06, + "loss": 1.3431, + "step": 9070 + }, + { + "epoch": 0.06, + "grad_norm": 4.715382170229637, + "learning_rate": 1.998843956609201e-06, + "loss": 1.3327, + "step": 9071 + }, + { + "epoch": 0.06, + "grad_norm": 4.4022969934206895, + "learning_rate": 1.9988437014752573e-06, + "loss": 1.3137, + "step": 9072 + }, + { + "epoch": 0.06, + "grad_norm": 5.181835744906127, + "learning_rate": 1.99884344631318e-06, + "loss": 1.549, + "step": 9073 + }, + { + "epoch": 0.06, + "grad_norm": 4.819269662989522, + "learning_rate": 1.9988431911229683e-06, + "loss": 1.282, + "step": 9074 + }, + { + "epoch": 0.06, + "grad_norm": 4.363681009387404, + "learning_rate": 1.9988429359046225e-06, + "loss": 1.4101, + "step": 9075 + }, + { + "epoch": 0.06, + "grad_norm": 4.7055582389899175, + "learning_rate": 1.9988426806581425e-06, + "loss": 1.4084, + "step": 9076 + }, + { + "epoch": 0.06, + "grad_norm": 4.124273417076292, + "learning_rate": 1.9988424253835288e-06, + "loss": 1.2397, + "step": 9077 + }, + { + "epoch": 0.06, + "grad_norm": 5.231691072861776, + "learning_rate": 1.998842170080781e-06, + "loss": 1.4207, + "step": 9078 + }, + { + "epoch": 0.06, + "grad_norm": 5.1471474759237426, + "learning_rate": 1.998841914749899e-06, + "loss": 1.514, + "step": 9079 + }, + { + "epoch": 0.06, + "grad_norm": 4.661508206419235, + "learning_rate": 1.9988416593908827e-06, + "loss": 1.4136, + "step": 9080 + }, + { + "epoch": 0.06, + "grad_norm": 4.768479744757336, + "learning_rate": 1.9988414040037325e-06, + "loss": 1.3385, + "step": 9081 + }, + { + "epoch": 0.06, + "grad_norm": 4.741697760571189, + "learning_rate": 1.9988411485884486e-06, + "loss": 1.4071, + "step": 9082 + }, + { + "epoch": 0.06, + "grad_norm": 4.074112572353006, + "learning_rate": 1.99884089314503e-06, + "loss": 1.2509, + "step": 9083 + }, + { + "epoch": 0.06, + "grad_norm": 6.175344434197922, + "learning_rate": 1.998840637673478e-06, + "loss": 1.3827, + "step": 9084 + }, + { + "epoch": 0.06, + "grad_norm": 6.016929703788691, + "learning_rate": 1.998840382173792e-06, + "loss": 1.3087, + "step": 9085 + }, + { + "epoch": 0.06, + "grad_norm": 6.244994069291114, + "learning_rate": 1.9988401266459718e-06, + "loss": 1.4782, + "step": 9086 + }, + { + "epoch": 0.06, + "grad_norm": 4.3925647532790215, + "learning_rate": 1.9988398710900173e-06, + "loss": 1.2169, + "step": 9087 + }, + { + "epoch": 0.06, + "grad_norm": 4.522437735505425, + "learning_rate": 1.998839615505929e-06, + "loss": 1.4083, + "step": 9088 + }, + { + "epoch": 0.06, + "grad_norm": 4.819405069864338, + "learning_rate": 1.998839359893707e-06, + "loss": 1.2948, + "step": 9089 + }, + { + "epoch": 0.06, + "grad_norm": 4.763161479989202, + "learning_rate": 1.9988391042533507e-06, + "loss": 1.2271, + "step": 9090 + }, + { + "epoch": 0.06, + "grad_norm": 4.565434660115241, + "learning_rate": 1.9988388485848607e-06, + "loss": 1.3826, + "step": 9091 + }, + { + "epoch": 0.06, + "grad_norm": 5.219237169197936, + "learning_rate": 1.9988385928882365e-06, + "loss": 1.2761, + "step": 9092 + }, + { + "epoch": 0.06, + "grad_norm": 6.063411684504128, + "learning_rate": 1.998838337163478e-06, + "loss": 1.3223, + "step": 9093 + }, + { + "epoch": 0.06, + "grad_norm": 4.479134736791678, + "learning_rate": 1.9988380814105863e-06, + "loss": 1.3446, + "step": 9094 + }, + { + "epoch": 0.06, + "grad_norm": 5.422095452244349, + "learning_rate": 1.9988378256295603e-06, + "loss": 1.3384, + "step": 9095 + }, + { + "epoch": 0.06, + "grad_norm": 4.84737701951572, + "learning_rate": 1.9988375698204e-06, + "loss": 1.2363, + "step": 9096 + }, + { + "epoch": 0.06, + "grad_norm": 4.382312669869536, + "learning_rate": 1.9988373139831066e-06, + "loss": 1.4566, + "step": 9097 + }, + { + "epoch": 0.06, + "grad_norm": 4.013224196049158, + "learning_rate": 1.9988370581176785e-06, + "loss": 1.3884, + "step": 9098 + }, + { + "epoch": 0.06, + "grad_norm": 4.818846037522861, + "learning_rate": 1.998836802224117e-06, + "loss": 1.2727, + "step": 9099 + }, + { + "epoch": 0.06, + "grad_norm": 4.407544132466331, + "learning_rate": 1.9988365463024213e-06, + "loss": 1.3083, + "step": 9100 + }, + { + "epoch": 0.06, + "grad_norm": 4.944094663921774, + "learning_rate": 1.9988362903525914e-06, + "loss": 1.4612, + "step": 9101 + }, + { + "epoch": 0.06, + "grad_norm": 4.6406257423464705, + "learning_rate": 1.998836034374628e-06, + "loss": 1.2318, + "step": 9102 + }, + { + "epoch": 0.06, + "grad_norm": 4.484612273315136, + "learning_rate": 1.998835778368531e-06, + "loss": 1.2144, + "step": 9103 + }, + { + "epoch": 0.06, + "grad_norm": 6.531856078046369, + "learning_rate": 1.9988355223342995e-06, + "loss": 1.2987, + "step": 9104 + }, + { + "epoch": 0.06, + "grad_norm": 4.655267378407986, + "learning_rate": 1.9988352662719345e-06, + "loss": 1.5162, + "step": 9105 + }, + { + "epoch": 0.06, + "grad_norm": 5.095374997853681, + "learning_rate": 1.9988350101814357e-06, + "loss": 1.3566, + "step": 9106 + }, + { + "epoch": 0.06, + "grad_norm": 5.857680330258707, + "learning_rate": 1.9988347540628027e-06, + "loss": 1.5086, + "step": 9107 + }, + { + "epoch": 0.06, + "grad_norm": 4.653106006016631, + "learning_rate": 1.9988344979160364e-06, + "loss": 1.3639, + "step": 9108 + }, + { + "epoch": 0.06, + "grad_norm": 4.441432248123494, + "learning_rate": 1.998834241741136e-06, + "loss": 1.3261, + "step": 9109 + }, + { + "epoch": 0.06, + "grad_norm": 5.087121192776212, + "learning_rate": 1.998833985538101e-06, + "loss": 1.4353, + "step": 9110 + }, + { + "epoch": 0.06, + "grad_norm": 4.11190563382986, + "learning_rate": 1.9988337293069335e-06, + "loss": 1.2746, + "step": 9111 + }, + { + "epoch": 0.06, + "grad_norm": 6.716606119826262, + "learning_rate": 1.9988334730476312e-06, + "loss": 1.3709, + "step": 9112 + }, + { + "epoch": 0.06, + "grad_norm": 4.630104371625799, + "learning_rate": 1.998833216760195e-06, + "loss": 1.3448, + "step": 9113 + }, + { + "epoch": 0.06, + "grad_norm": 4.941266886240506, + "learning_rate": 1.9988329604446258e-06, + "loss": 1.2529, + "step": 9114 + }, + { + "epoch": 0.06, + "grad_norm": 4.663865704742076, + "learning_rate": 1.998832704100922e-06, + "loss": 1.3354, + "step": 9115 + }, + { + "epoch": 0.06, + "grad_norm": 4.614423454119124, + "learning_rate": 1.998832447729085e-06, + "loss": 1.5157, + "step": 9116 + }, + { + "epoch": 0.06, + "grad_norm": 4.493157455094239, + "learning_rate": 1.998832191329114e-06, + "loss": 1.3067, + "step": 9117 + }, + { + "epoch": 0.06, + "grad_norm": 4.574117095039273, + "learning_rate": 1.998831934901009e-06, + "loss": 1.3158, + "step": 9118 + }, + { + "epoch": 0.06, + "grad_norm": 4.779692251344268, + "learning_rate": 1.9988316784447704e-06, + "loss": 1.2858, + "step": 9119 + }, + { + "epoch": 0.06, + "grad_norm": 4.242701008375293, + "learning_rate": 1.9988314219603983e-06, + "loss": 1.3797, + "step": 9120 + }, + { + "epoch": 0.06, + "grad_norm": 10.034828223330901, + "learning_rate": 1.998831165447892e-06, + "loss": 1.2989, + "step": 9121 + }, + { + "epoch": 0.06, + "grad_norm": 4.673336156658645, + "learning_rate": 1.9988309089072525e-06, + "loss": 1.37, + "step": 9122 + }, + { + "epoch": 0.06, + "grad_norm": 4.247683852114555, + "learning_rate": 1.9988306523384786e-06, + "loss": 1.3382, + "step": 9123 + }, + { + "epoch": 0.06, + "grad_norm": 4.7928360311565905, + "learning_rate": 1.9988303957415715e-06, + "loss": 1.4073, + "step": 9124 + }, + { + "epoch": 0.06, + "grad_norm": 5.7822777633284845, + "learning_rate": 1.9988301391165305e-06, + "loss": 1.4012, + "step": 9125 + }, + { + "epoch": 0.06, + "eval_loss": 1.5588743686676025, + "eval_runtime": 4.5701, + "eval_samples_per_second": 1.969, + "eval_steps_per_second": 1.094, + "step": 9125 + }, + { + "epoch": 0.06, + "grad_norm": 4.226827420184808, + "learning_rate": 1.9988298824633554e-06, + "loss": 1.2245, + "step": 9126 + }, + { + "epoch": 0.06, + "grad_norm": 5.011844815348574, + "learning_rate": 1.9988296257820473e-06, + "loss": 1.34, + "step": 9127 + }, + { + "epoch": 0.06, + "grad_norm": 4.941820242282499, + "learning_rate": 1.998829369072605e-06, + "loss": 1.507, + "step": 9128 + }, + { + "epoch": 0.06, + "grad_norm": 4.600639367810583, + "learning_rate": 1.998829112335029e-06, + "loss": 1.3989, + "step": 9129 + }, + { + "epoch": 0.06, + "grad_norm": 4.406622869695, + "learning_rate": 1.998828855569319e-06, + "loss": 1.4553, + "step": 9130 + }, + { + "epoch": 0.06, + "grad_norm": 4.252996143827648, + "learning_rate": 1.9988285987754765e-06, + "loss": 1.3364, + "step": 9131 + }, + { + "epoch": 0.06, + "grad_norm": 4.821985360292999, + "learning_rate": 1.998828341953499e-06, + "loss": 1.3659, + "step": 9132 + }, + { + "epoch": 0.06, + "grad_norm": 6.094220456286474, + "learning_rate": 1.9988280851033884e-06, + "loss": 1.4298, + "step": 9133 + }, + { + "epoch": 0.06, + "grad_norm": 4.970492624145565, + "learning_rate": 1.9988278282251443e-06, + "loss": 1.3314, + "step": 9134 + }, + { + "epoch": 0.06, + "grad_norm": 4.597286782721286, + "learning_rate": 1.9988275713187664e-06, + "loss": 1.1891, + "step": 9135 + }, + { + "epoch": 0.06, + "grad_norm": 4.719326172064779, + "learning_rate": 1.998827314384255e-06, + "loss": 1.3526, + "step": 9136 + }, + { + "epoch": 0.06, + "grad_norm": 4.964344684563564, + "learning_rate": 1.9988270574216094e-06, + "loss": 1.3893, + "step": 9137 + }, + { + "epoch": 0.06, + "grad_norm": 5.048002248308627, + "learning_rate": 1.9988268004308307e-06, + "loss": 1.2759, + "step": 9138 + }, + { + "epoch": 0.06, + "grad_norm": 4.254350270877056, + "learning_rate": 1.998826543411918e-06, + "loss": 1.3175, + "step": 9139 + }, + { + "epoch": 0.06, + "grad_norm": 4.5864422760779275, + "learning_rate": 1.998826286364872e-06, + "loss": 1.44, + "step": 9140 + }, + { + "epoch": 0.06, + "grad_norm": 5.19006686791242, + "learning_rate": 1.998826029289692e-06, + "loss": 1.3203, + "step": 9141 + }, + { + "epoch": 0.06, + "grad_norm": 4.661070933704734, + "learning_rate": 1.9988257721863787e-06, + "loss": 1.2694, + "step": 9142 + }, + { + "epoch": 0.06, + "grad_norm": 4.813334077646508, + "learning_rate": 1.998825515054932e-06, + "loss": 1.4046, + "step": 9143 + }, + { + "epoch": 0.06, + "grad_norm": 4.449255562761521, + "learning_rate": 1.9988252578953514e-06, + "loss": 1.3647, + "step": 9144 + }, + { + "epoch": 0.06, + "grad_norm": 4.075167876070271, + "learning_rate": 1.998825000707637e-06, + "loss": 1.1863, + "step": 9145 + }, + { + "epoch": 0.06, + "grad_norm": 4.74127015718803, + "learning_rate": 1.9988247434917893e-06, + "loss": 1.1165, + "step": 9146 + }, + { + "epoch": 0.06, + "grad_norm": 6.840163155762476, + "learning_rate": 1.998824486247808e-06, + "loss": 1.4104, + "step": 9147 + }, + { + "epoch": 0.06, + "grad_norm": 5.102885647246682, + "learning_rate": 1.9988242289756935e-06, + "loss": 1.4741, + "step": 9148 + }, + { + "epoch": 0.06, + "grad_norm": 4.380287801595398, + "learning_rate": 1.998823971675445e-06, + "loss": 1.3417, + "step": 9149 + }, + { + "epoch": 0.06, + "grad_norm": 4.387236463016363, + "learning_rate": 1.998823714347063e-06, + "loss": 1.3466, + "step": 9150 + }, + { + "epoch": 0.06, + "grad_norm": 7.42239010672559, + "learning_rate": 1.9988234569905472e-06, + "loss": 1.2259, + "step": 9151 + }, + { + "epoch": 0.06, + "grad_norm": 4.523996935259947, + "learning_rate": 1.9988231996058986e-06, + "loss": 1.3438, + "step": 9152 + }, + { + "epoch": 0.06, + "grad_norm": 4.912178189540081, + "learning_rate": 1.998822942193116e-06, + "loss": 1.384, + "step": 9153 + }, + { + "epoch": 0.06, + "grad_norm": 4.646735695375992, + "learning_rate": 1.9988226847522e-06, + "loss": 1.3713, + "step": 9154 + }, + { + "epoch": 0.06, + "grad_norm": 4.220331488041842, + "learning_rate": 1.9988224272831504e-06, + "loss": 1.3364, + "step": 9155 + }, + { + "epoch": 0.06, + "grad_norm": 4.917934915293237, + "learning_rate": 1.998822169785967e-06, + "loss": 1.3533, + "step": 9156 + }, + { + "epoch": 0.06, + "grad_norm": 4.136267104770313, + "learning_rate": 1.9988219122606504e-06, + "loss": 1.2179, + "step": 9157 + }, + { + "epoch": 0.06, + "grad_norm": 4.418373349181749, + "learning_rate": 1.9988216547072003e-06, + "loss": 1.332, + "step": 9158 + }, + { + "epoch": 0.06, + "grad_norm": 4.359669714393945, + "learning_rate": 1.9988213971256165e-06, + "loss": 1.4237, + "step": 9159 + }, + { + "epoch": 0.06, + "grad_norm": 4.908105067327183, + "learning_rate": 1.9988211395158998e-06, + "loss": 1.4135, + "step": 9160 + }, + { + "epoch": 0.06, + "grad_norm": 4.318013319108017, + "learning_rate": 1.9988208818780492e-06, + "loss": 1.2633, + "step": 9161 + }, + { + "epoch": 0.06, + "grad_norm": 4.402614559006522, + "learning_rate": 1.9988206242120654e-06, + "loss": 1.4503, + "step": 9162 + }, + { + "epoch": 0.06, + "grad_norm": 5.2079967940421845, + "learning_rate": 1.9988203665179477e-06, + "loss": 1.4091, + "step": 9163 + }, + { + "epoch": 0.06, + "grad_norm": 4.734608864419093, + "learning_rate": 1.998820108795697e-06, + "loss": 1.2797, + "step": 9164 + }, + { + "epoch": 0.06, + "grad_norm": 4.54596989147798, + "learning_rate": 1.998819851045313e-06, + "loss": 1.2652, + "step": 9165 + }, + { + "epoch": 0.06, + "grad_norm": 4.965599831733548, + "learning_rate": 1.9988195932667947e-06, + "loss": 1.3534, + "step": 9166 + }, + { + "epoch": 0.06, + "grad_norm": 4.306252303362465, + "learning_rate": 1.9988193354601436e-06, + "loss": 1.3887, + "step": 9167 + }, + { + "epoch": 0.06, + "grad_norm": 4.586662322138531, + "learning_rate": 1.9988190776253592e-06, + "loss": 1.2433, + "step": 9168 + }, + { + "epoch": 0.06, + "grad_norm": 4.487606906380667, + "learning_rate": 1.998818819762441e-06, + "loss": 1.3448, + "step": 9169 + }, + { + "epoch": 0.06, + "grad_norm": 4.796842913016864, + "learning_rate": 1.99881856187139e-06, + "loss": 1.2425, + "step": 9170 + }, + { + "epoch": 0.06, + "grad_norm": 5.331051173433661, + "learning_rate": 1.998818303952205e-06, + "loss": 1.5691, + "step": 9171 + }, + { + "epoch": 0.06, + "grad_norm": 4.5032447554233155, + "learning_rate": 1.998818046004887e-06, + "loss": 1.2678, + "step": 9172 + }, + { + "epoch": 0.06, + "grad_norm": 4.328273220446713, + "learning_rate": 1.9988177880294352e-06, + "loss": 1.3201, + "step": 9173 + }, + { + "epoch": 0.06, + "grad_norm": 8.132203810399535, + "learning_rate": 1.9988175300258503e-06, + "loss": 1.1892, + "step": 9174 + }, + { + "epoch": 0.06, + "grad_norm": 4.426246765016404, + "learning_rate": 1.998817271994132e-06, + "loss": 1.3821, + "step": 9175 + }, + { + "epoch": 0.06, + "grad_norm": 4.647626125267229, + "learning_rate": 1.9988170139342803e-06, + "loss": 1.4799, + "step": 9176 + }, + { + "epoch": 0.06, + "grad_norm": 4.276910513814594, + "learning_rate": 1.9988167558462953e-06, + "loss": 1.1761, + "step": 9177 + }, + { + "epoch": 0.06, + "grad_norm": 5.0994784697641915, + "learning_rate": 1.9988164977301765e-06, + "loss": 1.2105, + "step": 9178 + }, + { + "epoch": 0.06, + "grad_norm": 5.392130693321368, + "learning_rate": 1.998816239585925e-06, + "loss": 1.5226, + "step": 9179 + }, + { + "epoch": 0.06, + "grad_norm": 4.29486562875608, + "learning_rate": 1.99881598141354e-06, + "loss": 1.5095, + "step": 9180 + }, + { + "epoch": 0.06, + "grad_norm": 5.661862846028829, + "learning_rate": 1.998815723213022e-06, + "loss": 1.1643, + "step": 9181 + }, + { + "epoch": 0.06, + "grad_norm": 4.600914266221847, + "learning_rate": 1.99881546498437e-06, + "loss": 1.3136, + "step": 9182 + }, + { + "epoch": 0.06, + "grad_norm": 4.9206532309919115, + "learning_rate": 1.998815206727585e-06, + "loss": 1.2937, + "step": 9183 + }, + { + "epoch": 0.06, + "grad_norm": 4.500316891522118, + "learning_rate": 1.9988149484426665e-06, + "loss": 1.3292, + "step": 9184 + }, + { + "epoch": 0.06, + "grad_norm": 4.654472328955603, + "learning_rate": 1.998814690129615e-06, + "loss": 1.2866, + "step": 9185 + }, + { + "epoch": 0.06, + "grad_norm": 4.260705343729918, + "learning_rate": 1.9988144317884303e-06, + "loss": 1.3803, + "step": 9186 + }, + { + "epoch": 0.06, + "grad_norm": 4.728087632517888, + "learning_rate": 1.9988141734191118e-06, + "loss": 1.4177, + "step": 9187 + }, + { + "epoch": 0.06, + "grad_norm": 4.412481904348956, + "learning_rate": 1.9988139150216603e-06, + "loss": 1.3198, + "step": 9188 + }, + { + "epoch": 0.06, + "grad_norm": 4.440521569612287, + "learning_rate": 1.998813656596076e-06, + "loss": 1.4435, + "step": 9189 + }, + { + "epoch": 0.06, + "grad_norm": 5.413933265413556, + "learning_rate": 1.998813398142358e-06, + "loss": 1.5202, + "step": 9190 + }, + { + "epoch": 0.06, + "grad_norm": 4.9382782805913745, + "learning_rate": 1.9988131396605067e-06, + "loss": 1.3668, + "step": 9191 + }, + { + "epoch": 0.06, + "grad_norm": 4.504718320545054, + "learning_rate": 1.9988128811505223e-06, + "loss": 1.4625, + "step": 9192 + }, + { + "epoch": 0.06, + "grad_norm": 4.56992680428897, + "learning_rate": 1.9988126226124045e-06, + "loss": 1.483, + "step": 9193 + }, + { + "epoch": 0.06, + "grad_norm": 4.983931505334358, + "learning_rate": 1.9988123640461538e-06, + "loss": 1.303, + "step": 9194 + }, + { + "epoch": 0.06, + "grad_norm": 4.061058860247226, + "learning_rate": 1.9988121054517697e-06, + "loss": 1.2034, + "step": 9195 + }, + { + "epoch": 0.06, + "grad_norm": 4.210479928823161, + "learning_rate": 1.9988118468292523e-06, + "loss": 1.3605, + "step": 9196 + }, + { + "epoch": 0.06, + "grad_norm": 4.580943998647816, + "learning_rate": 1.9988115881786015e-06, + "loss": 1.3324, + "step": 9197 + }, + { + "epoch": 0.06, + "grad_norm": 4.654744937300995, + "learning_rate": 1.9988113294998178e-06, + "loss": 1.3696, + "step": 9198 + }, + { + "epoch": 0.06, + "eval_loss": 1.5551848411560059, + "eval_runtime": 4.5741, + "eval_samples_per_second": 1.968, + "eval_steps_per_second": 1.093, + "step": 9198 + }, + { + "epoch": 0.06, + "grad_norm": 4.885414016752089, + "learning_rate": 1.9988110707929007e-06, + "loss": 1.3691, + "step": 9199 + }, + { + "epoch": 0.06, + "grad_norm": 4.473691951387402, + "learning_rate": 1.9988108120578507e-06, + "loss": 1.3502, + "step": 9200 + }, + { + "epoch": 0.06, + "grad_norm": 4.551769682896951, + "learning_rate": 1.9988105532946674e-06, + "loss": 1.2203, + "step": 9201 + }, + { + "epoch": 0.06, + "grad_norm": 4.263827357327292, + "learning_rate": 1.9988102945033507e-06, + "loss": 1.2445, + "step": 9202 + }, + { + "epoch": 0.06, + "grad_norm": 5.0858326278244395, + "learning_rate": 1.998810035683901e-06, + "loss": 1.3465, + "step": 9203 + }, + { + "epoch": 0.06, + "grad_norm": 4.669388133745964, + "learning_rate": 1.9988097768363185e-06, + "loss": 1.383, + "step": 9204 + }, + { + "epoch": 0.06, + "grad_norm": 4.650010881826172, + "learning_rate": 1.998809517960602e-06, + "loss": 1.3574, + "step": 9205 + }, + { + "epoch": 0.06, + "grad_norm": 4.089131123337762, + "learning_rate": 1.9988092590567534e-06, + "loss": 1.3197, + "step": 9206 + }, + { + "epoch": 0.06, + "grad_norm": 4.5883544020075115, + "learning_rate": 1.998809000124771e-06, + "loss": 1.3156, + "step": 9207 + }, + { + "epoch": 0.06, + "grad_norm": 3.9427521496043374, + "learning_rate": 1.9988087411646557e-06, + "loss": 1.1175, + "step": 9208 + }, + { + "epoch": 0.06, + "grad_norm": 4.971540794459495, + "learning_rate": 1.998808482176407e-06, + "loss": 1.1763, + "step": 9209 + }, + { + "epoch": 0.06, + "grad_norm": 5.250689400116675, + "learning_rate": 1.9988082231600254e-06, + "loss": 1.3241, + "step": 9210 + }, + { + "epoch": 0.06, + "grad_norm": 5.723022770698001, + "learning_rate": 1.9988079641155106e-06, + "loss": 1.453, + "step": 9211 + }, + { + "epoch": 0.06, + "grad_norm": 4.625770044710968, + "learning_rate": 1.998807705042863e-06, + "loss": 1.4092, + "step": 9212 + }, + { + "epoch": 0.06, + "grad_norm": 4.292208043151595, + "learning_rate": 1.998807445942082e-06, + "loss": 1.3142, + "step": 9213 + }, + { + "epoch": 0.06, + "grad_norm": 4.688632080400614, + "learning_rate": 1.998807186813168e-06, + "loss": 1.2623, + "step": 9214 + }, + { + "epoch": 0.06, + "grad_norm": 4.962966344888703, + "learning_rate": 1.998806927656121e-06, + "loss": 1.3753, + "step": 9215 + }, + { + "epoch": 0.06, + "grad_norm": 6.3746591523351, + "learning_rate": 1.998806668470941e-06, + "loss": 1.2383, + "step": 9216 + }, + { + "epoch": 0.06, + "grad_norm": 5.239264773042854, + "learning_rate": 1.9988064092576277e-06, + "loss": 1.3787, + "step": 9217 + }, + { + "epoch": 0.06, + "grad_norm": 4.67627214590265, + "learning_rate": 1.9988061500161816e-06, + "loss": 1.3105, + "step": 9218 + }, + { + "epoch": 0.06, + "grad_norm": 40.299816237909404, + "learning_rate": 1.998805890746602e-06, + "loss": 1.2077, + "step": 9219 + }, + { + "epoch": 0.06, + "grad_norm": 4.3000625923041, + "learning_rate": 1.99880563144889e-06, + "loss": 1.3591, + "step": 9220 + }, + { + "epoch": 0.06, + "grad_norm": 11.819164773059967, + "learning_rate": 1.9988053721230445e-06, + "loss": 1.4325, + "step": 9221 + }, + { + "epoch": 0.06, + "grad_norm": 6.602682277265542, + "learning_rate": 1.9988051127690663e-06, + "loss": 1.4859, + "step": 9222 + }, + { + "epoch": 0.06, + "grad_norm": 4.397121461783765, + "learning_rate": 1.9988048533869547e-06, + "loss": 1.3326, + "step": 9223 + }, + { + "epoch": 0.06, + "grad_norm": 6.223462950176888, + "learning_rate": 1.99880459397671e-06, + "loss": 1.4692, + "step": 9224 + }, + { + "epoch": 0.06, + "grad_norm": 4.685747149224842, + "learning_rate": 1.9988043345383327e-06, + "loss": 1.5234, + "step": 9225 + }, + { + "epoch": 0.06, + "grad_norm": 4.567393106628273, + "learning_rate": 1.9988040750718223e-06, + "loss": 1.2359, + "step": 9226 + }, + { + "epoch": 0.06, + "grad_norm": 4.805400839191366, + "learning_rate": 1.998803815577179e-06, + "loss": 1.3772, + "step": 9227 + }, + { + "epoch": 0.06, + "grad_norm": 4.3623309908031525, + "learning_rate": 1.998803556054403e-06, + "loss": 1.3121, + "step": 9228 + }, + { + "epoch": 0.06, + "grad_norm": 4.667877487910693, + "learning_rate": 1.9988032965034932e-06, + "loss": 1.1697, + "step": 9229 + }, + { + "epoch": 0.06, + "grad_norm": 5.152682760918998, + "learning_rate": 1.998803036924451e-06, + "loss": 1.3827, + "step": 9230 + }, + { + "epoch": 0.06, + "grad_norm": 4.308201467006308, + "learning_rate": 1.9988027773172757e-06, + "loss": 1.3683, + "step": 9231 + }, + { + "epoch": 0.06, + "grad_norm": 4.6805524903343265, + "learning_rate": 1.9988025176819674e-06, + "loss": 1.4236, + "step": 9232 + }, + { + "epoch": 0.06, + "grad_norm": 4.643916265722326, + "learning_rate": 1.9988022580185265e-06, + "loss": 1.254, + "step": 9233 + }, + { + "epoch": 0.06, + "grad_norm": 4.909438057129668, + "learning_rate": 1.9988019983269523e-06, + "loss": 1.3937, + "step": 9234 + }, + { + "epoch": 0.06, + "grad_norm": 4.300536284251366, + "learning_rate": 1.998801738607245e-06, + "loss": 1.2756, + "step": 9235 + }, + { + "epoch": 0.06, + "grad_norm": 4.2915941448670365, + "learning_rate": 1.998801478859405e-06, + "loss": 1.3561, + "step": 9236 + }, + { + "epoch": 0.06, + "grad_norm": 4.919475393069577, + "learning_rate": 1.9988012190834324e-06, + "loss": 1.5328, + "step": 9237 + }, + { + "epoch": 0.06, + "grad_norm": 6.211193517788841, + "learning_rate": 1.9988009592793265e-06, + "loss": 1.1965, + "step": 9238 + }, + { + "epoch": 0.06, + "grad_norm": 4.694982072556466, + "learning_rate": 1.998800699447088e-06, + "loss": 1.4447, + "step": 9239 + }, + { + "epoch": 0.06, + "grad_norm": 4.50952391959699, + "learning_rate": 1.9988004395867163e-06, + "loss": 1.1954, + "step": 9240 + }, + { + "epoch": 0.06, + "grad_norm": 4.192278098395553, + "learning_rate": 1.9988001796982115e-06, + "loss": 1.3435, + "step": 9241 + }, + { + "epoch": 0.06, + "grad_norm": 7.456890300219677, + "learning_rate": 1.9987999197815743e-06, + "loss": 1.3874, + "step": 9242 + }, + { + "epoch": 0.06, + "grad_norm": 4.43883051682153, + "learning_rate": 1.998799659836804e-06, + "loss": 1.3825, + "step": 9243 + }, + { + "epoch": 0.06, + "grad_norm": 3.9919724656906244, + "learning_rate": 1.998799399863901e-06, + "loss": 1.2054, + "step": 9244 + }, + { + "epoch": 0.06, + "grad_norm": 5.415586743663469, + "learning_rate": 1.998799139862865e-06, + "loss": 1.3388, + "step": 9245 + }, + { + "epoch": 0.06, + "grad_norm": 4.737787258758223, + "learning_rate": 1.9987988798336965e-06, + "loss": 1.2581, + "step": 9246 + }, + { + "epoch": 0.06, + "grad_norm": 4.460589872603003, + "learning_rate": 1.9987986197763946e-06, + "loss": 1.3994, + "step": 9247 + }, + { + "epoch": 0.06, + "grad_norm": 4.730783176442161, + "learning_rate": 1.9987983596909603e-06, + "loss": 1.2868, + "step": 9248 + }, + { + "epoch": 0.06, + "grad_norm": 4.52809047493597, + "learning_rate": 1.998798099577393e-06, + "loss": 1.349, + "step": 9249 + }, + { + "epoch": 0.06, + "grad_norm": 4.251415073060551, + "learning_rate": 1.9987978394356927e-06, + "loss": 1.2302, + "step": 9250 + }, + { + "epoch": 0.06, + "grad_norm": 4.5145178325005375, + "learning_rate": 1.99879757926586e-06, + "loss": 1.3751, + "step": 9251 + }, + { + "epoch": 0.06, + "grad_norm": 4.618141550647926, + "learning_rate": 1.998797319067894e-06, + "loss": 1.2455, + "step": 9252 + }, + { + "epoch": 0.06, + "grad_norm": 9.249325877496508, + "learning_rate": 1.9987970588417957e-06, + "loss": 1.3605, + "step": 9253 + }, + { + "epoch": 0.06, + "grad_norm": 5.02226422494594, + "learning_rate": 1.998796798587564e-06, + "loss": 1.4115, + "step": 9254 + }, + { + "epoch": 0.06, + "grad_norm": 4.4558576164359085, + "learning_rate": 1.9987965383052e-06, + "loss": 1.4009, + "step": 9255 + }, + { + "epoch": 0.06, + "grad_norm": 4.6185799596053245, + "learning_rate": 1.998796277994703e-06, + "loss": 1.3943, + "step": 9256 + }, + { + "epoch": 0.06, + "grad_norm": 4.047559855515776, + "learning_rate": 1.9987960176560733e-06, + "loss": 1.2431, + "step": 9257 + }, + { + "epoch": 0.06, + "grad_norm": 4.591163392316811, + "learning_rate": 1.998795757289311e-06, + "loss": 1.4109, + "step": 9258 + }, + { + "epoch": 0.06, + "grad_norm": 4.225018282293772, + "learning_rate": 1.998795496894416e-06, + "loss": 1.2567, + "step": 9259 + }, + { + "epoch": 0.06, + "grad_norm": 4.317149360129581, + "learning_rate": 1.998795236471388e-06, + "loss": 1.3953, + "step": 9260 + }, + { + "epoch": 0.06, + "grad_norm": 6.7634580294834015, + "learning_rate": 1.9987949760202274e-06, + "loss": 1.5209, + "step": 9261 + }, + { + "epoch": 0.06, + "grad_norm": 5.482034080478455, + "learning_rate": 1.998794715540934e-06, + "loss": 1.3214, + "step": 9262 + }, + { + "epoch": 0.06, + "grad_norm": 4.085856425355162, + "learning_rate": 1.998794455033508e-06, + "loss": 1.2051, + "step": 9263 + }, + { + "epoch": 0.06, + "grad_norm": 4.412689106105339, + "learning_rate": 1.998794194497949e-06, + "loss": 1.3565, + "step": 9264 + }, + { + "epoch": 0.06, + "grad_norm": 5.177331083369055, + "learning_rate": 1.9987939339342576e-06, + "loss": 1.3637, + "step": 9265 + }, + { + "epoch": 0.06, + "grad_norm": 4.836459468080875, + "learning_rate": 1.9987936733424335e-06, + "loss": 1.2617, + "step": 9266 + }, + { + "epoch": 0.06, + "grad_norm": 4.503984940561418, + "learning_rate": 1.998793412722476e-06, + "loss": 1.2991, + "step": 9267 + }, + { + "epoch": 0.06, + "grad_norm": 4.57715649412643, + "learning_rate": 1.9987931520743864e-06, + "loss": 1.3679, + "step": 9268 + }, + { + "epoch": 0.06, + "grad_norm": 4.365446337498259, + "learning_rate": 1.9987928913981643e-06, + "loss": 1.3526, + "step": 9269 + }, + { + "epoch": 0.06, + "grad_norm": 4.588375265317352, + "learning_rate": 1.9987926306938093e-06, + "loss": 1.4596, + "step": 9270 + }, + { + "epoch": 0.06, + "grad_norm": 6.615517229308341, + "learning_rate": 1.998792369961322e-06, + "loss": 1.4156, + "step": 9271 + }, + { + "epoch": 0.06, + "eval_loss": 1.554495096206665, + "eval_runtime": 4.6064, + "eval_samples_per_second": 1.954, + "eval_steps_per_second": 1.085, + "step": 9271 + }, + { + "epoch": 0.06, + "grad_norm": 4.472883395236333, + "learning_rate": 1.9987921092007014e-06, + "loss": 1.3725, + "step": 9272 + }, + { + "epoch": 0.06, + "grad_norm": 5.4377656112995085, + "learning_rate": 1.9987918484119484e-06, + "loss": 1.3698, + "step": 9273 + }, + { + "epoch": 0.06, + "grad_norm": 5.435964818923897, + "learning_rate": 1.998791587595063e-06, + "loss": 1.5279, + "step": 9274 + }, + { + "epoch": 0.06, + "grad_norm": 4.4465383279021164, + "learning_rate": 1.9987913267500446e-06, + "loss": 1.525, + "step": 9275 + }, + { + "epoch": 0.06, + "grad_norm": 4.775829844342686, + "learning_rate": 1.9987910658768937e-06, + "loss": 1.4982, + "step": 9276 + }, + { + "epoch": 0.06, + "grad_norm": 4.488655788858193, + "learning_rate": 1.9987908049756103e-06, + "loss": 1.4157, + "step": 9277 + }, + { + "epoch": 0.06, + "grad_norm": 4.184203121501775, + "learning_rate": 1.998790544046194e-06, + "loss": 1.2199, + "step": 9278 + }, + { + "epoch": 0.06, + "grad_norm": 4.661767676614311, + "learning_rate": 1.9987902830886452e-06, + "loss": 1.4224, + "step": 9279 + }, + { + "epoch": 0.06, + "grad_norm": 5.395610188181745, + "learning_rate": 1.998790022102964e-06, + "loss": 1.2944, + "step": 9280 + }, + { + "epoch": 0.06, + "grad_norm": 14.70008307969552, + "learning_rate": 1.99878976108915e-06, + "loss": 1.3301, + "step": 9281 + }, + { + "epoch": 0.06, + "grad_norm": 4.717504790595443, + "learning_rate": 1.9987895000472033e-06, + "loss": 1.4021, + "step": 9282 + }, + { + "epoch": 0.06, + "grad_norm": 5.0920116594945215, + "learning_rate": 1.9987892389771245e-06, + "loss": 1.464, + "step": 9283 + }, + { + "epoch": 0.06, + "grad_norm": 4.420083599275562, + "learning_rate": 1.9987889778789127e-06, + "loss": 1.3963, + "step": 9284 + }, + { + "epoch": 0.06, + "grad_norm": 4.65791395587469, + "learning_rate": 1.9987887167525684e-06, + "loss": 1.3724, + "step": 9285 + }, + { + "epoch": 0.06, + "grad_norm": 4.956681883190954, + "learning_rate": 1.9987884555980916e-06, + "loss": 1.3088, + "step": 9286 + }, + { + "epoch": 0.06, + "grad_norm": 4.632885623210763, + "learning_rate": 1.9987881944154823e-06, + "loss": 1.3204, + "step": 9287 + }, + { + "epoch": 0.06, + "grad_norm": 4.745986269835675, + "learning_rate": 1.9987879332047406e-06, + "loss": 1.4684, + "step": 9288 + }, + { + "epoch": 0.06, + "grad_norm": 5.832174684013402, + "learning_rate": 1.9987876719658663e-06, + "loss": 1.2743, + "step": 9289 + }, + { + "epoch": 0.06, + "grad_norm": 4.40543614112395, + "learning_rate": 1.9987874106988595e-06, + "loss": 1.4756, + "step": 9290 + }, + { + "epoch": 0.06, + "grad_norm": 4.6797750413247545, + "learning_rate": 1.99878714940372e-06, + "loss": 1.3292, + "step": 9291 + }, + { + "epoch": 0.06, + "grad_norm": 5.183811398343307, + "learning_rate": 1.9987868880804483e-06, + "loss": 1.4219, + "step": 9292 + }, + { + "epoch": 0.06, + "grad_norm": 4.651033397706635, + "learning_rate": 1.9987866267290436e-06, + "loss": 1.3912, + "step": 9293 + }, + { + "epoch": 0.06, + "grad_norm": 4.762988933329257, + "learning_rate": 1.9987863653495064e-06, + "loss": 1.2451, + "step": 9294 + }, + { + "epoch": 0.06, + "grad_norm": 4.736059905112012, + "learning_rate": 1.998786103941837e-06, + "loss": 1.497, + "step": 9295 + }, + { + "epoch": 0.06, + "grad_norm": 5.765904543506734, + "learning_rate": 1.998785842506035e-06, + "loss": 1.6876, + "step": 9296 + }, + { + "epoch": 0.06, + "grad_norm": 4.592128130821391, + "learning_rate": 1.998785581042101e-06, + "loss": 1.3138, + "step": 9297 + }, + { + "epoch": 0.06, + "grad_norm": 4.70491513327742, + "learning_rate": 1.998785319550034e-06, + "loss": 1.399, + "step": 9298 + }, + { + "epoch": 0.06, + "grad_norm": 4.267025604843614, + "learning_rate": 1.9987850580298347e-06, + "loss": 1.2901, + "step": 9299 + }, + { + "epoch": 0.06, + "grad_norm": 7.339875503217822, + "learning_rate": 1.998784796481503e-06, + "loss": 1.1524, + "step": 9300 + }, + { + "epoch": 0.06, + "grad_norm": 4.949563078877494, + "learning_rate": 1.9987845349050384e-06, + "loss": 1.2128, + "step": 9301 + }, + { + "epoch": 0.06, + "grad_norm": 5.396387899021429, + "learning_rate": 1.998784273300442e-06, + "loss": 1.3408, + "step": 9302 + }, + { + "epoch": 0.06, + "grad_norm": 6.347357519236312, + "learning_rate": 1.998784011667713e-06, + "loss": 1.3769, + "step": 9303 + }, + { + "epoch": 0.06, + "grad_norm": 4.649649010355445, + "learning_rate": 1.9987837500068516e-06, + "loss": 1.4515, + "step": 9304 + }, + { + "epoch": 0.06, + "grad_norm": 4.58513396644983, + "learning_rate": 1.9987834883178576e-06, + "loss": 1.3744, + "step": 9305 + }, + { + "epoch": 0.06, + "grad_norm": 5.166731555818931, + "learning_rate": 1.9987832266007308e-06, + "loss": 1.4787, + "step": 9306 + }, + { + "epoch": 0.06, + "grad_norm": 4.283407057774179, + "learning_rate": 1.9987829648554722e-06, + "loss": 1.268, + "step": 9307 + }, + { + "epoch": 0.06, + "grad_norm": 4.48034430599568, + "learning_rate": 1.998782703082081e-06, + "loss": 1.4802, + "step": 9308 + }, + { + "epoch": 0.06, + "grad_norm": 4.392416470892556, + "learning_rate": 1.9987824412805576e-06, + "loss": 1.2899, + "step": 9309 + }, + { + "epoch": 0.06, + "grad_norm": 4.38167239713809, + "learning_rate": 1.998782179450902e-06, + "loss": 1.3048, + "step": 9310 + }, + { + "epoch": 0.06, + "grad_norm": 4.795089895853581, + "learning_rate": 1.9987819175931134e-06, + "loss": 1.4448, + "step": 9311 + }, + { + "epoch": 0.06, + "grad_norm": 4.7341605878517505, + "learning_rate": 1.998781655707193e-06, + "loss": 1.2488, + "step": 9312 + }, + { + "epoch": 0.06, + "grad_norm": 5.858909377611416, + "learning_rate": 1.99878139379314e-06, + "loss": 1.3509, + "step": 9313 + }, + { + "epoch": 0.06, + "grad_norm": 4.901591635443287, + "learning_rate": 1.998781131850955e-06, + "loss": 1.3949, + "step": 9314 + }, + { + "epoch": 0.06, + "grad_norm": 5.40303957806761, + "learning_rate": 1.9987808698806366e-06, + "loss": 1.5304, + "step": 9315 + }, + { + "epoch": 0.06, + "grad_norm": 4.404420742355614, + "learning_rate": 1.998780607882187e-06, + "loss": 1.3839, + "step": 9316 + }, + { + "epoch": 0.06, + "grad_norm": 6.471627541955831, + "learning_rate": 1.9987803458556045e-06, + "loss": 1.5146, + "step": 9317 + }, + { + "epoch": 0.06, + "grad_norm": 4.704646072489282, + "learning_rate": 1.9987800838008897e-06, + "loss": 1.3695, + "step": 9318 + }, + { + "epoch": 0.06, + "grad_norm": 4.557454953695969, + "learning_rate": 1.9987798217180427e-06, + "loss": 1.2422, + "step": 9319 + }, + { + "epoch": 0.06, + "grad_norm": 4.476143796087004, + "learning_rate": 1.9987795596070637e-06, + "loss": 1.3398, + "step": 9320 + }, + { + "epoch": 0.06, + "grad_norm": 4.861996942887446, + "learning_rate": 1.998779297467952e-06, + "loss": 1.3641, + "step": 9321 + }, + { + "epoch": 0.06, + "grad_norm": 7.56123616641868, + "learning_rate": 1.998779035300708e-06, + "loss": 1.2767, + "step": 9322 + }, + { + "epoch": 0.06, + "grad_norm": 4.808672507983671, + "learning_rate": 1.9987787731053317e-06, + "loss": 1.285, + "step": 9323 + }, + { + "epoch": 0.06, + "grad_norm": 4.185642482271401, + "learning_rate": 1.9987785108818236e-06, + "loss": 1.2857, + "step": 9324 + }, + { + "epoch": 0.06, + "grad_norm": 6.203206204004206, + "learning_rate": 1.998778248630183e-06, + "loss": 1.3522, + "step": 9325 + }, + { + "epoch": 0.06, + "grad_norm": 4.276307882865296, + "learning_rate": 1.9987779863504097e-06, + "loss": 1.3013, + "step": 9326 + }, + { + "epoch": 0.06, + "grad_norm": 5.399071763911135, + "learning_rate": 1.9987777240425044e-06, + "loss": 1.4096, + "step": 9327 + }, + { + "epoch": 0.06, + "grad_norm": 4.299485897763391, + "learning_rate": 1.998777461706467e-06, + "loss": 1.2972, + "step": 9328 + }, + { + "epoch": 0.06, + "grad_norm": 4.976409578703816, + "learning_rate": 1.9987771993422972e-06, + "loss": 1.4363, + "step": 9329 + }, + { + "epoch": 0.06, + "grad_norm": 4.568935534913313, + "learning_rate": 1.9987769369499953e-06, + "loss": 1.2764, + "step": 9330 + }, + { + "epoch": 0.06, + "grad_norm": 6.503288948265216, + "learning_rate": 1.9987766745295613e-06, + "loss": 1.2364, + "step": 9331 + }, + { + "epoch": 0.06, + "grad_norm": 4.256260136957411, + "learning_rate": 1.998776412080995e-06, + "loss": 1.3106, + "step": 9332 + }, + { + "epoch": 0.06, + "grad_norm": 4.199000698707674, + "learning_rate": 1.998776149604296e-06, + "loss": 1.3945, + "step": 9333 + }, + { + "epoch": 0.06, + "grad_norm": 4.048167747518878, + "learning_rate": 1.9987758870994655e-06, + "loss": 1.3643, + "step": 9334 + }, + { + "epoch": 0.06, + "grad_norm": 4.560653833986124, + "learning_rate": 1.9987756245665027e-06, + "loss": 1.2131, + "step": 9335 + }, + { + "epoch": 0.06, + "grad_norm": 4.734914641753475, + "learning_rate": 1.9987753620054075e-06, + "loss": 1.2232, + "step": 9336 + }, + { + "epoch": 0.06, + "grad_norm": 5.346564245683493, + "learning_rate": 1.9987750994161797e-06, + "loss": 1.332, + "step": 9337 + }, + { + "epoch": 0.06, + "grad_norm": 4.766509990122271, + "learning_rate": 1.9987748367988203e-06, + "loss": 1.4455, + "step": 9338 + }, + { + "epoch": 0.06, + "grad_norm": 4.537614482329693, + "learning_rate": 1.9987745741533288e-06, + "loss": 1.4035, + "step": 9339 + }, + { + "epoch": 0.06, + "grad_norm": 4.535923577308804, + "learning_rate": 1.9987743114797047e-06, + "loss": 1.3578, + "step": 9340 + }, + { + "epoch": 0.06, + "grad_norm": 5.66711209337483, + "learning_rate": 1.9987740487779486e-06, + "loss": 1.3531, + "step": 9341 + }, + { + "epoch": 0.06, + "grad_norm": 5.157752259967078, + "learning_rate": 1.998773786048061e-06, + "loss": 1.2807, + "step": 9342 + }, + { + "epoch": 0.06, + "grad_norm": 4.580449970801107, + "learning_rate": 1.9987735232900406e-06, + "loss": 1.4835, + "step": 9343 + }, + { + "epoch": 0.06, + "grad_norm": 5.041308807278067, + "learning_rate": 1.998773260503888e-06, + "loss": 1.507, + "step": 9344 + }, + { + "epoch": 0.06, + "eval_loss": 1.5564230680465698, + "eval_runtime": 4.5977, + "eval_samples_per_second": 1.957, + "eval_steps_per_second": 1.087, + "step": 9344 + }, + { + "epoch": 0.06, + "grad_norm": 4.651619444032944, + "learning_rate": 1.9987729976896034e-06, + "loss": 1.3943, + "step": 9345 + }, + { + "epoch": 0.06, + "grad_norm": 4.919107611574725, + "learning_rate": 1.998772734847187e-06, + "loss": 1.388, + "step": 9346 + }, + { + "epoch": 0.06, + "grad_norm": 3.9307381434164723, + "learning_rate": 1.9987724719766383e-06, + "loss": 1.1384, + "step": 9347 + }, + { + "epoch": 0.06, + "grad_norm": 4.489358723978206, + "learning_rate": 1.9987722090779572e-06, + "loss": 1.2868, + "step": 9348 + }, + { + "epoch": 0.06, + "grad_norm": 4.3874365251733805, + "learning_rate": 1.9987719461511445e-06, + "loss": 1.173, + "step": 9349 + }, + { + "epoch": 0.06, + "grad_norm": 4.206845871964003, + "learning_rate": 1.998771683196199e-06, + "loss": 1.015, + "step": 9350 + }, + { + "epoch": 0.06, + "grad_norm": 4.765176521714314, + "learning_rate": 1.9987714202131223e-06, + "loss": 1.4216, + "step": 9351 + }, + { + "epoch": 0.06, + "grad_norm": 6.397277861698614, + "learning_rate": 1.9987711572019133e-06, + "loss": 1.3599, + "step": 9352 + }, + { + "epoch": 0.06, + "grad_norm": 4.554091947437095, + "learning_rate": 1.9987708941625718e-06, + "loss": 1.2454, + "step": 9353 + }, + { + "epoch": 0.06, + "grad_norm": 4.6806296300669645, + "learning_rate": 1.9987706310950986e-06, + "loss": 1.3785, + "step": 9354 + }, + { + "epoch": 0.06, + "grad_norm": 16.765534803659744, + "learning_rate": 1.9987703679994934e-06, + "loss": 1.3085, + "step": 9355 + }, + { + "epoch": 0.06, + "grad_norm": 4.245376816942007, + "learning_rate": 1.998770104875756e-06, + "loss": 1.2533, + "step": 9356 + }, + { + "epoch": 0.06, + "grad_norm": 5.284808208838013, + "learning_rate": 1.9987698417238866e-06, + "loss": 1.3031, + "step": 9357 + }, + { + "epoch": 0.06, + "grad_norm": 5.106723034892174, + "learning_rate": 1.998769578543885e-06, + "loss": 1.3921, + "step": 9358 + }, + { + "epoch": 0.06, + "grad_norm": 4.625868763169571, + "learning_rate": 1.9987693153357516e-06, + "loss": 1.3934, + "step": 9359 + }, + { + "epoch": 0.06, + "grad_norm": 4.4624830709939385, + "learning_rate": 1.9987690520994863e-06, + "loss": 1.3563, + "step": 9360 + }, + { + "epoch": 0.06, + "grad_norm": 4.39807846365891, + "learning_rate": 1.9987687888350886e-06, + "loss": 1.3282, + "step": 9361 + }, + { + "epoch": 0.06, + "grad_norm": 4.529497524903928, + "learning_rate": 1.9987685255425592e-06, + "loss": 1.4251, + "step": 9362 + }, + { + "epoch": 0.06, + "grad_norm": 4.357869001320897, + "learning_rate": 1.998768262221898e-06, + "loss": 1.3545, + "step": 9363 + }, + { + "epoch": 0.06, + "grad_norm": 4.5683256561702725, + "learning_rate": 1.9987679988731046e-06, + "loss": 1.3858, + "step": 9364 + }, + { + "epoch": 0.06, + "grad_norm": 5.2895249000173985, + "learning_rate": 1.998767735496179e-06, + "loss": 1.4175, + "step": 9365 + }, + { + "epoch": 0.06, + "grad_norm": 4.654215484010466, + "learning_rate": 1.9987674720911217e-06, + "loss": 1.4406, + "step": 9366 + }, + { + "epoch": 0.06, + "grad_norm": 5.291279222863882, + "learning_rate": 1.9987672086579327e-06, + "loss": 1.3936, + "step": 9367 + }, + { + "epoch": 0.06, + "grad_norm": 4.839411000145247, + "learning_rate": 1.9987669451966113e-06, + "loss": 1.4873, + "step": 9368 + }, + { + "epoch": 0.06, + "grad_norm": 7.153951594412018, + "learning_rate": 1.998766681707158e-06, + "loss": 1.2487, + "step": 9369 + }, + { + "epoch": 0.06, + "grad_norm": 4.764295742363618, + "learning_rate": 1.998766418189573e-06, + "loss": 1.4439, + "step": 9370 + }, + { + "epoch": 0.06, + "grad_norm": 4.750167490834732, + "learning_rate": 1.9987661546438557e-06, + "loss": 1.4094, + "step": 9371 + }, + { + "epoch": 0.06, + "grad_norm": 4.505501703856124, + "learning_rate": 1.9987658910700067e-06, + "loss": 1.1499, + "step": 9372 + }, + { + "epoch": 0.06, + "grad_norm": 4.179037053362475, + "learning_rate": 1.998765627468026e-06, + "loss": 1.1673, + "step": 9373 + }, + { + "epoch": 0.06, + "grad_norm": 4.817171723995393, + "learning_rate": 1.998765363837913e-06, + "loss": 1.3536, + "step": 9374 + }, + { + "epoch": 0.06, + "grad_norm": 4.300571417546668, + "learning_rate": 1.9987651001796682e-06, + "loss": 1.3439, + "step": 9375 + }, + { + "epoch": 0.06, + "grad_norm": 4.4718558066022425, + "learning_rate": 1.998764836493292e-06, + "loss": 1.4057, + "step": 9376 + }, + { + "epoch": 0.06, + "grad_norm": 4.501448963846782, + "learning_rate": 1.9987645727787833e-06, + "loss": 1.3458, + "step": 9377 + }, + { + "epoch": 0.06, + "grad_norm": 5.110078065693422, + "learning_rate": 1.998764309036143e-06, + "loss": 1.402, + "step": 9378 + }, + { + "epoch": 0.06, + "grad_norm": 4.188935065097337, + "learning_rate": 1.998764045265371e-06, + "loss": 1.1881, + "step": 9379 + }, + { + "epoch": 0.06, + "grad_norm": 4.334402988353024, + "learning_rate": 1.9987637814664666e-06, + "loss": 1.4239, + "step": 9380 + }, + { + "epoch": 0.06, + "grad_norm": 4.890979438566899, + "learning_rate": 1.9987635176394306e-06, + "loss": 1.5492, + "step": 9381 + }, + { + "epoch": 0.06, + "grad_norm": 5.377491515086609, + "learning_rate": 1.998763253784263e-06, + "loss": 1.1524, + "step": 9382 + }, + { + "epoch": 0.06, + "grad_norm": 4.476054373815102, + "learning_rate": 1.9987629899009637e-06, + "loss": 1.4062, + "step": 9383 + }, + { + "epoch": 0.06, + "grad_norm": 4.260747832926185, + "learning_rate": 1.998762725989532e-06, + "loss": 1.2834, + "step": 9384 + }, + { + "epoch": 0.06, + "grad_norm": 4.816625609447784, + "learning_rate": 1.9987624620499684e-06, + "loss": 1.3921, + "step": 9385 + }, + { + "epoch": 0.06, + "grad_norm": 6.091096415979983, + "learning_rate": 1.9987621980822737e-06, + "loss": 1.3007, + "step": 9386 + }, + { + "epoch": 0.06, + "grad_norm": 4.425480951782516, + "learning_rate": 1.998761934086447e-06, + "loss": 1.4246, + "step": 9387 + }, + { + "epoch": 0.06, + "grad_norm": 7.411021803943703, + "learning_rate": 1.998761670062488e-06, + "loss": 1.2711, + "step": 9388 + }, + { + "epoch": 0.06, + "grad_norm": 4.564211915096558, + "learning_rate": 1.9987614060103975e-06, + "loss": 1.3117, + "step": 9389 + }, + { + "epoch": 0.06, + "grad_norm": 6.271459895443692, + "learning_rate": 1.998761141930175e-06, + "loss": 1.3914, + "step": 9390 + }, + { + "epoch": 0.06, + "grad_norm": 4.42397509417245, + "learning_rate": 1.998760877821821e-06, + "loss": 1.3259, + "step": 9391 + }, + { + "epoch": 0.06, + "grad_norm": 4.472042016889664, + "learning_rate": 1.998760613685335e-06, + "loss": 1.2991, + "step": 9392 + }, + { + "epoch": 0.06, + "grad_norm": 4.631671171162598, + "learning_rate": 1.9987603495207176e-06, + "loss": 1.4603, + "step": 9393 + }, + { + "epoch": 0.06, + "grad_norm": 5.126982568006298, + "learning_rate": 1.9987600853279684e-06, + "loss": 1.3015, + "step": 9394 + }, + { + "epoch": 0.06, + "grad_norm": 4.35775755410097, + "learning_rate": 1.9987598211070875e-06, + "loss": 1.2212, + "step": 9395 + }, + { + "epoch": 0.06, + "grad_norm": 4.376302506254838, + "learning_rate": 1.9987595568580745e-06, + "loss": 1.2954, + "step": 9396 + }, + { + "epoch": 0.06, + "grad_norm": 4.22920181372995, + "learning_rate": 1.99875929258093e-06, + "loss": 1.2735, + "step": 9397 + }, + { + "epoch": 0.06, + "grad_norm": 4.353132731433254, + "learning_rate": 1.9987590282756536e-06, + "loss": 1.3804, + "step": 9398 + }, + { + "epoch": 0.06, + "grad_norm": 6.090285023597953, + "learning_rate": 1.9987587639422452e-06, + "loss": 1.2131, + "step": 9399 + }, + { + "epoch": 0.06, + "grad_norm": 4.647817003881624, + "learning_rate": 1.9987584995807056e-06, + "loss": 1.3534, + "step": 9400 + }, + { + "epoch": 0.06, + "grad_norm": 4.023506578973363, + "learning_rate": 1.9987582351910343e-06, + "loss": 1.2525, + "step": 9401 + }, + { + "epoch": 0.06, + "grad_norm": 4.723006433047982, + "learning_rate": 1.998757970773231e-06, + "loss": 1.4166, + "step": 9402 + }, + { + "epoch": 0.06, + "grad_norm": 6.0269702562074094, + "learning_rate": 1.9987577063272964e-06, + "loss": 1.432, + "step": 9403 + }, + { + "epoch": 0.06, + "grad_norm": 5.266689423088489, + "learning_rate": 1.9987574418532298e-06, + "loss": 1.325, + "step": 9404 + }, + { + "epoch": 0.06, + "grad_norm": 8.181221368989679, + "learning_rate": 1.9987571773510314e-06, + "loss": 1.3802, + "step": 9405 + }, + { + "epoch": 0.06, + "grad_norm": 4.388891468179163, + "learning_rate": 1.998756912820702e-06, + "loss": 1.3755, + "step": 9406 + }, + { + "epoch": 0.06, + "grad_norm": 5.487684122863875, + "learning_rate": 1.99875664826224e-06, + "loss": 1.3536, + "step": 9407 + }, + { + "epoch": 0.06, + "grad_norm": 4.548071851503697, + "learning_rate": 1.998756383675647e-06, + "loss": 1.3414, + "step": 9408 + }, + { + "epoch": 0.06, + "grad_norm": 4.224147895397433, + "learning_rate": 1.998756119060922e-06, + "loss": 1.2701, + "step": 9409 + }, + { + "epoch": 0.06, + "grad_norm": 5.370676813455007, + "learning_rate": 1.998755854418066e-06, + "loss": 1.4123, + "step": 9410 + }, + { + "epoch": 0.06, + "grad_norm": 3.9779755980831815, + "learning_rate": 1.9987555897470775e-06, + "loss": 1.2393, + "step": 9411 + }, + { + "epoch": 0.06, + "grad_norm": 4.2740192236856105, + "learning_rate": 1.998755325047958e-06, + "loss": 1.4512, + "step": 9412 + }, + { + "epoch": 0.06, + "grad_norm": 5.815177989160682, + "learning_rate": 1.9987550603207064e-06, + "loss": 1.6383, + "step": 9413 + }, + { + "epoch": 0.06, + "grad_norm": 4.6404335981797775, + "learning_rate": 1.9987547955653236e-06, + "loss": 1.3385, + "step": 9414 + }, + { + "epoch": 0.06, + "grad_norm": 4.3188661188951984, + "learning_rate": 1.998754530781809e-06, + "loss": 1.4059, + "step": 9415 + }, + { + "epoch": 0.06, + "grad_norm": 4.450171886637124, + "learning_rate": 1.998754265970163e-06, + "loss": 1.2894, + "step": 9416 + }, + { + "epoch": 0.06, + "grad_norm": 4.397497627251262, + "learning_rate": 1.998754001130385e-06, + "loss": 1.289, + "step": 9417 + }, + { + "epoch": 0.06, + "eval_loss": 1.5530638694763184, + "eval_runtime": 4.5926, + "eval_samples_per_second": 1.96, + "eval_steps_per_second": 1.089, + "step": 9417 } ], "logging_steps": 1, @@ -54517,7 +66973,7 @@ "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 73, - "total_flos": 802421357936640.0, + "total_flos": 985811763855360.0, "train_batch_size": 1, "trial_name": null, "trial_params": null