{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.976, "eval_steps": 125, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002, "grad_norm": 0.06923668831586838, "learning_rate": 1.0000000000000002e-06, "loss": 0.4175, "step": 1 }, { "epoch": 0.002, "eval_loss": 0.4618559181690216, "eval_runtime": 137.9356, "eval_samples_per_second": 4.002, "eval_steps_per_second": 0.5, "step": 1 }, { "epoch": 0.004, "grad_norm": 0.09036832302808762, "learning_rate": 2.0000000000000003e-06, "loss": 0.5159, "step": 2 }, { "epoch": 0.006, "grad_norm": 0.06212183088064194, "learning_rate": 3e-06, "loss": 0.3274, "step": 3 }, { "epoch": 0.008, "grad_norm": 0.089068204164505, "learning_rate": 4.000000000000001e-06, "loss": 0.5353, "step": 4 }, { "epoch": 0.01, "grad_norm": 0.08060520887374878, "learning_rate": 5e-06, "loss": 0.5229, "step": 5 }, { "epoch": 0.012, "grad_norm": 0.08129512518644333, "learning_rate": 6e-06, "loss": 0.416, "step": 6 }, { "epoch": 0.014, "grad_norm": 0.13881395757198334, "learning_rate": 7e-06, "loss": 0.4797, "step": 7 }, { "epoch": 0.016, "grad_norm": 0.09156442433595657, "learning_rate": 8.000000000000001e-06, "loss": 0.4808, "step": 8 }, { "epoch": 0.018, "grad_norm": 0.09145132452249527, "learning_rate": 9e-06, "loss": 0.4991, "step": 9 }, { "epoch": 0.02, "grad_norm": 0.08622220903635025, "learning_rate": 1e-05, "loss": 0.484, "step": 10 }, { "epoch": 0.022, "grad_norm": 0.07630373537540436, "learning_rate": 9.999974825027756e-06, "loss": 0.3951, "step": 11 }, { "epoch": 0.024, "grad_norm": 0.06840338557958603, "learning_rate": 9.999899300364534e-06, "loss": 0.4058, "step": 12 }, { "epoch": 0.026, "grad_norm": 0.09991295635700226, "learning_rate": 9.999773426770864e-06, "loss": 0.5737, "step": 13 }, { "epoch": 0.028, "grad_norm": 0.09987013041973114, "learning_rate": 9.999597205514298e-06, "loss": 0.4535, "step": 14 }, { "epoch": 0.03, "grad_norm": 0.07334341108798981, "learning_rate": 9.999370638369377e-06, "loss": 0.4047, "step": 15 }, { "epoch": 0.032, "grad_norm": 0.10504010319709778, "learning_rate": 9.99909372761763e-06, "loss": 0.4587, "step": 16 }, { "epoch": 0.034, "grad_norm": 0.12481511384248734, "learning_rate": 9.998766476047546e-06, "loss": 0.5568, "step": 17 }, { "epoch": 0.036, "grad_norm": 0.10193619877099991, "learning_rate": 9.998388886954546e-06, "loss": 0.58, "step": 18 }, { "epoch": 0.038, "grad_norm": 0.09747433662414551, "learning_rate": 9.997960964140946e-06, "loss": 0.4248, "step": 19 }, { "epoch": 0.04, "grad_norm": 0.10985693335533142, "learning_rate": 9.997482711915926e-06, "loss": 0.5813, "step": 20 }, { "epoch": 0.042, "grad_norm": 0.08061390370130539, "learning_rate": 9.99695413509548e-06, "loss": 0.3419, "step": 21 }, { "epoch": 0.044, "grad_norm": 0.09820478409528732, "learning_rate": 9.99637523900237e-06, "loss": 0.336, "step": 22 }, { "epoch": 0.046, "grad_norm": 0.11657540500164032, "learning_rate": 9.995746029466071e-06, "loss": 0.4634, "step": 23 }, { "epoch": 0.048, "grad_norm": 0.0904548391699791, "learning_rate": 9.99506651282272e-06, "loss": 0.4085, "step": 24 }, { "epoch": 0.05, "grad_norm": 0.1137523204088211, "learning_rate": 9.994336695915041e-06, "loss": 0.6002, "step": 25 }, { "epoch": 0.052, "grad_norm": 0.08930382132530212, "learning_rate": 9.993556586092281e-06, "loss": 0.4007, "step": 26 }, { "epoch": 0.054, "grad_norm": 0.10268951207399368, "learning_rate": 9.992726191210139e-06, "loss": 0.5762, "step": 27 }, { 
"epoch": 0.056, "grad_norm": 0.11000809073448181, "learning_rate": 9.991845519630679e-06, "loss": 0.5878, "step": 28 }, { "epoch": 0.058, "grad_norm": 0.08394967019557953, "learning_rate": 9.990914580222258e-06, "loss": 0.4447, "step": 29 }, { "epoch": 0.06, "grad_norm": 0.10849784314632416, "learning_rate": 9.989933382359423e-06, "loss": 0.6129, "step": 30 }, { "epoch": 0.062, "grad_norm": 0.09749893844127655, "learning_rate": 9.988901935922826e-06, "loss": 0.4993, "step": 31 }, { "epoch": 0.064, "grad_norm": 0.09867393970489502, "learning_rate": 9.987820251299121e-06, "loss": 0.4415, "step": 32 }, { "epoch": 0.066, "grad_norm": 0.07566885650157928, "learning_rate": 9.986688339380863e-06, "loss": 0.3669, "step": 33 }, { "epoch": 0.068, "grad_norm": 0.08246949315071106, "learning_rate": 9.985506211566388e-06, "loss": 0.4102, "step": 34 }, { "epoch": 0.07, "grad_norm": 0.10148797929286957, "learning_rate": 9.984273879759713e-06, "loss": 0.5327, "step": 35 }, { "epoch": 0.072, "grad_norm": 0.08779735118150711, "learning_rate": 9.982991356370404e-06, "loss": 0.4914, "step": 36 }, { "epoch": 0.074, "grad_norm": 0.09165964275598526, "learning_rate": 9.981658654313458e-06, "loss": 0.4136, "step": 37 }, { "epoch": 0.076, "grad_norm": 0.10425784438848495, "learning_rate": 9.98027578700917e-06, "loss": 0.6063, "step": 38 }, { "epoch": 0.078, "grad_norm": 0.09124460816383362, "learning_rate": 9.978842768382999e-06, "loss": 0.5461, "step": 39 }, { "epoch": 0.08, "grad_norm": 0.0863451436161995, "learning_rate": 9.977359612865424e-06, "loss": 0.5108, "step": 40 }, { "epoch": 0.082, "grad_norm": 0.11560487747192383, "learning_rate": 9.975826335391808e-06, "loss": 0.4965, "step": 41 }, { "epoch": 0.084, "grad_norm": 0.1319773942232132, "learning_rate": 9.974242951402236e-06, "loss": 0.4754, "step": 42 }, { "epoch": 0.086, "grad_norm": 0.08868485689163208, "learning_rate": 9.972609476841368e-06, "loss": 0.4958, "step": 43 }, { "epoch": 0.088, "grad_norm": 0.12390384823083878, "learning_rate": 9.970925928158275e-06, "loss": 0.5641, "step": 44 }, { "epoch": 0.09, "grad_norm": 0.095445416867733, "learning_rate": 9.969192322306271e-06, "loss": 0.5145, "step": 45 }, { "epoch": 0.092, "grad_norm": 0.09656377136707306, "learning_rate": 9.96740867674275e-06, "loss": 0.3749, "step": 46 }, { "epoch": 0.094, "grad_norm": 0.07841179519891739, "learning_rate": 9.965575009429006e-06, "loss": 0.4113, "step": 47 }, { "epoch": 0.096, "grad_norm": 0.07786890119314194, "learning_rate": 9.963691338830045e-06, "loss": 0.4374, "step": 48 }, { "epoch": 0.098, "grad_norm": 0.09050661325454712, "learning_rate": 9.961757683914406e-06, "loss": 0.5285, "step": 49 }, { "epoch": 0.1, "grad_norm": 0.11070208251476288, "learning_rate": 9.959774064153977e-06, "loss": 0.5326, "step": 50 }, { "epoch": 0.102, "grad_norm": 0.09067952632904053, "learning_rate": 9.957740499523787e-06, "loss": 0.5613, "step": 51 }, { "epoch": 0.104, "grad_norm": 0.08883544057607651, "learning_rate": 9.955657010501807e-06, "loss": 0.4599, "step": 52 }, { "epoch": 0.106, "grad_norm": 0.10251513868570328, "learning_rate": 9.95352361806875e-06, "loss": 0.5354, "step": 53 }, { "epoch": 0.108, "grad_norm": 0.07133735716342926, "learning_rate": 9.951340343707852e-06, "loss": 0.3696, "step": 54 }, { "epoch": 0.11, "grad_norm": 0.061642151325941086, "learning_rate": 9.949107209404664e-06, "loss": 0.3472, "step": 55 }, { "epoch": 0.112, "grad_norm": 0.08950634300708771, "learning_rate": 9.946824237646823e-06, "loss": 0.4969, "step": 56 }, { "epoch": 0.114, "grad_norm": 
0.08016358315944672, "learning_rate": 9.944491451423829e-06, "loss": 0.5239, "step": 57 }, { "epoch": 0.116, "grad_norm": 0.12512832880020142, "learning_rate": 9.942108874226812e-06, "loss": 0.5365, "step": 58 }, { "epoch": 0.118, "grad_norm": 0.09220532327890396, "learning_rate": 9.9396765300483e-06, "loss": 0.4783, "step": 59 }, { "epoch": 0.12, "grad_norm": 0.0885612890124321, "learning_rate": 9.937194443381972e-06, "loss": 0.5459, "step": 60 }, { "epoch": 0.122, "grad_norm": 0.08592379838228226, "learning_rate": 9.934662639222412e-06, "loss": 0.4545, "step": 61 }, { "epoch": 0.124, "grad_norm": 0.08418423682451248, "learning_rate": 9.93208114306486e-06, "loss": 0.5105, "step": 62 }, { "epoch": 0.126, "grad_norm": 0.07870952039957047, "learning_rate": 9.929449980904952e-06, "loss": 0.4593, "step": 63 }, { "epoch": 0.128, "grad_norm": 0.08841884881258011, "learning_rate": 9.926769179238467e-06, "loss": 0.4812, "step": 64 }, { "epoch": 0.13, "grad_norm": 0.07493194192647934, "learning_rate": 9.924038765061042e-06, "loss": 0.5065, "step": 65 }, { "epoch": 0.132, "grad_norm": 0.08470446616411209, "learning_rate": 9.921258765867919e-06, "loss": 0.4676, "step": 66 }, { "epoch": 0.134, "grad_norm": 0.0656595379114151, "learning_rate": 9.918429209653662e-06, "loss": 0.3227, "step": 67 }, { "epoch": 0.136, "grad_norm": 0.06501025706529617, "learning_rate": 9.915550124911866e-06, "loss": 0.2777, "step": 68 }, { "epoch": 0.138, "grad_norm": 0.08443128317594528, "learning_rate": 9.912621540634889e-06, "loss": 0.4357, "step": 69 }, { "epoch": 0.14, "grad_norm": 0.07121642678976059, "learning_rate": 9.909643486313533e-06, "loss": 0.3545, "step": 70 }, { "epoch": 0.142, "grad_norm": 0.09408602863550186, "learning_rate": 9.906615991936781e-06, "loss": 0.3916, "step": 71 }, { "epoch": 0.144, "grad_norm": 0.05998094752430916, "learning_rate": 9.903539087991462e-06, "loss": 0.2739, "step": 72 }, { "epoch": 0.146, "grad_norm": 0.08949826657772064, "learning_rate": 9.900412805461968e-06, "loss": 0.3722, "step": 73 }, { "epoch": 0.148, "grad_norm": 0.0731697678565979, "learning_rate": 9.897237175829927e-06, "loss": 0.2906, "step": 74 }, { "epoch": 0.15, "grad_norm": 0.07855986058712006, "learning_rate": 9.894012231073895e-06, "loss": 0.4149, "step": 75 }, { "epoch": 0.152, "grad_norm": 0.0791892409324646, "learning_rate": 9.890738003669029e-06, "loss": 0.4383, "step": 76 }, { "epoch": 0.154, "grad_norm": 0.07980603724718094, "learning_rate": 9.887414526586764e-06, "loss": 0.4867, "step": 77 }, { "epoch": 0.156, "grad_norm": 0.08503536880016327, "learning_rate": 9.884041833294477e-06, "loss": 0.4644, "step": 78 }, { "epoch": 0.158, "grad_norm": 0.09240555018186569, "learning_rate": 9.880619957755151e-06, "loss": 0.3107, "step": 79 }, { "epoch": 0.16, "grad_norm": 0.08195238560438156, "learning_rate": 9.877148934427037e-06, "loss": 0.3414, "step": 80 }, { "epoch": 0.162, "grad_norm": 0.09512759745121002, "learning_rate": 9.873628798263297e-06, "loss": 0.4745, "step": 81 }, { "epoch": 0.164, "grad_norm": 0.07976000756025314, "learning_rate": 9.870059584711668e-06, "loss": 0.3925, "step": 82 }, { "epoch": 0.166, "grad_norm": 0.11229317635297775, "learning_rate": 9.86644132971409e-06, "loss": 0.4921, "step": 83 }, { "epoch": 0.168, "grad_norm": 0.07479218393564224, "learning_rate": 9.862774069706346e-06, "loss": 0.3607, "step": 84 }, { "epoch": 0.17, "grad_norm": 0.08530927449464798, "learning_rate": 9.859057841617709e-06, "loss": 0.4116, "step": 85 }, { "epoch": 0.172, "grad_norm": 0.05544688552618027, 
"learning_rate": 9.855292682870552e-06, "loss": 0.2043, "step": 86 }, { "epoch": 0.174, "grad_norm": 0.08539939671754837, "learning_rate": 9.851478631379982e-06, "loss": 0.4437, "step": 87 }, { "epoch": 0.176, "grad_norm": 0.08732863515615463, "learning_rate": 9.847615725553457e-06, "loss": 0.4449, "step": 88 }, { "epoch": 0.178, "grad_norm": 0.08848625421524048, "learning_rate": 9.843704004290393e-06, "loss": 0.5191, "step": 89 }, { "epoch": 0.18, "grad_norm": 0.1142885684967041, "learning_rate": 9.839743506981783e-06, "loss": 0.3788, "step": 90 }, { "epoch": 0.182, "grad_norm": 0.0678037703037262, "learning_rate": 9.835734273509787e-06, "loss": 0.3655, "step": 91 }, { "epoch": 0.184, "grad_norm": 0.08179458975791931, "learning_rate": 9.831676344247343e-06, "loss": 0.4804, "step": 92 }, { "epoch": 0.186, "grad_norm": 0.10821828246116638, "learning_rate": 9.827569760057755e-06, "loss": 0.4946, "step": 93 }, { "epoch": 0.188, "grad_norm": 0.06980521976947784, "learning_rate": 9.82341456229428e-06, "loss": 0.3301, "step": 94 }, { "epoch": 0.19, "grad_norm": 0.07966768741607666, "learning_rate": 9.819210792799711e-06, "loss": 0.4377, "step": 95 }, { "epoch": 0.192, "grad_norm": 0.08750802278518677, "learning_rate": 9.814958493905962e-06, "loss": 0.4137, "step": 96 }, { "epoch": 0.194, "grad_norm": 0.08171187341213226, "learning_rate": 9.810657708433637e-06, "loss": 0.5154, "step": 97 }, { "epoch": 0.196, "grad_norm": 0.07627864181995392, "learning_rate": 9.806308479691595e-06, "loss": 0.3593, "step": 98 }, { "epoch": 0.198, "grad_norm": 0.07038850337266922, "learning_rate": 9.801910851476524e-06, "loss": 0.3882, "step": 99 }, { "epoch": 0.2, "grad_norm": 0.09910848736763, "learning_rate": 9.797464868072489e-06, "loss": 0.5034, "step": 100 }, { "epoch": 0.202, "grad_norm": 0.08382704854011536, "learning_rate": 9.792970574250493e-06, "loss": 0.4769, "step": 101 }, { "epoch": 0.204, "grad_norm": 0.07511335611343384, "learning_rate": 9.788428015268027e-06, "loss": 0.3703, "step": 102 }, { "epoch": 0.206, "grad_norm": 0.08155877888202667, "learning_rate": 9.78383723686861e-06, "loss": 0.4102, "step": 103 }, { "epoch": 0.208, "grad_norm": 0.06436574459075928, "learning_rate": 9.779198285281326e-06, "loss": 0.3253, "step": 104 }, { "epoch": 0.21, "grad_norm": 0.06901544332504272, "learning_rate": 9.774511207220369e-06, "loss": 0.2842, "step": 105 }, { "epoch": 0.212, "grad_norm": 0.08444689959287643, "learning_rate": 9.769776049884564e-06, "loss": 0.4212, "step": 106 }, { "epoch": 0.214, "grad_norm": 0.08550014346837997, "learning_rate": 9.76499286095689e-06, "loss": 0.4404, "step": 107 }, { "epoch": 0.216, "grad_norm": 0.09659305214881897, "learning_rate": 9.760161688604008e-06, "loss": 0.5841, "step": 108 }, { "epoch": 0.218, "grad_norm": 0.06201549619436264, "learning_rate": 9.755282581475769e-06, "loss": 0.2246, "step": 109 }, { "epoch": 0.22, "grad_norm": 0.07813581079244614, "learning_rate": 9.750355588704728e-06, "loss": 0.4415, "step": 110 }, { "epoch": 0.222, "grad_norm": 0.10021974891424179, "learning_rate": 9.745380759905648e-06, "loss": 0.3042, "step": 111 }, { "epoch": 0.224, "grad_norm": 0.10321412235498428, "learning_rate": 9.740358145174999e-06, "loss": 0.4837, "step": 112 }, { "epoch": 0.226, "grad_norm": 0.11536537110805511, "learning_rate": 9.735287795090455e-06, "loss": 0.5586, "step": 113 }, { "epoch": 0.228, "grad_norm": 0.07521039247512817, "learning_rate": 9.730169760710385e-06, "loss": 0.361, "step": 114 }, { "epoch": 0.23, "grad_norm": 0.07128458470106125, "learning_rate": 
9.725004093573343e-06, "loss": 0.3511, "step": 115 }, { "epoch": 0.232, "grad_norm": 0.08504608273506165, "learning_rate": 9.719790845697534e-06, "loss": 0.4472, "step": 116 }, { "epoch": 0.234, "grad_norm": 0.08541107177734375, "learning_rate": 9.71453006958031e-06, "loss": 0.3195, "step": 117 }, { "epoch": 0.236, "grad_norm": 0.085638627409935, "learning_rate": 9.709221818197626e-06, "loss": 0.4343, "step": 118 }, { "epoch": 0.238, "grad_norm": 0.06405656784772873, "learning_rate": 9.703866145003512e-06, "loss": 0.2905, "step": 119 }, { "epoch": 0.24, "grad_norm": 0.12191811949014664, "learning_rate": 9.698463103929542e-06, "loss": 0.4092, "step": 120 }, { "epoch": 0.242, "grad_norm": 0.08051154762506485, "learning_rate": 9.69301274938428e-06, "loss": 0.3362, "step": 121 }, { "epoch": 0.244, "grad_norm": 0.09473302215337753, "learning_rate": 9.687515136252732e-06, "loss": 0.3941, "step": 122 }, { "epoch": 0.246, "grad_norm": 0.09992998838424683, "learning_rate": 9.681970319895804e-06, "loss": 0.4603, "step": 123 }, { "epoch": 0.248, "grad_norm": 0.08887780457735062, "learning_rate": 9.676378356149733e-06, "loss": 0.3082, "step": 124 }, { "epoch": 0.25, "grad_norm": 0.08823645859956741, "learning_rate": 9.670739301325534e-06, "loss": 0.4301, "step": 125 }, { "epoch": 0.25, "eval_loss": 0.3706146478652954, "eval_runtime": 76.5201, "eval_samples_per_second": 7.214, "eval_steps_per_second": 0.902, "step": 125 }, { "epoch": 0.252, "grad_norm": 0.10688935965299606, "learning_rate": 9.665053212208426e-06, "loss": 0.3065, "step": 126 }, { "epoch": 0.254, "grad_norm": 0.09517981857061386, "learning_rate": 9.659320146057263e-06, "loss": 0.5437, "step": 127 }, { "epoch": 0.256, "grad_norm": 0.11310486495494843, "learning_rate": 9.653540160603956e-06, "loss": 0.6087, "step": 128 }, { "epoch": 0.258, "grad_norm": 0.08851969987154007, "learning_rate": 9.647713314052896e-06, "loss": 0.3598, "step": 129 }, { "epoch": 0.26, "grad_norm": 0.09503145515918732, "learning_rate": 9.641839665080363e-06, "loss": 0.338, "step": 130 }, { "epoch": 0.262, "grad_norm": 0.09553948044776917, "learning_rate": 9.635919272833938e-06, "loss": 0.3801, "step": 131 }, { "epoch": 0.264, "grad_norm": 0.09811339527368546, "learning_rate": 9.629952196931902e-06, "loss": 0.3866, "step": 132 }, { "epoch": 0.266, "grad_norm": 0.0865439921617508, "learning_rate": 9.623938497462647e-06, "loss": 0.4466, "step": 133 }, { "epoch": 0.268, "grad_norm": 0.09298735857009888, "learning_rate": 9.617878234984056e-06, "loss": 0.4413, "step": 134 }, { "epoch": 0.27, "grad_norm": 0.10931612551212311, "learning_rate": 9.611771470522908e-06, "loss": 0.3974, "step": 135 }, { "epoch": 0.272, "grad_norm": 0.08798681199550629, "learning_rate": 9.60561826557425e-06, "loss": 0.4052, "step": 136 }, { "epoch": 0.274, "grad_norm": 0.09892652928829193, "learning_rate": 9.599418682100793e-06, "loss": 0.4645, "step": 137 }, { "epoch": 0.276, "grad_norm": 0.10193604230880737, "learning_rate": 9.59317278253227e-06, "loss": 0.4064, "step": 138 }, { "epoch": 0.278, "grad_norm": 0.07900392264127731, "learning_rate": 9.586880629764817e-06, "loss": 0.3229, "step": 139 }, { "epoch": 0.28, "grad_norm": 0.08284664154052734, "learning_rate": 9.580542287160348e-06, "loss": 0.3703, "step": 140 }, { "epoch": 0.282, "grad_norm": 0.08164459466934204, "learning_rate": 9.574157818545902e-06, "loss": 0.2879, "step": 141 }, { "epoch": 0.284, "grad_norm": 0.1115422248840332, "learning_rate": 9.567727288213005e-06, "loss": 0.4593, "step": 142 }, { "epoch": 0.286, "grad_norm": 
0.09770838916301727, "learning_rate": 9.561250760917026e-06, "loss": 0.4133, "step": 143 }, { "epoch": 0.288, "grad_norm": 0.12189961224794388, "learning_rate": 9.554728301876525e-06, "loss": 0.5928, "step": 144 }, { "epoch": 0.29, "grad_norm": 0.14093732833862305, "learning_rate": 9.548159976772593e-06, "loss": 0.415, "step": 145 }, { "epoch": 0.292, "grad_norm": 0.11479732394218445, "learning_rate": 9.541545851748186e-06, "loss": 0.3691, "step": 146 }, { "epoch": 0.294, "grad_norm": 0.09249378740787506, "learning_rate": 9.534885993407474e-06, "loss": 0.3394, "step": 147 }, { "epoch": 0.296, "grad_norm": 0.10194878280162811, "learning_rate": 9.528180468815155e-06, "loss": 0.3745, "step": 148 }, { "epoch": 0.298, "grad_norm": 0.09345925599336624, "learning_rate": 9.521429345495787e-06, "loss": 0.3934, "step": 149 }, { "epoch": 0.3, "grad_norm": 0.09919178485870361, "learning_rate": 9.514632691433108e-06, "loss": 0.4053, "step": 150 }, { "epoch": 0.302, "grad_norm": 0.10807909071445465, "learning_rate": 9.507790575069347e-06, "loss": 0.4631, "step": 151 }, { "epoch": 0.304, "grad_norm": 0.10555636882781982, "learning_rate": 9.50090306530454e-06, "loss": 0.4952, "step": 152 }, { "epoch": 0.306, "grad_norm": 0.10507559776306152, "learning_rate": 9.493970231495836e-06, "loss": 0.294, "step": 153 }, { "epoch": 0.308, "grad_norm": 0.08718883246183395, "learning_rate": 9.486992143456792e-06, "loss": 0.3044, "step": 154 }, { "epoch": 0.31, "grad_norm": 0.10039477050304413, "learning_rate": 9.47996887145668e-06, "loss": 0.3736, "step": 155 }, { "epoch": 0.312, "grad_norm": 0.09952064603567123, "learning_rate": 9.47290048621977e-06, "loss": 0.4359, "step": 156 }, { "epoch": 0.314, "grad_norm": 0.10663799196481705, "learning_rate": 9.46578705892462e-06, "loss": 0.3939, "step": 157 }, { "epoch": 0.316, "grad_norm": 0.10759017616510391, "learning_rate": 9.458628661203368e-06, "loss": 0.4575, "step": 158 }, { "epoch": 0.318, "grad_norm": 0.08924371749162674, "learning_rate": 9.451425365140997e-06, "loss": 0.3525, "step": 159 }, { "epoch": 0.32, "grad_norm": 0.13670168817043304, "learning_rate": 9.444177243274619e-06, "loss": 0.5385, "step": 160 }, { "epoch": 0.322, "grad_norm": 0.10520858317613602, "learning_rate": 9.43688436859274e-06, "loss": 0.2964, "step": 161 }, { "epoch": 0.324, "grad_norm": 0.10608810931444168, "learning_rate": 9.429546814534528e-06, "loss": 0.4369, "step": 162 }, { "epoch": 0.326, "grad_norm": 0.08399061113595963, "learning_rate": 9.422164654989073e-06, "loss": 0.3246, "step": 163 }, { "epoch": 0.328, "grad_norm": 0.11295214295387268, "learning_rate": 9.414737964294636e-06, "loss": 0.4766, "step": 164 }, { "epoch": 0.33, "grad_norm": 0.1255977749824524, "learning_rate": 9.40726681723791e-06, "loss": 0.5263, "step": 165 }, { "epoch": 0.332, "grad_norm": 0.0891086682677269, "learning_rate": 9.399751289053267e-06, "loss": 0.2796, "step": 166 }, { "epoch": 0.334, "grad_norm": 0.12856395542621613, "learning_rate": 9.392191455421989e-06, "loss": 0.4485, "step": 167 }, { "epoch": 0.336, "grad_norm": 0.1172974556684494, "learning_rate": 9.384587392471516e-06, "loss": 0.542, "step": 168 }, { "epoch": 0.338, "grad_norm": 0.08675208687782288, "learning_rate": 9.376939176774678e-06, "loss": 0.2899, "step": 169 }, { "epoch": 0.34, "grad_norm": 0.11079028248786926, "learning_rate": 9.369246885348926e-06, "loss": 0.3732, "step": 170 }, { "epoch": 0.342, "grad_norm": 0.12667471170425415, "learning_rate": 9.361510595655545e-06, "loss": 0.54, "step": 171 }, { "epoch": 0.344, "grad_norm": 
0.08692082017660141, "learning_rate": 9.353730385598887e-06, "loss": 0.3873, "step": 172 }, { "epoch": 0.346, "grad_norm": 0.1013069748878479, "learning_rate": 9.345906333525582e-06, "loss": 0.438, "step": 173 }, { "epoch": 0.348, "grad_norm": 0.09999188780784607, "learning_rate": 9.338038518223746e-06, "loss": 0.4467, "step": 174 }, { "epoch": 0.35, "grad_norm": 0.11317498981952667, "learning_rate": 9.330127018922195e-06, "loss": 0.3912, "step": 175 }, { "epoch": 0.352, "grad_norm": 0.10574603080749512, "learning_rate": 9.322171915289635e-06, "loss": 0.3808, "step": 176 }, { "epoch": 0.354, "grad_norm": 0.1281527876853943, "learning_rate": 9.314173287433874e-06, "loss": 0.423, "step": 177 }, { "epoch": 0.356, "grad_norm": 0.12899580597877502, "learning_rate": 9.306131215901004e-06, "loss": 0.4509, "step": 178 }, { "epoch": 0.358, "grad_norm": 0.10952267050743103, "learning_rate": 9.298045781674595e-06, "loss": 0.3512, "step": 179 }, { "epoch": 0.36, "grad_norm": 0.1423255354166031, "learning_rate": 9.289917066174887e-06, "loss": 0.3631, "step": 180 }, { "epoch": 0.362, "grad_norm": 0.13039131462574005, "learning_rate": 9.281745151257946e-06, "loss": 0.3762, "step": 181 }, { "epoch": 0.364, "grad_norm": 0.10448655486106873, "learning_rate": 9.273530119214868e-06, "loss": 0.3694, "step": 182 }, { "epoch": 0.366, "grad_norm": 0.0945306122303009, "learning_rate": 9.265272052770936e-06, "loss": 0.28, "step": 183 }, { "epoch": 0.368, "grad_norm": 0.10995735973119736, "learning_rate": 9.256971035084786e-06, "loss": 0.4849, "step": 184 }, { "epoch": 0.37, "grad_norm": 0.11014600843191147, "learning_rate": 9.248627149747573e-06, "loss": 0.3213, "step": 185 }, { "epoch": 0.372, "grad_norm": 0.09283925592899323, "learning_rate": 9.24024048078213e-06, "loss": 0.4077, "step": 186 }, { "epoch": 0.374, "grad_norm": 0.14395715296268463, "learning_rate": 9.231811112642121e-06, "loss": 0.4869, "step": 187 }, { "epoch": 0.376, "grad_norm": 0.10785488784313202, "learning_rate": 9.223339130211194e-06, "loss": 0.4122, "step": 188 }, { "epoch": 0.378, "grad_norm": 0.09983161091804504, "learning_rate": 9.214824618802108e-06, "loss": 0.3027, "step": 189 }, { "epoch": 0.38, "grad_norm": 0.10121427476406097, "learning_rate": 9.206267664155906e-06, "loss": 0.3055, "step": 190 }, { "epoch": 0.382, "grad_norm": 0.11393419653177261, "learning_rate": 9.197668352441025e-06, "loss": 0.3567, "step": 191 }, { "epoch": 0.384, "grad_norm": 0.132842019200325, "learning_rate": 9.189026770252437e-06, "loss": 0.3556, "step": 192 }, { "epoch": 0.386, "grad_norm": 0.1139449030160904, "learning_rate": 9.18034300461078e-06, "loss": 0.4298, "step": 193 }, { "epoch": 0.388, "grad_norm": 0.09980877488851547, "learning_rate": 9.171617142961477e-06, "loss": 0.3853, "step": 194 }, { "epoch": 0.39, "grad_norm": 0.12531818449497223, "learning_rate": 9.162849273173857e-06, "loss": 0.4845, "step": 195 }, { "epoch": 0.392, "grad_norm": 0.11148197203874588, "learning_rate": 9.154039483540273e-06, "loss": 0.4091, "step": 196 }, { "epoch": 0.394, "grad_norm": 0.11962081491947174, "learning_rate": 9.145187862775208e-06, "loss": 0.371, "step": 197 }, { "epoch": 0.396, "grad_norm": 0.10789982974529266, "learning_rate": 9.136294500014387e-06, "loss": 0.4268, "step": 198 }, { "epoch": 0.398, "grad_norm": 0.15846121311187744, "learning_rate": 9.12735948481387e-06, "loss": 0.6264, "step": 199 }, { "epoch": 0.4, "grad_norm": 0.1426246613264084, "learning_rate": 9.118382907149164e-06, "loss": 0.4769, "step": 200 }, { "epoch": 0.402, "grad_norm": 
0.1069459393620491, "learning_rate": 9.109364857414306e-06, "loss": 0.3708, "step": 201 }, { "epoch": 0.404, "grad_norm": 0.10732389986515045, "learning_rate": 9.100305426420957e-06, "loss": 0.3962, "step": 202 }, { "epoch": 0.406, "grad_norm": 0.1436106562614441, "learning_rate": 9.091204705397485e-06, "loss": 0.4549, "step": 203 }, { "epoch": 0.408, "grad_norm": 0.10230587422847748, "learning_rate": 9.08206278598805e-06, "loss": 0.3926, "step": 204 }, { "epoch": 0.41, "grad_norm": 0.11367027461528778, "learning_rate": 9.07287976025168e-06, "loss": 0.3378, "step": 205 }, { "epoch": 0.412, "grad_norm": 0.14832234382629395, "learning_rate": 9.06365572066134e-06, "loss": 0.4202, "step": 206 }, { "epoch": 0.414, "grad_norm": 0.10567332804203033, "learning_rate": 9.05439076010301e-06, "loss": 0.2904, "step": 207 }, { "epoch": 0.416, "grad_norm": 0.11918513476848602, "learning_rate": 9.045084971874738e-06, "loss": 0.2632, "step": 208 }, { "epoch": 0.418, "grad_norm": 0.13223537802696228, "learning_rate": 9.035738449685707e-06, "loss": 0.4208, "step": 209 }, { "epoch": 0.42, "grad_norm": 0.12573251128196716, "learning_rate": 9.026351287655294e-06, "loss": 0.4609, "step": 210 }, { "epoch": 0.422, "grad_norm": 0.11943136155605316, "learning_rate": 9.016923580312114e-06, "loss": 0.3323, "step": 211 }, { "epoch": 0.424, "grad_norm": 0.13152974843978882, "learning_rate": 9.007455422593077e-06, "loss": 0.4258, "step": 212 }, { "epoch": 0.426, "grad_norm": 0.13339808583259583, "learning_rate": 8.997946909842426e-06, "loss": 0.5303, "step": 213 }, { "epoch": 0.428, "grad_norm": 0.11746034771203995, "learning_rate": 8.988398137810778e-06, "loss": 0.4109, "step": 214 }, { "epoch": 0.43, "grad_norm": 0.11518029868602753, "learning_rate": 8.978809202654161e-06, "loss": 0.4154, "step": 215 }, { "epoch": 0.432, "grad_norm": 0.15307952463626862, "learning_rate": 8.969180200933048e-06, "loss": 0.4196, "step": 216 }, { "epoch": 0.434, "grad_norm": 0.11385340988636017, "learning_rate": 8.959511229611377e-06, "loss": 0.3713, "step": 217 }, { "epoch": 0.436, "grad_norm": 0.1380355805158615, "learning_rate": 8.949802386055582e-06, "loss": 0.3891, "step": 218 }, { "epoch": 0.438, "grad_norm": 0.09614066779613495, "learning_rate": 8.94005376803361e-06, "loss": 0.2527, "step": 219 }, { "epoch": 0.44, "grad_norm": 0.12352288514375687, "learning_rate": 8.930265473713939e-06, "loss": 0.3737, "step": 220 }, { "epoch": 0.442, "grad_norm": 0.18210633099079132, "learning_rate": 8.92043760166458e-06, "loss": 0.3839, "step": 221 }, { "epoch": 0.444, "grad_norm": 0.1087498739361763, "learning_rate": 8.910570250852098e-06, "loss": 0.3141, "step": 222 }, { "epoch": 0.446, "grad_norm": 0.11985889822244644, "learning_rate": 8.900663520640605e-06, "loss": 0.4606, "step": 223 }, { "epoch": 0.448, "grad_norm": 0.146299347281456, "learning_rate": 8.890717510790763e-06, "loss": 0.4094, "step": 224 }, { "epoch": 0.45, "grad_norm": 0.09788361191749573, "learning_rate": 8.880732321458785e-06, "loss": 0.2964, "step": 225 }, { "epoch": 0.452, "grad_norm": 0.09735774993896484, "learning_rate": 8.870708053195414e-06, "loss": 0.2646, "step": 226 }, { "epoch": 0.454, "grad_norm": 0.1293504238128662, "learning_rate": 8.860644806944917e-06, "loss": 0.2991, "step": 227 }, { "epoch": 0.456, "grad_norm": 0.13126921653747559, "learning_rate": 8.850542684044078e-06, "loss": 0.4474, "step": 228 }, { "epoch": 0.458, "grad_norm": 0.11488878726959229, "learning_rate": 8.84040178622116e-06, "loss": 0.3628, "step": 229 }, { "epoch": 0.46, "grad_norm": 
0.13861073553562164, "learning_rate": 8.83022221559489e-06, "loss": 0.4022, "step": 230 }, { "epoch": 0.462, "grad_norm": 0.16164664924144745, "learning_rate": 8.820004074673433e-06, "loss": 0.4217, "step": 231 }, { "epoch": 0.464, "grad_norm": 0.10550030320882797, "learning_rate": 8.809747466353356e-06, "loss": 0.2927, "step": 232 }, { "epoch": 0.466, "grad_norm": 0.1035122275352478, "learning_rate": 8.799452493918586e-06, "loss": 0.2453, "step": 233 }, { "epoch": 0.468, "grad_norm": 0.15530018508434296, "learning_rate": 8.789119261039385e-06, "loss": 0.3758, "step": 234 }, { "epoch": 0.47, "grad_norm": 0.13951483368873596, "learning_rate": 8.778747871771293e-06, "loss": 0.4502, "step": 235 }, { "epoch": 0.472, "grad_norm": 0.13241475820541382, "learning_rate": 8.768338430554083e-06, "loss": 0.5012, "step": 236 }, { "epoch": 0.474, "grad_norm": 0.11370962113142014, "learning_rate": 8.757891042210713e-06, "loss": 0.2801, "step": 237 }, { "epoch": 0.476, "grad_norm": 0.1501305103302002, "learning_rate": 8.747405811946272e-06, "loss": 0.4888, "step": 238 }, { "epoch": 0.478, "grad_norm": 0.1636514514684677, "learning_rate": 8.736882845346906e-06, "loss": 0.518, "step": 239 }, { "epoch": 0.48, "grad_norm": 0.11505798250436783, "learning_rate": 8.726322248378775e-06, "loss": 0.2627, "step": 240 }, { "epoch": 0.482, "grad_norm": 0.15717971324920654, "learning_rate": 8.715724127386971e-06, "loss": 0.3299, "step": 241 }, { "epoch": 0.484, "grad_norm": 0.13042742013931274, "learning_rate": 8.705088589094458e-06, "loss": 0.351, "step": 242 }, { "epoch": 0.486, "grad_norm": 0.1414385885000229, "learning_rate": 8.69441574060099e-06, "loss": 0.471, "step": 243 }, { "epoch": 0.488, "grad_norm": 0.10110446810722351, "learning_rate": 8.683705689382025e-06, "loss": 0.2369, "step": 244 }, { "epoch": 0.49, "grad_norm": 0.1549258530139923, "learning_rate": 8.672958543287666e-06, "loss": 0.4333, "step": 245 }, { "epoch": 0.492, "grad_norm": 0.11834664642810822, "learning_rate": 8.662174410541556e-06, "loss": 0.3182, "step": 246 }, { "epoch": 0.494, "grad_norm": 0.1529727429151535, "learning_rate": 8.651353399739787e-06, "loss": 0.4963, "step": 247 }, { "epoch": 0.496, "grad_norm": 0.14854104816913605, "learning_rate": 8.640495619849821e-06, "loss": 0.4514, "step": 248 }, { "epoch": 0.498, "grad_norm": 0.12271202355623245, "learning_rate": 8.629601180209382e-06, "loss": 0.3694, "step": 249 }, { "epoch": 0.5, "grad_norm": 0.11352905631065369, "learning_rate": 8.61867019052535e-06, "loss": 0.2978, "step": 250 }, { "epoch": 0.5, "eval_loss": 0.32808247208595276, "eval_runtime": 76.51, "eval_samples_per_second": 7.215, "eval_steps_per_second": 0.902, "step": 250 }, { "epoch": 0.502, "grad_norm": 0.1511523425579071, "learning_rate": 8.607702760872679e-06, "loss": 0.4037, "step": 251 }, { "epoch": 0.504, "grad_norm": 0.13344620168209076, "learning_rate": 8.596699001693257e-06, "loss": 0.2303, "step": 252 }, { "epoch": 0.506, "grad_norm": 0.12220989167690277, "learning_rate": 8.585659023794818e-06, "loss": 0.4347, "step": 253 }, { "epoch": 0.508, "grad_norm": 0.1094481498003006, "learning_rate": 8.574582938349818e-06, "loss": 0.3089, "step": 254 }, { "epoch": 0.51, "grad_norm": 0.11940666288137436, "learning_rate": 8.563470856894316e-06, "loss": 0.2699, "step": 255 }, { "epoch": 0.512, "grad_norm": 0.139656201004982, "learning_rate": 8.552322891326846e-06, "loss": 0.2763, "step": 256 }, { "epoch": 0.514, "grad_norm": 0.11665194481611252, "learning_rate": 8.541139153907296e-06, "loss": 0.2695, "step": 257 }, { 
"epoch": 0.516, "grad_norm": 0.12714596092700958, "learning_rate": 8.529919757255783e-06, "loss": 0.2489, "step": 258 }, { "epoch": 0.518, "grad_norm": 0.12326015532016754, "learning_rate": 8.518664814351502e-06, "loss": 0.3067, "step": 259 }, { "epoch": 0.52, "grad_norm": 0.13826797902584076, "learning_rate": 8.507374438531606e-06, "loss": 0.3119, "step": 260 }, { "epoch": 0.522, "grad_norm": 0.15031856298446655, "learning_rate": 8.496048743490053e-06, "loss": 0.3112, "step": 261 }, { "epoch": 0.524, "grad_norm": 0.14100715517997742, "learning_rate": 8.48468784327647e-06, "loss": 0.3878, "step": 262 }, { "epoch": 0.526, "grad_norm": 0.15813864767551422, "learning_rate": 8.473291852294986e-06, "loss": 0.3382, "step": 263 }, { "epoch": 0.528, "grad_norm": 0.15911728143692017, "learning_rate": 8.461860885303116e-06, "loss": 0.4177, "step": 264 }, { "epoch": 0.53, "grad_norm": 0.15685637295246124, "learning_rate": 8.450395057410561e-06, "loss": 0.3557, "step": 265 }, { "epoch": 0.532, "grad_norm": 0.13905856013298035, "learning_rate": 8.438894484078086e-06, "loss": 0.3323, "step": 266 }, { "epoch": 0.534, "grad_norm": 0.13344989717006683, "learning_rate": 8.427359281116335e-06, "loss": 0.3475, "step": 267 }, { "epoch": 0.536, "grad_norm": 0.16016146540641785, "learning_rate": 8.415789564684673e-06, "loss": 0.3789, "step": 268 }, { "epoch": 0.538, "grad_norm": 0.11681054532527924, "learning_rate": 8.404185451290017e-06, "loss": 0.2061, "step": 269 }, { "epoch": 0.54, "grad_norm": 0.14662593603134155, "learning_rate": 8.392547057785662e-06, "loss": 0.4173, "step": 270 }, { "epoch": 0.542, "grad_norm": 0.21970625221729279, "learning_rate": 8.380874501370098e-06, "loss": 0.5602, "step": 271 }, { "epoch": 0.544, "grad_norm": 0.11630596220493317, "learning_rate": 8.36916789958584e-06, "loss": 0.2674, "step": 272 }, { "epoch": 0.546, "grad_norm": 0.14212217926979065, "learning_rate": 8.357427370318239e-06, "loss": 0.2776, "step": 273 }, { "epoch": 0.548, "grad_norm": 0.14911417663097382, "learning_rate": 8.345653031794292e-06, "loss": 0.4463, "step": 274 }, { "epoch": 0.55, "grad_norm": 0.142579585313797, "learning_rate": 8.33384500258146e-06, "loss": 0.4963, "step": 275 }, { "epoch": 0.552, "grad_norm": 0.14713557064533234, "learning_rate": 8.322003401586463e-06, "loss": 0.2642, "step": 276 }, { "epoch": 0.554, "grad_norm": 0.24756528437137604, "learning_rate": 8.310128348054093e-06, "loss": 0.5423, "step": 277 }, { "epoch": 0.556, "grad_norm": 0.13731062412261963, "learning_rate": 8.298219961566008e-06, "loss": 0.3333, "step": 278 }, { "epoch": 0.558, "grad_norm": 0.18075144290924072, "learning_rate": 8.286278362039527e-06, "loss": 0.3733, "step": 279 }, { "epoch": 0.56, "grad_norm": 0.1650344282388687, "learning_rate": 8.274303669726427e-06, "loss": 0.383, "step": 280 }, { "epoch": 0.562, "grad_norm": 0.18053463101387024, "learning_rate": 8.262296005211722e-06, "loss": 0.4359, "step": 281 }, { "epoch": 0.564, "grad_norm": 0.16192179918289185, "learning_rate": 8.250255489412464e-06, "loss": 0.3839, "step": 282 }, { "epoch": 0.566, "grad_norm": 0.16045285761356354, "learning_rate": 8.238182243576512e-06, "loss": 0.4185, "step": 283 }, { "epoch": 0.568, "grad_norm": 0.14847232401371002, "learning_rate": 8.226076389281316e-06, "loss": 0.43, "step": 284 }, { "epoch": 0.57, "grad_norm": 0.1868700236082077, "learning_rate": 8.213938048432697e-06, "loss": 0.3437, "step": 285 }, { "epoch": 0.572, "grad_norm": 0.1744498908519745, "learning_rate": 8.201767343263612e-06, "loss": 0.4926, "step": 286 }, { 
"epoch": 0.574, "grad_norm": 0.13156633079051971, "learning_rate": 8.189564396332927e-06, "loss": 0.4245, "step": 287 }, { "epoch": 0.576, "grad_norm": 0.17716287076473236, "learning_rate": 8.177329330524182e-06, "loss": 0.3134, "step": 288 }, { "epoch": 0.578, "grad_norm": 0.15387575328350067, "learning_rate": 8.165062269044353e-06, "loss": 0.3723, "step": 289 }, { "epoch": 0.58, "grad_norm": 0.11926203221082687, "learning_rate": 8.152763335422612e-06, "loss": 0.251, "step": 290 }, { "epoch": 0.582, "grad_norm": 0.14692164957523346, "learning_rate": 8.140432653509089e-06, "loss": 0.3068, "step": 291 }, { "epoch": 0.584, "grad_norm": 0.12874449789524078, "learning_rate": 8.128070347473609e-06, "loss": 0.3449, "step": 292 }, { "epoch": 0.586, "grad_norm": 0.1284901350736618, "learning_rate": 8.115676541804456e-06, "loss": 0.2336, "step": 293 }, { "epoch": 0.588, "grad_norm": 0.18448615074157715, "learning_rate": 8.10325136130712e-06, "loss": 0.4497, "step": 294 }, { "epoch": 0.59, "grad_norm": 0.18793466687202454, "learning_rate": 8.090794931103026e-06, "loss": 0.446, "step": 295 }, { "epoch": 0.592, "grad_norm": 0.11833447217941284, "learning_rate": 8.078307376628292e-06, "loss": 0.286, "step": 296 }, { "epoch": 0.594, "grad_norm": 0.14963407814502716, "learning_rate": 8.065788823632451e-06, "loss": 0.329, "step": 297 }, { "epoch": 0.596, "grad_norm": 0.1394645869731903, "learning_rate": 8.053239398177191e-06, "loss": 0.2671, "step": 298 }, { "epoch": 0.598, "grad_norm": 0.17401300370693207, "learning_rate": 8.04065922663509e-06, "loss": 0.5106, "step": 299 }, { "epoch": 0.6, "grad_norm": 0.1559733897447586, "learning_rate": 8.028048435688333e-06, "loss": 0.259, "step": 300 }, { "epoch": 0.602, "grad_norm": 0.14853116869926453, "learning_rate": 8.015407152327448e-06, "loss": 0.4095, "step": 301 }, { "epoch": 0.604, "grad_norm": 0.13665775954723358, "learning_rate": 8.002735503850016e-06, "loss": 0.379, "step": 302 }, { "epoch": 0.606, "grad_norm": 0.15187975764274597, "learning_rate": 7.990033617859396e-06, "loss": 0.336, "step": 303 }, { "epoch": 0.608, "grad_norm": 0.17993216216564178, "learning_rate": 7.97730162226344e-06, "loss": 0.4718, "step": 304 }, { "epoch": 0.61, "grad_norm": 0.14840970933437347, "learning_rate": 7.964539645273204e-06, "loss": 0.3572, "step": 305 }, { "epoch": 0.612, "grad_norm": 0.2386975884437561, "learning_rate": 7.951747815401651e-06, "loss": 0.3185, "step": 306 }, { "epoch": 0.614, "grad_norm": 0.21291233599185944, "learning_rate": 7.938926261462366e-06, "loss": 0.362, "step": 307 }, { "epoch": 0.616, "grad_norm": 0.16196957230567932, "learning_rate": 7.92607511256826e-06, "loss": 0.3024, "step": 308 }, { "epoch": 0.618, "grad_norm": 0.2727487087249756, "learning_rate": 7.913194498130252e-06, "loss": 0.5212, "step": 309 }, { "epoch": 0.62, "grad_norm": 0.1640804558992386, "learning_rate": 7.900284547855992e-06, "loss": 0.3948, "step": 310 }, { "epoch": 0.622, "grad_norm": 0.22003543376922607, "learning_rate": 7.887345391748533e-06, "loss": 0.3745, "step": 311 }, { "epoch": 0.624, "grad_norm": 0.1896262764930725, "learning_rate": 7.874377160105037e-06, "loss": 0.4448, "step": 312 }, { "epoch": 0.626, "grad_norm": 0.18609432876110077, "learning_rate": 7.861379983515449e-06, "loss": 0.3685, "step": 313 }, { "epoch": 0.628, "grad_norm": 0.14590106904506683, "learning_rate": 7.848353992861195e-06, "loss": 0.3338, "step": 314 }, { "epoch": 0.63, "grad_norm": 0.13211271166801453, "learning_rate": 7.835299319313854e-06, "loss": 0.3297, "step": 315 }, { "epoch": 
0.632, "grad_norm": 0.16736850142478943, "learning_rate": 7.822216094333847e-06, "loss": 0.3118, "step": 316 }, { "epoch": 0.634, "grad_norm": 0.17553502321243286, "learning_rate": 7.8091044496691e-06, "loss": 0.3447, "step": 317 }, { "epoch": 0.636, "grad_norm": 0.17292480170726776, "learning_rate": 7.795964517353734e-06, "loss": 0.3152, "step": 318 }, { "epoch": 0.638, "grad_norm": 0.13962873816490173, "learning_rate": 7.782796429706721e-06, "loss": 0.2142, "step": 319 }, { "epoch": 0.64, "grad_norm": 0.19501662254333496, "learning_rate": 7.769600319330553e-06, "loss": 0.3923, "step": 320 }, { "epoch": 0.642, "grad_norm": 0.1338018923997879, "learning_rate": 7.756376319109917e-06, "loss": 0.3381, "step": 321 }, { "epoch": 0.644, "grad_norm": 0.1579694300889969, "learning_rate": 7.743124562210351e-06, "loss": 0.37, "step": 322 }, { "epoch": 0.646, "grad_norm": 0.12136895209550858, "learning_rate": 7.729845182076896e-06, "loss": 0.212, "step": 323 }, { "epoch": 0.648, "grad_norm": 0.2188921570777893, "learning_rate": 7.716538312432767e-06, "loss": 0.3732, "step": 324 }, { "epoch": 0.65, "grad_norm": 0.1570715606212616, "learning_rate": 7.703204087277989e-06, "loss": 0.321, "step": 325 }, { "epoch": 0.652, "grad_norm": 0.19729937613010406, "learning_rate": 7.689842640888063e-06, "loss": 0.3955, "step": 326 }, { "epoch": 0.654, "grad_norm": 0.20023679733276367, "learning_rate": 7.676454107812608e-06, "loss": 0.4399, "step": 327 }, { "epoch": 0.656, "grad_norm": 0.14793503284454346, "learning_rate": 7.663038622873999e-06, "loss": 0.2922, "step": 328 }, { "epoch": 0.658, "grad_norm": 0.16386426985263824, "learning_rate": 7.649596321166024e-06, "loss": 0.3495, "step": 329 }, { "epoch": 0.66, "grad_norm": 0.15845847129821777, "learning_rate": 7.636127338052513e-06, "loss": 0.3607, "step": 330 }, { "epoch": 0.662, "grad_norm": 0.17752616107463837, "learning_rate": 7.622631809165972e-06, "loss": 0.2863, "step": 331 }, { "epoch": 0.664, "grad_norm": 0.2213558405637741, "learning_rate": 7.60910987040623e-06, "loss": 0.4411, "step": 332 }, { "epoch": 0.666, "grad_norm": 0.2018650323152542, "learning_rate": 7.595561657939061e-06, "loss": 0.418, "step": 333 }, { "epoch": 0.668, "grad_norm": 0.20029357075691223, "learning_rate": 7.5819873081948105e-06, "loss": 0.3025, "step": 334 }, { "epoch": 0.67, "grad_norm": 0.1478874832391739, "learning_rate": 7.568386957867033e-06, "loss": 0.2437, "step": 335 }, { "epoch": 0.672, "grad_norm": 0.18909971415996552, "learning_rate": 7.554760743911104e-06, "loss": 0.3974, "step": 336 }, { "epoch": 0.674, "grad_norm": 0.16544924676418304, "learning_rate": 7.541108803542846e-06, "loss": 0.336, "step": 337 }, { "epoch": 0.676, "grad_norm": 0.19204874336719513, "learning_rate": 7.527431274237149e-06, "loss": 0.3617, "step": 338 }, { "epoch": 0.678, "grad_norm": 0.1770397573709488, "learning_rate": 7.5137282937265796e-06, "loss": 0.3617, "step": 339 }, { "epoch": 0.68, "grad_norm": 0.15880927443504333, "learning_rate": 7.500000000000001e-06, "loss": 0.2993, "step": 340 }, { "epoch": 0.682, "grad_norm": 0.4031960368156433, "learning_rate": 7.486246531301178e-06, "loss": 0.3137, "step": 341 }, { "epoch": 0.684, "grad_norm": 0.17426829040050507, "learning_rate": 7.472468026127385e-06, "loss": 0.3712, "step": 342 }, { "epoch": 0.686, "grad_norm": 0.16782499849796295, "learning_rate": 7.45866462322802e-06, "loss": 0.359, "step": 343 }, { "epoch": 0.688, "grad_norm": 0.20207028090953827, "learning_rate": 7.444836461603195e-06, "loss": 0.4301, "step": 344 }, { "epoch": 0.69, 
"grad_norm": 0.18788397312164307, "learning_rate": 7.430983680502344e-06, "loss": 0.3609, "step": 345 }, { "epoch": 0.692, "grad_norm": 0.16447116434574127, "learning_rate": 7.4171064194228196e-06, "loss": 0.3514, "step": 346 }, { "epoch": 0.694, "grad_norm": 0.15939724445343018, "learning_rate": 7.403204818108487e-06, "loss": 0.2747, "step": 347 }, { "epoch": 0.696, "grad_norm": 0.2825759947299957, "learning_rate": 7.3892790165483164e-06, "loss": 0.5376, "step": 348 }, { "epoch": 0.698, "grad_norm": 0.15753747522830963, "learning_rate": 7.3753291549749764e-06, "loss": 0.2741, "step": 349 }, { "epoch": 0.7, "grad_norm": 0.19103243947029114, "learning_rate": 7.361355373863415e-06, "loss": 0.3088, "step": 350 }, { "epoch": 0.702, "grad_norm": 0.18185654282569885, "learning_rate": 7.347357813929455e-06, "loss": 0.3204, "step": 351 }, { "epoch": 0.704, "grad_norm": 0.15075427293777466, "learning_rate": 7.333336616128369e-06, "loss": 0.2885, "step": 352 }, { "epoch": 0.706, "grad_norm": 0.14092062413692474, "learning_rate": 7.319291921653464e-06, "loss": 0.2423, "step": 353 }, { "epoch": 0.708, "grad_norm": 0.11944609135389328, "learning_rate": 7.305223871934657e-06, "loss": 0.1367, "step": 354 }, { "epoch": 0.71, "grad_norm": 0.2248326539993286, "learning_rate": 7.291132608637053e-06, "loss": 0.4119, "step": 355 }, { "epoch": 0.712, "grad_norm": 0.1844269186258316, "learning_rate": 7.2770182736595164e-06, "loss": 0.2714, "step": 356 }, { "epoch": 0.714, "grad_norm": 0.19066232442855835, "learning_rate": 7.262881009133242e-06, "loss": 0.432, "step": 357 }, { "epoch": 0.716, "grad_norm": 0.21767167747020721, "learning_rate": 7.24872095742033e-06, "loss": 0.3804, "step": 358 }, { "epoch": 0.718, "grad_norm": 0.14823076128959656, "learning_rate": 7.234538261112342e-06, "loss": 0.3182, "step": 359 }, { "epoch": 0.72, "grad_norm": 0.1661371886730194, "learning_rate": 7.2203330630288714e-06, "loss": 0.3078, "step": 360 }, { "epoch": 0.722, "grad_norm": 0.18412846326828003, "learning_rate": 7.206105506216107e-06, "loss": 0.4066, "step": 361 }, { "epoch": 0.724, "grad_norm": 0.17892518639564514, "learning_rate": 7.191855733945388e-06, "loss": 0.4772, "step": 362 }, { "epoch": 0.726, "grad_norm": 0.24270282685756683, "learning_rate": 7.177583889711763e-06, "loss": 0.3902, "step": 363 }, { "epoch": 0.728, "grad_norm": 0.187135249376297, "learning_rate": 7.163290117232542e-06, "loss": 0.3154, "step": 364 }, { "epoch": 0.73, "grad_norm": 0.20502962172031403, "learning_rate": 7.148974560445859e-06, "loss": 0.3599, "step": 365 }, { "epoch": 0.732, "grad_norm": 0.1704569160938263, "learning_rate": 7.1346373635092095e-06, "loss": 0.3705, "step": 366 }, { "epoch": 0.734, "grad_norm": 0.20562830567359924, "learning_rate": 7.12027867079801e-06, "loss": 0.3169, "step": 367 }, { "epoch": 0.736, "grad_norm": 0.19051577150821686, "learning_rate": 7.105898626904134e-06, "loss": 0.4571, "step": 368 }, { "epoch": 0.738, "grad_norm": 0.18842366337776184, "learning_rate": 7.0914973766344645e-06, "loss": 0.2771, "step": 369 }, { "epoch": 0.74, "grad_norm": 0.14864154160022736, "learning_rate": 7.0770750650094335e-06, "loss": 0.2184, "step": 370 }, { "epoch": 0.742, "grad_norm": 0.1662212610244751, "learning_rate": 7.062631837261556e-06, "loss": 0.2706, "step": 371 }, { "epoch": 0.744, "grad_norm": 0.15230734646320343, "learning_rate": 7.048167838833977e-06, "loss": 0.2611, "step": 372 }, { "epoch": 0.746, "grad_norm": 0.16176356375217438, "learning_rate": 7.033683215379002e-06, "loss": 0.3144, "step": 373 }, { "epoch": 
0.748, "grad_norm": 0.16796669363975525, "learning_rate": 7.019178112756625e-06, "loss": 0.3742, "step": 374 }, { "epoch": 0.75, "grad_norm": 0.16455894708633423, "learning_rate": 7.004652677033069e-06, "loss": 0.2426, "step": 375 }, { "epoch": 0.75, "eval_loss": 0.2979236841201782, "eval_runtime": 76.5795, "eval_samples_per_second": 7.208, "eval_steps_per_second": 0.901, "step": 375 }, { "epoch": 0.752, "grad_norm": 0.22792088985443115, "learning_rate": 6.990107054479313e-06, "loss": 0.319, "step": 376 }, { "epoch": 0.754, "grad_norm": 0.24258168041706085, "learning_rate": 6.9755413915696105e-06, "loss": 0.5036, "step": 377 }, { "epoch": 0.756, "grad_norm": 0.17646639049053192, "learning_rate": 6.960955834980028e-06, "loss": 0.3024, "step": 378 }, { "epoch": 0.758, "grad_norm": 0.15006083250045776, "learning_rate": 6.946350531586959e-06, "loss": 0.2702, "step": 379 }, { "epoch": 0.76, "grad_norm": 0.15430916845798492, "learning_rate": 6.931725628465643e-06, "loss": 0.2492, "step": 380 }, { "epoch": 0.762, "grad_norm": 0.13274860382080078, "learning_rate": 6.917081272888697e-06, "loss": 0.2188, "step": 381 }, { "epoch": 0.764, "grad_norm": 0.12552917003631592, "learning_rate": 6.902417612324615e-06, "loss": 0.2275, "step": 382 }, { "epoch": 0.766, "grad_norm": 0.14306232333183289, "learning_rate": 6.887734794436301e-06, "loss": 0.3204, "step": 383 }, { "epoch": 0.768, "grad_norm": 0.18567156791687012, "learning_rate": 6.873032967079562e-06, "loss": 0.4079, "step": 384 }, { "epoch": 0.77, "grad_norm": 0.18761208653450012, "learning_rate": 6.858312278301638e-06, "loss": 0.2944, "step": 385 }, { "epoch": 0.772, "grad_norm": 0.18265055119991302, "learning_rate": 6.8435728763397045e-06, "loss": 0.4399, "step": 386 }, { "epoch": 0.774, "grad_norm": 0.18840709328651428, "learning_rate": 6.828814909619374e-06, "loss": 0.4057, "step": 387 }, { "epoch": 0.776, "grad_norm": 0.19235002994537354, "learning_rate": 6.814038526753205e-06, "loss": 0.2826, "step": 388 }, { "epoch": 0.778, "grad_norm": 0.1880473792552948, "learning_rate": 6.799243876539213e-06, "loss": 0.3739, "step": 389 }, { "epoch": 0.78, "grad_norm": 0.29550889134407043, "learning_rate": 6.78443110795936e-06, "loss": 0.3594, "step": 390 }, { "epoch": 0.782, "grad_norm": 0.19335615634918213, "learning_rate": 6.76960037017806e-06, "loss": 0.4026, "step": 391 }, { "epoch": 0.784, "grad_norm": 0.14000019431114197, "learning_rate": 6.75475181254068e-06, "loss": 0.2576, "step": 392 }, { "epoch": 0.786, "grad_norm": 0.15106743574142456, "learning_rate": 6.739885584572026e-06, "loss": 0.2538, "step": 393 }, { "epoch": 0.788, "grad_norm": 0.19910076260566711, "learning_rate": 6.725001835974854e-06, "loss": 0.2867, "step": 394 }, { "epoch": 0.79, "grad_norm": 0.22941169142723083, "learning_rate": 6.710100716628345e-06, "loss": 0.3183, "step": 395 }, { "epoch": 0.792, "grad_norm": 0.1540730744600296, "learning_rate": 6.695182376586603e-06, "loss": 0.31, "step": 396 }, { "epoch": 0.794, "grad_norm": 0.18420648574829102, "learning_rate": 6.680246966077151e-06, "loss": 0.388, "step": 397 }, { "epoch": 0.796, "grad_norm": 0.14336371421813965, "learning_rate": 6.665294635499404e-06, "loss": 0.3359, "step": 398 }, { "epoch": 0.798, "grad_norm": 0.21092049777507782, "learning_rate": 6.650325535423166e-06, "loss": 0.2935, "step": 399 }, { "epoch": 0.8, "grad_norm": 0.23870034515857697, "learning_rate": 6.635339816587109e-06, "loss": 0.3413, "step": 400 }, { "epoch": 0.802, "grad_norm": 0.21548299491405487, "learning_rate": 6.6203376298972535e-06, "loss": 
0.4255, "step": 401 }, { "epoch": 0.804, "grad_norm": 0.21555306017398834, "learning_rate": 6.605319126425455e-06, "loss": 0.4044, "step": 402 }, { "epoch": 0.806, "grad_norm": 0.212354838848114, "learning_rate": 6.590284457407876e-06, "loss": 0.3225, "step": 403 }, { "epoch": 0.808, "grad_norm": 0.17822064459323883, "learning_rate": 6.5752337742434644e-06, "loss": 0.3449, "step": 404 }, { "epoch": 0.81, "grad_norm": 0.15272925794124603, "learning_rate": 6.560167228492436e-06, "loss": 0.2732, "step": 405 }, { "epoch": 0.812, "grad_norm": 0.18225990235805511, "learning_rate": 6.545084971874738e-06, "loss": 0.3326, "step": 406 }, { "epoch": 0.814, "grad_norm": 0.1854051798582077, "learning_rate": 6.529987156268527e-06, "loss": 0.3603, "step": 407 }, { "epoch": 0.816, "grad_norm": 0.17678527534008026, "learning_rate": 6.514873933708637e-06, "loss": 0.2996, "step": 408 }, { "epoch": 0.818, "grad_norm": 0.35500454902648926, "learning_rate": 6.499745456385054e-06, "loss": 0.4185, "step": 409 }, { "epoch": 0.82, "grad_norm": 0.18555931746959686, "learning_rate": 6.484601876641375e-06, "loss": 0.2208, "step": 410 }, { "epoch": 0.822, "grad_norm": 0.16834326088428497, "learning_rate": 6.469443346973281e-06, "loss": 0.3684, "step": 411 }, { "epoch": 0.824, "grad_norm": 0.1469370424747467, "learning_rate": 6.454270020026996e-06, "loss": 0.2526, "step": 412 }, { "epoch": 0.826, "grad_norm": 0.19754226505756378, "learning_rate": 6.439082048597755e-06, "loss": 0.3341, "step": 413 }, { "epoch": 0.828, "grad_norm": 0.15154729783535004, "learning_rate": 6.423879585628262e-06, "loss": 0.2402, "step": 414 }, { "epoch": 0.83, "grad_norm": 0.20265011489391327, "learning_rate": 6.408662784207149e-06, "loss": 0.374, "step": 415 }, { "epoch": 0.832, "grad_norm": 0.2674030065536499, "learning_rate": 6.39343179756744e-06, "loss": 0.3057, "step": 416 }, { "epoch": 0.834, "grad_norm": 0.1473691463470459, "learning_rate": 6.378186779084996e-06, "loss": 0.3684, "step": 417 }, { "epoch": 0.836, "grad_norm": 0.2826951742172241, "learning_rate": 6.362927882276991e-06, "loss": 0.2585, "step": 418 }, { "epoch": 0.838, "grad_norm": 0.20093302428722382, "learning_rate": 6.34765526080034e-06, "loss": 0.3041, "step": 419 }, { "epoch": 0.84, "grad_norm": 0.1346312314271927, "learning_rate": 6.332369068450175e-06, "loss": 0.2105, "step": 420 }, { "epoch": 0.842, "grad_norm": 0.16400040686130524, "learning_rate": 6.317069459158284e-06, "loss": 0.2832, "step": 421 }, { "epoch": 0.844, "grad_norm": 0.19443334639072418, "learning_rate": 6.301756586991561e-06, "loss": 0.3353, "step": 422 }, { "epoch": 0.846, "grad_norm": 0.22223643958568573, "learning_rate": 6.286430606150458e-06, "loss": 0.384, "step": 423 }, { "epoch": 0.848, "grad_norm": 0.16762332618236542, "learning_rate": 6.271091670967437e-06, "loss": 0.3826, "step": 424 }, { "epoch": 0.85, "grad_norm": 0.26455458998680115, "learning_rate": 6.255739935905396e-06, "loss": 0.4419, "step": 425 }, { "epoch": 0.852, "grad_norm": 0.1570374071598053, "learning_rate": 6.240375555556145e-06, "loss": 0.2199, "step": 426 }, { "epoch": 0.854, "grad_norm": 0.16800148785114288, "learning_rate": 6.22499868463882e-06, "loss": 0.2561, "step": 427 }, { "epoch": 0.856, "grad_norm": 0.17082828283309937, "learning_rate": 6.209609477998339e-06, "loss": 0.3317, "step": 428 }, { "epoch": 0.858, "grad_norm": 0.26214951276779175, "learning_rate": 6.194208090603845e-06, "loss": 0.4105, "step": 429 }, { "epoch": 0.86, "grad_norm": 0.17318500578403473, "learning_rate": 6.178794677547138e-06, "loss": 
0.2216, "step": 430 }, { "epoch": 0.862, "grad_norm": 0.18394838273525238, "learning_rate": 6.163369394041112e-06, "loss": 0.3251, "step": 431 }, { "epoch": 0.864, "grad_norm": 0.2352125197649002, "learning_rate": 6.1479323954182055e-06, "loss": 0.349, "step": 432 }, { "epoch": 0.866, "grad_norm": 0.18627074360847473, "learning_rate": 6.132483837128823e-06, "loss": 0.3048, "step": 433 }, { "epoch": 0.868, "grad_norm": 0.2253945916891098, "learning_rate": 6.1170238747397715e-06, "loss": 0.3081, "step": 434 }, { "epoch": 0.87, "grad_norm": 0.1479015201330185, "learning_rate": 6.101552663932704e-06, "loss": 0.192, "step": 435 }, { "epoch": 0.872, "grad_norm": 0.1954430192708969, "learning_rate": 6.08607036050254e-06, "loss": 0.2251, "step": 436 }, { "epoch": 0.874, "grad_norm": 0.16169880330562592, "learning_rate": 6.070577120355903e-06, "loss": 0.2765, "step": 437 }, { "epoch": 0.876, "grad_norm": 0.19537843763828278, "learning_rate": 6.055073099509549e-06, "loss": 0.2724, "step": 438 }, { "epoch": 0.878, "grad_norm": 0.1675713211297989, "learning_rate": 6.039558454088796e-06, "loss": 0.3164, "step": 439 }, { "epoch": 0.88, "grad_norm": 0.27977389097213745, "learning_rate": 6.024033340325954e-06, "loss": 0.4432, "step": 440 }, { "epoch": 0.882, "grad_norm": 0.1879289448261261, "learning_rate": 6.0084979145587444e-06, "loss": 0.3558, "step": 441 }, { "epoch": 0.884, "grad_norm": 0.16285355389118195, "learning_rate": 5.9929523332287275e-06, "loss": 0.3014, "step": 442 }, { "epoch": 0.886, "grad_norm": 0.2135494202375412, "learning_rate": 5.977396752879742e-06, "loss": 0.3124, "step": 443 }, { "epoch": 0.888, "grad_norm": 0.21992646157741547, "learning_rate": 5.961831330156306e-06, "loss": 0.3152, "step": 444 }, { "epoch": 0.89, "grad_norm": 0.34824761748313904, "learning_rate": 5.946256221802052e-06, "loss": 0.4022, "step": 445 }, { "epoch": 0.892, "grad_norm": 0.3176579177379608, "learning_rate": 5.930671584658151e-06, "loss": 0.3373, "step": 446 }, { "epoch": 0.894, "grad_norm": 0.13881681859493256, "learning_rate": 5.915077575661723e-06, "loss": 0.2732, "step": 447 }, { "epoch": 0.896, "grad_norm": 0.23585429787635803, "learning_rate": 5.89947435184427e-06, "loss": 0.383, "step": 448 }, { "epoch": 0.898, "grad_norm": 0.20338225364685059, "learning_rate": 5.883862070330079e-06, "loss": 0.3929, "step": 449 }, { "epoch": 0.9, "grad_norm": 0.5738399028778076, "learning_rate": 5.8682408883346535e-06, "loss": 0.3834, "step": 450 }, { "epoch": 0.902, "grad_norm": 0.16114148497581482, "learning_rate": 5.85261096316312e-06, "loss": 0.2351, "step": 451 }, { "epoch": 0.904, "grad_norm": 0.16090261936187744, "learning_rate": 5.8369724522086545e-06, "loss": 0.2264, "step": 452 }, { "epoch": 0.906, "grad_norm": 0.1992426961660385, "learning_rate": 5.821325512950886e-06, "loss": 0.3239, "step": 453 }, { "epoch": 0.908, "grad_norm": 0.1780838966369629, "learning_rate": 5.805670302954322e-06, "loss": 0.2997, "step": 454 }, { "epoch": 0.91, "grad_norm": 0.24148645997047424, "learning_rate": 5.79000697986675e-06, "loss": 0.3701, "step": 455 }, { "epoch": 0.912, "grad_norm": 0.1544380933046341, "learning_rate": 5.774335701417662e-06, "loss": 0.1843, "step": 456 }, { "epoch": 0.914, "grad_norm": 0.20772896707057953, "learning_rate": 5.758656625416659e-06, "loss": 0.3617, "step": 457 }, { "epoch": 0.916, "grad_norm": 0.2054608017206192, "learning_rate": 5.7429699097518585e-06, "loss": 0.3286, "step": 458 }, { "epoch": 0.918, "grad_norm": 0.1513553261756897, "learning_rate": 5.727275712388318e-06, "loss": 0.2149, 
"step": 459 }, { "epoch": 0.92, "grad_norm": 0.20221109688282013, "learning_rate": 5.711574191366427e-06, "loss": 0.2895, "step": 460 }, { "epoch": 0.922, "grad_norm": 0.26075002551078796, "learning_rate": 5.695865504800328e-06, "loss": 0.3115, "step": 461 }, { "epoch": 0.924, "grad_norm": 0.2223353236913681, "learning_rate": 5.680149810876322e-06, "loss": 0.3065, "step": 462 }, { "epoch": 0.926, "grad_norm": 0.18663600087165833, "learning_rate": 5.664427267851271e-06, "loss": 0.2444, "step": 463 }, { "epoch": 0.928, "grad_norm": 0.19538210332393646, "learning_rate": 5.648698034051009e-06, "loss": 0.3877, "step": 464 }, { "epoch": 0.93, "grad_norm": 0.1691403090953827, "learning_rate": 5.632962267868747e-06, "loss": 0.2445, "step": 465 }, { "epoch": 0.932, "grad_norm": 0.1581772416830063, "learning_rate": 5.617220127763474e-06, "loss": 0.3217, "step": 466 }, { "epoch": 0.934, "grad_norm": 0.20001822710037231, "learning_rate": 5.601471772258368e-06, "loss": 0.3184, "step": 467 }, { "epoch": 0.936, "grad_norm": 0.3052047789096832, "learning_rate": 5.585717359939192e-06, "loss": 0.3479, "step": 468 }, { "epoch": 0.938, "grad_norm": 0.23681974411010742, "learning_rate": 5.569957049452703e-06, "loss": 0.3403, "step": 469 }, { "epoch": 0.94, "grad_norm": 0.12364782392978668, "learning_rate": 5.5541909995050554e-06, "loss": 0.2085, "step": 470 }, { "epoch": 0.942, "grad_norm": 0.1526976227760315, "learning_rate": 5.538419368860196e-06, "loss": 0.2281, "step": 471 }, { "epoch": 0.944, "grad_norm": 0.2230585813522339, "learning_rate": 5.522642316338268e-06, "loss": 0.3351, "step": 472 }, { "epoch": 0.946, "grad_norm": 0.17690080404281616, "learning_rate": 5.506860000814017e-06, "loss": 0.2985, "step": 473 }, { "epoch": 0.948, "grad_norm": 0.1738656908273697, "learning_rate": 5.491072581215186e-06, "loss": 0.247, "step": 474 }, { "epoch": 0.95, "grad_norm": 0.18501204252243042, "learning_rate": 5.475280216520913e-06, "loss": 0.2646, "step": 475 }, { "epoch": 0.952, "grad_norm": 0.19721092283725739, "learning_rate": 5.459483065760138e-06, "loss": 0.2876, "step": 476 }, { "epoch": 0.954, "grad_norm": 0.16680027544498444, "learning_rate": 5.443681288009991e-06, "loss": 0.2167, "step": 477 }, { "epoch": 0.956, "grad_norm": 0.17918136715888977, "learning_rate": 5.4278750423942e-06, "loss": 0.3997, "step": 478 }, { "epoch": 0.958, "grad_norm": 0.15725551545619965, "learning_rate": 5.412064488081482e-06, "loss": 0.2829, "step": 479 }, { "epoch": 0.96, "grad_norm": 0.19459596276283264, "learning_rate": 5.396249784283943e-06, "loss": 0.3373, "step": 480 }, { "epoch": 0.962, "grad_norm": 0.32756415009498596, "learning_rate": 5.380431090255475e-06, "loss": 0.4206, "step": 481 }, { "epoch": 0.964, "grad_norm": 0.19843968749046326, "learning_rate": 5.364608565290154e-06, "loss": 0.3385, "step": 482 }, { "epoch": 0.966, "grad_norm": 0.15863648056983948, "learning_rate": 5.348782368720627e-06, "loss": 0.2524, "step": 483 }, { "epoch": 0.968, "grad_norm": 0.21220897138118744, "learning_rate": 5.33295265991652e-06, "loss": 0.2326, "step": 484 }, { "epoch": 0.97, "grad_norm": 0.24547149240970612, "learning_rate": 5.317119598282823e-06, "loss": 0.3854, "step": 485 }, { "epoch": 0.972, "grad_norm": 0.2009747326374054, "learning_rate": 5.301283343258293e-06, "loss": 0.3141, "step": 486 }, { "epoch": 0.974, "grad_norm": 0.22629286348819733, "learning_rate": 5.285444054313841e-06, "loss": 0.3044, "step": 487 }, { "epoch": 0.976, "grad_norm": 0.18528909981250763, "learning_rate": 5.26960189095093e-06, "loss": 0.3056, 
"step": 488 }, { "epoch": 0.978, "grad_norm": 0.18446871638298035, "learning_rate": 5.253757012699972e-06, "loss": 0.3206, "step": 489 }, { "epoch": 0.98, "grad_norm": 0.1961178332567215, "learning_rate": 5.237909579118713e-06, "loss": 0.386, "step": 490 }, { "epoch": 0.982, "grad_norm": 0.20445547997951508, "learning_rate": 5.2220597497906315e-06, "loss": 0.3997, "step": 491 }, { "epoch": 0.984, "grad_norm": 0.17709751427173615, "learning_rate": 5.206207684323337e-06, "loss": 0.3212, "step": 492 }, { "epoch": 0.986, "grad_norm": 0.15768595039844513, "learning_rate": 5.190353542346951e-06, "loss": 0.2752, "step": 493 }, { "epoch": 0.988, "grad_norm": 0.14925841987133026, "learning_rate": 5.174497483512506e-06, "loss": 0.2593, "step": 494 }, { "epoch": 0.99, "grad_norm": 0.2051381766796112, "learning_rate": 5.15863966749034e-06, "loss": 0.3941, "step": 495 }, { "epoch": 0.992, "grad_norm": 0.2395932674407959, "learning_rate": 5.142780253968481e-06, "loss": 0.3136, "step": 496 }, { "epoch": 0.994, "grad_norm": 0.2152215540409088, "learning_rate": 5.126919402651053e-06, "loss": 0.3083, "step": 497 }, { "epoch": 0.996, "grad_norm": 0.17021948099136353, "learning_rate": 5.111057273256648e-06, "loss": 0.3185, "step": 498 }, { "epoch": 0.998, "grad_norm": 0.22681966423988342, "learning_rate": 5.095194025516733e-06, "loss": 0.4107, "step": 499 }, { "epoch": 1.0, "grad_norm": 0.22234933078289032, "learning_rate": 5.07932981917404e-06, "loss": 0.3672, "step": 500 }, { "epoch": 1.0, "eval_loss": 0.27911150455474854, "eval_runtime": 76.7158, "eval_samples_per_second": 7.195, "eval_steps_per_second": 0.899, "step": 500 }, { "epoch": 1.002, "grad_norm": 0.18890836834907532, "learning_rate": 5.063464813980948e-06, "loss": 0.2277, "step": 501 }, { "epoch": 1.004, "grad_norm": 0.19094686210155487, "learning_rate": 5.0475991696978844e-06, "loss": 0.3602, "step": 502 }, { "epoch": 1.006, "grad_norm": 0.24123992025852203, "learning_rate": 5.03173304609171e-06, "loss": 0.2796, "step": 503 }, { "epoch": 1.008, "grad_norm": 0.2091682106256485, "learning_rate": 5.015866602934112e-06, "loss": 0.333, "step": 504 }, { "epoch": 1.01, "grad_norm": 0.21148917078971863, "learning_rate": 5e-06, "loss": 0.4005, "step": 505 }, { "epoch": 1.012, "grad_norm": 0.14547854661941528, "learning_rate": 4.984133397065889e-06, "loss": 0.2223, "step": 506 }, { "epoch": 1.014, "grad_norm": 0.23349957168102264, "learning_rate": 4.9682669539082914e-06, "loss": 0.3264, "step": 507 }, { "epoch": 1.016, "grad_norm": 0.16822971403598785, "learning_rate": 4.952400830302117e-06, "loss": 0.3151, "step": 508 }, { "epoch": 1.018, "grad_norm": 0.1795063018798828, "learning_rate": 4.936535186019053e-06, "loss": 0.2896, "step": 509 }, { "epoch": 1.02, "grad_norm": 0.19863282144069672, "learning_rate": 4.9206701808259605e-06, "loss": 0.2481, "step": 510 }, { "epoch": 1.022, "grad_norm": 0.18788766860961914, "learning_rate": 4.904805974483267e-06, "loss": 0.3513, "step": 511 }, { "epoch": 1.024, "grad_norm": 0.1949293315410614, "learning_rate": 4.888942726743353e-06, "loss": 0.2264, "step": 512 }, { "epoch": 1.002, "grad_norm": 0.16474653780460358, "learning_rate": 4.873080597348948e-06, "loss": 0.2793, "step": 513 }, { "epoch": 1.004, "grad_norm": 0.20230461657047272, "learning_rate": 4.85721974603152e-06, "loss": 0.3618, "step": 514 }, { "epoch": 1.006, "grad_norm": 0.16907107830047607, "learning_rate": 4.841360332509663e-06, "loss": 0.2708, "step": 515 }, { "epoch": 1.008, "grad_norm": 0.22199520468711853, "learning_rate": 4.825502516487497e-06, 
"loss": 0.3405, "step": 516 }, { "epoch": 1.01, "grad_norm": 0.17370116710662842, "learning_rate": 4.809646457653051e-06, "loss": 0.2715, "step": 517 }, { "epoch": 1.012, "grad_norm": 0.21842899918556213, "learning_rate": 4.793792315676665e-06, "loss": 0.1802, "step": 518 }, { "epoch": 1.014, "grad_norm": 0.1792248785495758, "learning_rate": 4.777940250209369e-06, "loss": 0.1912, "step": 519 }, { "epoch": 1.016, "grad_norm": 0.24431253969669342, "learning_rate": 4.762090420881289e-06, "loss": 0.3494, "step": 520 }, { "epoch": 1.018, "grad_norm": 0.1893794983625412, "learning_rate": 4.74624298730003e-06, "loss": 0.246, "step": 521 }, { "epoch": 1.02, "grad_norm": 0.29100745916366577, "learning_rate": 4.7303981090490715e-06, "loss": 0.4553, "step": 522 }, { "epoch": 1.022, "grad_norm": 0.21313871443271637, "learning_rate": 4.71455594568616e-06, "loss": 0.3414, "step": 523 }, { "epoch": 1.024, "grad_norm": 0.257988840341568, "learning_rate": 4.6987166567417085e-06, "loss": 0.3223, "step": 524 }, { "epoch": 1.026, "grad_norm": 0.1500207781791687, "learning_rate": 4.682880401717178e-06, "loss": 0.2883, "step": 525 }, { "epoch": 1.028, "grad_norm": 0.2195630818605423, "learning_rate": 4.667047340083481e-06, "loss": 0.4185, "step": 526 }, { "epoch": 1.03, "grad_norm": 0.24663732945919037, "learning_rate": 4.651217631279374e-06, "loss": 0.312, "step": 527 }, { "epoch": 1.032, "grad_norm": 0.23168163001537323, "learning_rate": 4.635391434709847e-06, "loss": 0.3826, "step": 528 }, { "epoch": 1.034, "grad_norm": 0.20334544777870178, "learning_rate": 4.619568909744524e-06, "loss": 0.302, "step": 529 }, { "epoch": 1.036, "grad_norm": 0.2471403032541275, "learning_rate": 4.603750215716057e-06, "loss": 0.3024, "step": 530 }, { "epoch": 1.038, "grad_norm": 0.19385652244091034, "learning_rate": 4.587935511918521e-06, "loss": 0.2803, "step": 531 }, { "epoch": 1.04, "grad_norm": 0.24697639048099518, "learning_rate": 4.572124957605803e-06, "loss": 0.4114, "step": 532 }, { "epoch": 1.042, "grad_norm": 0.24823316931724548, "learning_rate": 4.55631871199001e-06, "loss": 0.3705, "step": 533 }, { "epoch": 1.044, "grad_norm": 0.1970013827085495, "learning_rate": 4.5405169342398634e-06, "loss": 0.3608, "step": 534 }, { "epoch": 1.046, "grad_norm": 0.20955346524715424, "learning_rate": 4.524719783479088e-06, "loss": 0.347, "step": 535 }, { "epoch": 1.048, "grad_norm": 0.1911235898733139, "learning_rate": 4.5089274187848144e-06, "loss": 0.2342, "step": 536 }, { "epoch": 1.05, "grad_norm": 0.22940923273563385, "learning_rate": 4.493139999185984e-06, "loss": 0.2803, "step": 537 }, { "epoch": 1.052, "grad_norm": 0.24347023665905, "learning_rate": 4.477357683661734e-06, "loss": 0.3833, "step": 538 }, { "epoch": 1.054, "grad_norm": 0.24687382578849792, "learning_rate": 4.461580631139806e-06, "loss": 0.3467, "step": 539 }, { "epoch": 1.056, "grad_norm": 0.15779221057891846, "learning_rate": 4.445809000494945e-06, "loss": 0.2781, "step": 540 }, { "epoch": 1.058, "grad_norm": 0.20665578544139862, "learning_rate": 4.430042950547298e-06, "loss": 0.4656, "step": 541 }, { "epoch": 1.06, "grad_norm": 0.24457348883152008, "learning_rate": 4.414282640060809e-06, "loss": 0.2684, "step": 542 }, { "epoch": 1.062, "grad_norm": 0.20804962515830994, "learning_rate": 4.398528227741634e-06, "loss": 0.3577, "step": 543 }, { "epoch": 1.064, "grad_norm": 0.2586953043937683, "learning_rate": 4.382779872236527e-06, "loss": 0.3492, "step": 544 }, { "epoch": 1.066, "grad_norm": 0.26488688588142395, "learning_rate": 4.367037732131254e-06, "loss": 
0.3954, "step": 545 }, { "epoch": 1.068, "grad_norm": 0.15630888938903809, "learning_rate": 4.3513019659489916e-06, "loss": 0.1673, "step": 546 }, { "epoch": 1.07, "grad_norm": 0.15465758740901947, "learning_rate": 4.33557273214873e-06, "loss": 0.2532, "step": 547 }, { "epoch": 1.072, "grad_norm": 0.25680503249168396, "learning_rate": 4.319850189123681e-06, "loss": 0.3065, "step": 548 }, { "epoch": 1.074, "grad_norm": 0.24224849045276642, "learning_rate": 4.304134495199675e-06, "loss": 0.4157, "step": 549 }, { "epoch": 1.076, "grad_norm": 0.1849289834499359, "learning_rate": 4.2884258086335755e-06, "loss": 0.3611, "step": 550 }, { "epoch": 1.078, "grad_norm": 0.2488396316766739, "learning_rate": 4.272724287611684e-06, "loss": 0.313, "step": 551 }, { "epoch": 1.08, "grad_norm": 0.23535999655723572, "learning_rate": 4.257030090248142e-06, "loss": 0.3165, "step": 552 }, { "epoch": 1.082, "grad_norm": 0.19105635583400726, "learning_rate": 4.241343374583343e-06, "loss": 0.2779, "step": 553 }, { "epoch": 1.084, "grad_norm": 0.22108493745326996, "learning_rate": 4.225664298582339e-06, "loss": 0.3312, "step": 554 }, { "epoch": 1.086, "grad_norm": 0.18127895891666412, "learning_rate": 4.209993020133251e-06, "loss": 0.2099, "step": 555 }, { "epoch": 1.088, "grad_norm": 0.304030179977417, "learning_rate": 4.194329697045681e-06, "loss": 0.4397, "step": 556 }, { "epoch": 1.09, "grad_norm": 0.16876006126403809, "learning_rate": 4.178674487049116e-06, "loss": 0.253, "step": 557 }, { "epoch": 1.092, "grad_norm": 0.18693579733371735, "learning_rate": 4.163027547791347e-06, "loss": 0.2696, "step": 558 }, { "epoch": 1.094, "grad_norm": 0.2209119349718094, "learning_rate": 4.147389036836881e-06, "loss": 0.2225, "step": 559 }, { "epoch": 1.096, "grad_norm": 0.1712501347064972, "learning_rate": 4.131759111665349e-06, "loss": 0.2205, "step": 560 }, { "epoch": 1.098, "grad_norm": 0.18427731096744537, "learning_rate": 4.116137929669921e-06, "loss": 0.2527, "step": 561 }, { "epoch": 1.1, "grad_norm": 0.16298742592334747, "learning_rate": 4.100525648155731e-06, "loss": 0.2583, "step": 562 }, { "epoch": 1.102, "grad_norm": 0.1921571046113968, "learning_rate": 4.084922424338277e-06, "loss": 0.2931, "step": 563 }, { "epoch": 1.104, "grad_norm": 0.1696956604719162, "learning_rate": 4.06932841534185e-06, "loss": 0.2686, "step": 564 }, { "epoch": 1.106, "grad_norm": 0.2463129460811615, "learning_rate": 4.053743778197951e-06, "loss": 0.301, "step": 565 }, { "epoch": 1.108, "grad_norm": 0.15761299431324005, "learning_rate": 4.038168669843698e-06, "loss": 0.1756, "step": 566 }, { "epoch": 1.11, "grad_norm": 0.1688557118177414, "learning_rate": 4.02260324712026e-06, "loss": 0.2969, "step": 567 }, { "epoch": 1.112, "grad_norm": 0.21805354952812195, "learning_rate": 4.007047666771274e-06, "loss": 0.2739, "step": 568 }, { "epoch": 1.114, "grad_norm": 0.17749401926994324, "learning_rate": 3.991502085441259e-06, "loss": 0.2698, "step": 569 }, { "epoch": 1.116, "grad_norm": 0.2537892758846283, "learning_rate": 3.975966659674048e-06, "loss": 0.4131, "step": 570 }, { "epoch": 1.1179999999999999, "grad_norm": 0.15672741830348969, "learning_rate": 3.960441545911205e-06, "loss": 0.2118, "step": 571 }, { "epoch": 1.12, "grad_norm": 0.23960451781749725, "learning_rate": 3.944926900490452e-06, "loss": 0.2715, "step": 572 }, { "epoch": 1.1219999999999999, "grad_norm": 0.17803031206130981, "learning_rate": 3.929422879644099e-06, "loss": 0.24, "step": 573 }, { "epoch": 1.124, "grad_norm": 0.2676704525947571, "learning_rate": 
3.913929639497462e-06, "loss": 0.3247, "step": 574 }, { "epoch": 1.126, "grad_norm": 0.1522570550441742, "learning_rate": 3.898447336067297e-06, "loss": 0.2298, "step": 575 }, { "epoch": 1.1280000000000001, "grad_norm": 0.23372875154018402, "learning_rate": 3.882976125260229e-06, "loss": 0.4375, "step": 576 }, { "epoch": 1.13, "grad_norm": 0.3442481756210327, "learning_rate": 3.867516162871177e-06, "loss": 0.2883, "step": 577 }, { "epoch": 1.1320000000000001, "grad_norm": 0.2335498332977295, "learning_rate": 3.8520676045817945e-06, "loss": 0.2602, "step": 578 }, { "epoch": 1.134, "grad_norm": 0.29386457800865173, "learning_rate": 3.8366306059588885e-06, "loss": 0.3826, "step": 579 }, { "epoch": 1.1360000000000001, "grad_norm": 0.18141314387321472, "learning_rate": 3.821205322452863e-06, "loss": 0.205, "step": 580 }, { "epoch": 1.138, "grad_norm": 0.21235667169094086, "learning_rate": 3.8057919093961554e-06, "loss": 0.2511, "step": 581 }, { "epoch": 1.1400000000000001, "grad_norm": 0.15281343460083008, "learning_rate": 3.790390522001662e-06, "loss": 0.1908, "step": 582 }, { "epoch": 1.142, "grad_norm": 0.1883106231689453, "learning_rate": 3.775001315361183e-06, "loss": 0.2896, "step": 583 }, { "epoch": 1.144, "grad_norm": 0.19878095388412476, "learning_rate": 3.7596244444438577e-06, "loss": 0.2847, "step": 584 }, { "epoch": 1.146, "grad_norm": 0.18822817504405975, "learning_rate": 3.7442600640946045e-06, "loss": 0.3134, "step": 585 }, { "epoch": 1.148, "grad_norm": 0.21552503108978271, "learning_rate": 3.7289083290325668e-06, "loss": 0.3323, "step": 586 }, { "epoch": 1.15, "grad_norm": 0.25933748483657837, "learning_rate": 3.7135693938495433e-06, "loss": 0.3463, "step": 587 }, { "epoch": 1.152, "grad_norm": 0.23867465555667877, "learning_rate": 3.69824341300844e-06, "loss": 0.3601, "step": 588 }, { "epoch": 1.154, "grad_norm": 0.3167083263397217, "learning_rate": 3.682930540841717e-06, "loss": 0.4182, "step": 589 }, { "epoch": 1.156, "grad_norm": 0.31397873163223267, "learning_rate": 3.667630931549826e-06, "loss": 0.3287, "step": 590 }, { "epoch": 1.158, "grad_norm": 0.18764562904834747, "learning_rate": 3.6523447391996613e-06, "loss": 0.276, "step": 591 }, { "epoch": 1.16, "grad_norm": 0.29411885142326355, "learning_rate": 3.637072117723012e-06, "loss": 0.3956, "step": 592 }, { "epoch": 1.162, "grad_norm": 0.19027218222618103, "learning_rate": 3.6218132209150047e-06, "loss": 0.2753, "step": 593 }, { "epoch": 1.164, "grad_norm": 0.20175009965896606, "learning_rate": 3.606568202432562e-06, "loss": 0.3459, "step": 594 }, { "epoch": 1.166, "grad_norm": 0.2005695253610611, "learning_rate": 3.5913372157928515e-06, "loss": 0.2125, "step": 595 }, { "epoch": 1.168, "grad_norm": 0.22972247004508972, "learning_rate": 3.5761204143717387e-06, "loss": 0.2925, "step": 596 }, { "epoch": 1.17, "grad_norm": 0.22252865135669708, "learning_rate": 3.560917951402245e-06, "loss": 0.3467, "step": 597 }, { "epoch": 1.172, "grad_norm": 0.2404780089855194, "learning_rate": 3.5457299799730047e-06, "loss": 0.3268, "step": 598 }, { "epoch": 1.174, "grad_norm": 0.24187296628952026, "learning_rate": 3.5305566530267217e-06, "loss": 0.3654, "step": 599 }, { "epoch": 1.176, "grad_norm": 0.23365625739097595, "learning_rate": 3.5153981233586277e-06, "loss": 0.3168, "step": 600 }, { "epoch": 1.178, "grad_norm": 0.20350268483161926, "learning_rate": 3.5002545436149478e-06, "loss": 0.2618, "step": 601 }, { "epoch": 1.18, "grad_norm": 0.22084195911884308, "learning_rate": 3.4851260662913643e-06, "loss": 0.381, "step": 602 }, { 
"epoch": 1.182, "grad_norm": 0.5043354630470276, "learning_rate": 3.470012843731476e-06, "loss": 0.426, "step": 603 }, { "epoch": 1.184, "grad_norm": 0.23615571856498718, "learning_rate": 3.4549150281252635e-06, "loss": 0.3891, "step": 604 }, { "epoch": 1.186, "grad_norm": 0.1776285469532013, "learning_rate": 3.439832771507565e-06, "loss": 0.2032, "step": 605 }, { "epoch": 1.188, "grad_norm": 0.23352046310901642, "learning_rate": 3.4247662257565372e-06, "loss": 0.2098, "step": 606 }, { "epoch": 1.19, "grad_norm": 0.19145451486110687, "learning_rate": 3.4097155425921256e-06, "loss": 0.2612, "step": 607 }, { "epoch": 1.192, "grad_norm": 0.19671331346035004, "learning_rate": 3.394680873574546e-06, "loss": 0.2941, "step": 608 }, { "epoch": 1.194, "grad_norm": 0.2002706378698349, "learning_rate": 3.3796623701027477e-06, "loss": 0.1828, "step": 609 }, { "epoch": 1.196, "grad_norm": 0.23058104515075684, "learning_rate": 3.3646601834128924e-06, "loss": 0.2983, "step": 610 }, { "epoch": 1.198, "grad_norm": 0.13006491959095, "learning_rate": 3.349674464576834e-06, "loss": 0.1306, "step": 611 }, { "epoch": 1.2, "grad_norm": 0.29587817192077637, "learning_rate": 3.3347053645005965e-06, "loss": 0.3542, "step": 612 }, { "epoch": 1.202, "grad_norm": 0.23100513219833374, "learning_rate": 3.319753033922849e-06, "loss": 0.4051, "step": 613 }, { "epoch": 1.204, "grad_norm": 0.24775229394435883, "learning_rate": 3.3048176234133967e-06, "loss": 0.2378, "step": 614 }, { "epoch": 1.206, "grad_norm": 0.18648101389408112, "learning_rate": 3.289899283371657e-06, "loss": 0.2141, "step": 615 }, { "epoch": 1.208, "grad_norm": 0.24682392179965973, "learning_rate": 3.274998164025148e-06, "loss": 0.3123, "step": 616 }, { "epoch": 1.21, "grad_norm": 0.25237175822257996, "learning_rate": 3.260114415427975e-06, "loss": 0.4471, "step": 617 }, { "epoch": 1.212, "grad_norm": 0.20262058079242706, "learning_rate": 3.2452481874593234e-06, "loss": 0.2694, "step": 618 }, { "epoch": 1.214, "grad_norm": 0.23342056572437286, "learning_rate": 3.230399629821942e-06, "loss": 0.3093, "step": 619 }, { "epoch": 1.216, "grad_norm": 0.17575059831142426, "learning_rate": 3.2155688920406415e-06, "loss": 0.2923, "step": 620 }, { "epoch": 1.218, "grad_norm": 0.2357223480939865, "learning_rate": 3.200756123460788e-06, "loss": 0.3569, "step": 621 }, { "epoch": 1.22, "grad_norm": 0.3179761469364166, "learning_rate": 3.1859614732467957e-06, "loss": 0.4442, "step": 622 }, { "epoch": 1.222, "grad_norm": 0.28770139813423157, "learning_rate": 3.171185090380628e-06, "loss": 0.3325, "step": 623 }, { "epoch": 1.224, "grad_norm": 0.18547223508358002, "learning_rate": 3.156427123660297e-06, "loss": 0.2269, "step": 624 }, { "epoch": 1.226, "grad_norm": 0.21385949850082397, "learning_rate": 3.141687721698363e-06, "loss": 0.2615, "step": 625 }, { "epoch": 1.226, "eval_loss": 0.2700715959072113, "eval_runtime": 76.6157, "eval_samples_per_second": 7.205, "eval_steps_per_second": 0.901, "step": 625 }, { "epoch": 1.228, "grad_norm": 0.3386872708797455, "learning_rate": 3.12696703292044e-06, "loss": 0.3519, "step": 626 }, { "epoch": 1.23, "grad_norm": 0.19794243574142456, "learning_rate": 3.1122652055637014e-06, "loss": 0.2581, "step": 627 }, { "epoch": 1.232, "grad_norm": 0.1912515014410019, "learning_rate": 3.097582387675385e-06, "loss": 0.3286, "step": 628 }, { "epoch": 1.234, "grad_norm": 0.18073877692222595, "learning_rate": 3.0829187271113035e-06, "loss": 0.2411, "step": 629 }, { "epoch": 1.236, "grad_norm": 0.24173890054225922, "learning_rate": 
3.0682743715343565e-06, "loss": 0.3853, "step": 630 }, { "epoch": 1.238, "grad_norm": 0.17611730098724365, "learning_rate": 3.053649468413043e-06, "loss": 0.1971, "step": 631 }, { "epoch": 1.24, "grad_norm": 0.22723500430583954, "learning_rate": 3.0390441650199727e-06, "loss": 0.2852, "step": 632 }, { "epoch": 1.242, "grad_norm": 0.2124418169260025, "learning_rate": 3.0244586084303908e-06, "loss": 0.329, "step": 633 }, { "epoch": 1.244, "grad_norm": 0.24569527804851532, "learning_rate": 3.0098929455206905e-06, "loss": 0.4141, "step": 634 }, { "epoch": 1.246, "grad_norm": 0.2651529312133789, "learning_rate": 2.995347322966933e-06, "loss": 0.2759, "step": 635 }, { "epoch": 1.248, "grad_norm": 0.3110187351703644, "learning_rate": 2.980821887243377e-06, "loss": 0.3405, "step": 636 }, { "epoch": 1.25, "grad_norm": 0.23818974196910858, "learning_rate": 2.966316784621e-06, "loss": 0.2185, "step": 637 }, { "epoch": 1.252, "grad_norm": 0.32177677750587463, "learning_rate": 2.951832161166024e-06, "loss": 0.4972, "step": 638 }, { "epoch": 1.254, "grad_norm": 0.21647526323795319, "learning_rate": 2.937368162738445e-06, "loss": 0.4215, "step": 639 }, { "epoch": 1.256, "grad_norm": 0.1766624003648758, "learning_rate": 2.9229249349905686e-06, "loss": 0.2439, "step": 640 }, { "epoch": 1.258, "grad_norm": 0.34441429376602173, "learning_rate": 2.9085026233655367e-06, "loss": 0.4078, "step": 641 }, { "epoch": 1.26, "grad_norm": 0.30576056241989136, "learning_rate": 2.8941013730958674e-06, "loss": 0.4071, "step": 642 }, { "epoch": 1.262, "grad_norm": 0.22246578335762024, "learning_rate": 2.8797213292019927e-06, "loss": 0.3456, "step": 643 }, { "epoch": 1.264, "grad_norm": 0.21253855526447296, "learning_rate": 2.8653626364907918e-06, "loss": 0.2257, "step": 644 }, { "epoch": 1.266, "grad_norm": 0.22427724301815033, "learning_rate": 2.851025439554142e-06, "loss": 0.298, "step": 645 }, { "epoch": 1.268, "grad_norm": 0.19472835958003998, "learning_rate": 2.8367098827674575e-06, "loss": 0.3093, "step": 646 }, { "epoch": 1.27, "grad_norm": 0.19399920105934143, "learning_rate": 2.82241611028824e-06, "loss": 0.2254, "step": 647 }, { "epoch": 1.272, "grad_norm": 0.23820382356643677, "learning_rate": 2.8081442660546126e-06, "loss": 0.2909, "step": 648 }, { "epoch": 1.274, "grad_norm": 0.1856381893157959, "learning_rate": 2.7938944937838924e-06, "loss": 0.2367, "step": 649 }, { "epoch": 1.276, "grad_norm": 0.16763170063495636, "learning_rate": 2.7796669369711294e-06, "loss": 0.1991, "step": 650 }, { "epoch": 1.278, "grad_norm": 0.25936460494995117, "learning_rate": 2.7654617388876612e-06, "loss": 0.3244, "step": 651 }, { "epoch": 1.28, "grad_norm": 0.37680599093437195, "learning_rate": 2.751279042579672e-06, "loss": 0.409, "step": 652 }, { "epoch": 1.282, "grad_norm": 0.2094666063785553, "learning_rate": 2.7371189908667604e-06, "loss": 0.3523, "step": 653 }, { "epoch": 1.284, "grad_norm": 0.25615018606185913, "learning_rate": 2.722981726340487e-06, "loss": 0.3496, "step": 654 }, { "epoch": 1.286, "grad_norm": 0.2155938446521759, "learning_rate": 2.708867391362948e-06, "loss": 0.2099, "step": 655 }, { "epoch": 1.288, "grad_norm": 0.2571382522583008, "learning_rate": 2.694776128065345e-06, "loss": 0.2505, "step": 656 }, { "epoch": 1.29, "grad_norm": 0.25513583421707153, "learning_rate": 2.6807080783465376e-06, "loss": 0.3528, "step": 657 }, { "epoch": 1.292, "grad_norm": 0.21190734207630157, "learning_rate": 2.6666633838716317e-06, "loss": 0.3892, "step": 658 }, { "epoch": 1.294, "grad_norm": 0.2990153133869171, 
"learning_rate": 2.6526421860705474e-06, "loss": 0.3916, "step": 659 }, { "epoch": 1.296, "grad_norm": 0.22129324078559875, "learning_rate": 2.6386446261365874e-06, "loss": 0.2596, "step": 660 }, { "epoch": 1.298, "grad_norm": 0.2187465876340866, "learning_rate": 2.6246708450250256e-06, "loss": 0.3962, "step": 661 }, { "epoch": 1.3, "grad_norm": 0.17136049270629883, "learning_rate": 2.6107209834516857e-06, "loss": 0.3483, "step": 662 }, { "epoch": 1.302, "grad_norm": 0.25110378861427307, "learning_rate": 2.5967951818915137e-06, "loss": 0.4098, "step": 663 }, { "epoch": 1.304, "grad_norm": 0.3335612118244171, "learning_rate": 2.5828935805771804e-06, "loss": 0.3407, "step": 664 }, { "epoch": 1.306, "grad_norm": 0.23392237722873688, "learning_rate": 2.5690163194976576e-06, "loss": 0.3893, "step": 665 }, { "epoch": 1.308, "grad_norm": 0.21025826036930084, "learning_rate": 2.5551635383968063e-06, "loss": 0.3047, "step": 666 }, { "epoch": 1.31, "grad_norm": 0.20678383111953735, "learning_rate": 2.5413353767719805e-06, "loss": 0.3068, "step": 667 }, { "epoch": 1.312, "grad_norm": 0.255937397480011, "learning_rate": 2.527531973872617e-06, "loss": 0.2963, "step": 668 }, { "epoch": 1.314, "grad_norm": 0.3448125422000885, "learning_rate": 2.5137534686988265e-06, "loss": 0.3944, "step": 669 }, { "epoch": 1.316, "grad_norm": 0.21276655793190002, "learning_rate": 2.5000000000000015e-06, "loss": 0.2955, "step": 670 }, { "epoch": 1.318, "grad_norm": 0.2522459030151367, "learning_rate": 2.486271706273421e-06, "loss": 0.3536, "step": 671 }, { "epoch": 1.32, "grad_norm": 0.2182285189628601, "learning_rate": 2.4725687257628533e-06, "loss": 0.3541, "step": 672 }, { "epoch": 1.322, "grad_norm": 0.30204272270202637, "learning_rate": 2.4588911964571557e-06, "loss": 0.268, "step": 673 }, { "epoch": 1.324, "grad_norm": 0.27727144956588745, "learning_rate": 2.445239256088898e-06, "loss": 0.3061, "step": 674 }, { "epoch": 1.326, "grad_norm": 0.22263972461223602, "learning_rate": 2.4316130421329696e-06, "loss": 0.3317, "step": 675 }, { "epoch": 1.328, "grad_norm": 0.23461495339870453, "learning_rate": 2.418012691805191e-06, "loss": 0.3153, "step": 676 }, { "epoch": 1.33, "grad_norm": 0.1453184336423874, "learning_rate": 2.404438342060941e-06, "loss": 0.1933, "step": 677 }, { "epoch": 1.332, "grad_norm": 0.20232437551021576, "learning_rate": 2.3908901295937713e-06, "loss": 0.1941, "step": 678 }, { "epoch": 1.334, "grad_norm": 0.23894034326076508, "learning_rate": 2.3773681908340284e-06, "loss": 0.3198, "step": 679 }, { "epoch": 1.336, "grad_norm": 0.3079819977283478, "learning_rate": 2.363872661947488e-06, "loss": 0.3761, "step": 680 }, { "epoch": 1.338, "grad_norm": 0.20794443786144257, "learning_rate": 2.3504036788339763e-06, "loss": 0.3837, "step": 681 }, { "epoch": 1.34, "grad_norm": 0.2881450057029724, "learning_rate": 2.3369613771260006e-06, "loss": 0.2904, "step": 682 }, { "epoch": 1.342, "grad_norm": 0.20050355792045593, "learning_rate": 2.323545892187393e-06, "loss": 0.2323, "step": 683 }, { "epoch": 1.3439999999999999, "grad_norm": 0.22167599201202393, "learning_rate": 2.310157359111938e-06, "loss": 0.2501, "step": 684 }, { "epoch": 1.346, "grad_norm": 0.29652273654937744, "learning_rate": 2.296795912722014e-06, "loss": 0.3702, "step": 685 }, { "epoch": 1.3479999999999999, "grad_norm": 0.20178988575935364, "learning_rate": 2.2834616875672362e-06, "loss": 0.2581, "step": 686 }, { "epoch": 1.35, "grad_norm": 0.25368136167526245, "learning_rate": 2.2701548179231048e-06, "loss": 0.3034, "step": 687 }, { "epoch": 
1.3519999999999999, "grad_norm": 0.20186640322208405, "learning_rate": 2.2568754377896516e-06, "loss": 0.2991, "step": 688 }, { "epoch": 1.354, "grad_norm": 0.2289544939994812, "learning_rate": 2.2436236808900846e-06, "loss": 0.3188, "step": 689 }, { "epoch": 1.3559999999999999, "grad_norm": 0.2351309210062027, "learning_rate": 2.230399680669449e-06, "loss": 0.2942, "step": 690 }, { "epoch": 1.358, "grad_norm": 0.19411875307559967, "learning_rate": 2.2172035702932828e-06, "loss": 0.3415, "step": 691 }, { "epoch": 1.3599999999999999, "grad_norm": 0.23344936966896057, "learning_rate": 2.204035482646267e-06, "loss": 0.2904, "step": 692 }, { "epoch": 1.362, "grad_norm": 0.17623913288116455, "learning_rate": 2.190895550330899e-06, "loss": 0.1493, "step": 693 }, { "epoch": 1.3639999999999999, "grad_norm": 0.22438128292560577, "learning_rate": 2.1777839056661555e-06, "loss": 0.3669, "step": 694 }, { "epoch": 1.366, "grad_norm": 0.25720444321632385, "learning_rate": 2.1647006806861472e-06, "loss": 0.4394, "step": 695 }, { "epoch": 1.3679999999999999, "grad_norm": 0.17176856100559235, "learning_rate": 2.1516460071388062e-06, "loss": 0.2309, "step": 696 }, { "epoch": 1.37, "grad_norm": 0.26110807061195374, "learning_rate": 2.1386200164845527e-06, "loss": 0.4329, "step": 697 }, { "epoch": 1.3719999999999999, "grad_norm": 0.24240969121456146, "learning_rate": 2.125622839894964e-06, "loss": 0.2596, "step": 698 }, { "epoch": 1.374, "grad_norm": 0.202704519033432, "learning_rate": 2.1126546082514665e-06, "loss": 0.2737, "step": 699 }, { "epoch": 1.376, "grad_norm": 0.20342108607292175, "learning_rate": 2.09971545214401e-06, "loss": 0.2692, "step": 700 }, { "epoch": 1.3780000000000001, "grad_norm": 0.3197811543941498, "learning_rate": 2.086805501869749e-06, "loss": 0.3117, "step": 701 }, { "epoch": 1.38, "grad_norm": 0.29925206303596497, "learning_rate": 2.073924887431744e-06, "loss": 0.2391, "step": 702 }, { "epoch": 1.3820000000000001, "grad_norm": 0.2412380427122116, "learning_rate": 2.061073738537635e-06, "loss": 0.2434, "step": 703 }, { "epoch": 1.384, "grad_norm": 0.25253570079803467, "learning_rate": 2.0482521845983522e-06, "loss": 0.3284, "step": 704 }, { "epoch": 1.3860000000000001, "grad_norm": 0.18548652529716492, "learning_rate": 2.0354603547267985e-06, "loss": 0.2562, "step": 705 }, { "epoch": 1.388, "grad_norm": 0.2307010442018509, "learning_rate": 2.0226983777365604e-06, "loss": 0.2445, "step": 706 }, { "epoch": 1.3900000000000001, "grad_norm": 0.1840142160654068, "learning_rate": 2.009966382140606e-06, "loss": 0.3521, "step": 707 }, { "epoch": 1.392, "grad_norm": 0.2078990340232849, "learning_rate": 1.9972644961499853e-06, "loss": 0.2887, "step": 708 }, { "epoch": 1.3940000000000001, "grad_norm": 0.20442235469818115, "learning_rate": 1.9845928476725522e-06, "loss": 0.3453, "step": 709 }, { "epoch": 1.396, "grad_norm": 0.1933489441871643, "learning_rate": 1.971951564311668e-06, "loss": 0.3581, "step": 710 }, { "epoch": 1.3980000000000001, "grad_norm": 0.19691258668899536, "learning_rate": 1.959340773364911e-06, "loss": 0.2933, "step": 711 }, { "epoch": 1.4, "grad_norm": 0.1842382252216339, "learning_rate": 1.946760601822809e-06, "loss": 0.2894, "step": 712 }, { "epoch": 1.4020000000000001, "grad_norm": 0.35139110684394836, "learning_rate": 1.9342111763675512e-06, "loss": 0.3405, "step": 713 }, { "epoch": 1.404, "grad_norm": 0.19070106744766235, "learning_rate": 1.9216926233717087e-06, "loss": 0.213, "step": 714 }, { "epoch": 1.4060000000000001, "grad_norm": 0.20061296224594116, 
"learning_rate": 1.9092050688969736e-06, "loss": 0.2858, "step": 715 }, { "epoch": 1.408, "grad_norm": 0.30167287588119507, "learning_rate": 1.8967486386928819e-06, "loss": 0.4004, "step": 716 }, { "epoch": 1.41, "grad_norm": 0.21128444373607635, "learning_rate": 1.8843234581955444e-06, "loss": 0.2326, "step": 717 }, { "epoch": 1.412, "grad_norm": 0.23791776597499847, "learning_rate": 1.8719296525263925e-06, "loss": 0.2337, "step": 718 }, { "epoch": 1.414, "grad_norm": 0.27308812737464905, "learning_rate": 1.859567346490913e-06, "loss": 0.2667, "step": 719 }, { "epoch": 1.416, "grad_norm": 0.19012384116649628, "learning_rate": 1.8472366645773892e-06, "loss": 0.2042, "step": 720 }, { "epoch": 1.418, "grad_norm": 0.2819920480251312, "learning_rate": 1.8349377309556487e-06, "loss": 0.3546, "step": 721 }, { "epoch": 1.42, "grad_norm": 0.16963627934455872, "learning_rate": 1.8226706694758194e-06, "loss": 0.2087, "step": 722 }, { "epoch": 1.422, "grad_norm": 0.222882941365242, "learning_rate": 1.810435603667075e-06, "loss": 0.3519, "step": 723 }, { "epoch": 1.424, "grad_norm": 0.200264573097229, "learning_rate": 1.798232656736389e-06, "loss": 0.2172, "step": 724 }, { "epoch": 1.426, "grad_norm": 0.25277942419052124, "learning_rate": 1.7860619515673034e-06, "loss": 0.3984, "step": 725 }, { "epoch": 1.428, "grad_norm": 0.24608227610588074, "learning_rate": 1.7739236107186858e-06, "loss": 0.2575, "step": 726 }, { "epoch": 1.43, "grad_norm": 0.30379989743232727, "learning_rate": 1.7618177564234907e-06, "loss": 0.2949, "step": 727 }, { "epoch": 1.432, "grad_norm": 0.15659303963184357, "learning_rate": 1.7497445105875377e-06, "loss": 0.1913, "step": 728 }, { "epoch": 1.434, "grad_norm": 0.2043537199497223, "learning_rate": 1.7377039947882802e-06, "loss": 0.2716, "step": 729 }, { "epoch": 1.436, "grad_norm": 0.20367324352264404, "learning_rate": 1.7256963302735752e-06, "loss": 0.2358, "step": 730 }, { "epoch": 1.438, "grad_norm": 0.28134340047836304, "learning_rate": 1.7137216379604727e-06, "loss": 0.2814, "step": 731 }, { "epoch": 1.44, "grad_norm": 0.2837545871734619, "learning_rate": 1.7017800384339928e-06, "loss": 0.3792, "step": 732 }, { "epoch": 1.442, "grad_norm": 0.22841040790081024, "learning_rate": 1.6898716519459074e-06, "loss": 0.2819, "step": 733 }, { "epoch": 1.444, "grad_norm": 0.21164868772029877, "learning_rate": 1.6779965984135376e-06, "loss": 0.2676, "step": 734 }, { "epoch": 1.446, "grad_norm": 0.2656158208847046, "learning_rate": 1.6661549974185426e-06, "loss": 0.284, "step": 735 }, { "epoch": 1.448, "grad_norm": 0.2675846815109253, "learning_rate": 1.6543469682057105e-06, "loss": 0.3098, "step": 736 }, { "epoch": 1.45, "grad_norm": 0.2900715172290802, "learning_rate": 1.6425726296817634e-06, "loss": 0.3378, "step": 737 }, { "epoch": 1.452, "grad_norm": 0.27534744143486023, "learning_rate": 1.6308321004141609e-06, "loss": 0.3497, "step": 738 }, { "epoch": 1.454, "grad_norm": 0.30499523878097534, "learning_rate": 1.6191254986299044e-06, "loss": 0.3271, "step": 739 }, { "epoch": 1.456, "grad_norm": 0.1775362193584442, "learning_rate": 1.6074529422143398e-06, "loss": 0.1754, "step": 740 }, { "epoch": 1.458, "grad_norm": 0.25734683871269226, "learning_rate": 1.5958145487099829e-06, "loss": 0.3568, "step": 741 }, { "epoch": 1.46, "grad_norm": 0.22716552019119263, "learning_rate": 1.5842104353153286e-06, "loss": 0.2856, "step": 742 }, { "epoch": 1.462, "grad_norm": 0.2042451947927475, "learning_rate": 1.5726407188836672e-06, "loss": 0.2623, "step": 743 }, { "epoch": 1.464, "grad_norm": 
0.26923978328704834, "learning_rate": 1.561105515921915e-06, "loss": 0.4326, "step": 744 }, { "epoch": 1.466, "grad_norm": 0.22442659735679626, "learning_rate": 1.549604942589441e-06, "loss": 0.2867, "step": 745 }, { "epoch": 1.468, "grad_norm": 0.16880613565444946, "learning_rate": 1.5381391146968866e-06, "loss": 0.1821, "step": 746 }, { "epoch": 1.47, "grad_norm": 0.24349483847618103, "learning_rate": 1.5267081477050132e-06, "loss": 0.2753, "step": 747 }, { "epoch": 1.472, "grad_norm": 0.27072674036026, "learning_rate": 1.5153121567235334e-06, "loss": 0.2222, "step": 748 }, { "epoch": 1.474, "grad_norm": 0.291255921125412, "learning_rate": 1.5039512565099468e-06, "loss": 0.3485, "step": 749 }, { "epoch": 1.476, "grad_norm": 0.20078301429748535, "learning_rate": 1.4926255614683931e-06, "loss": 0.2959, "step": 750 }, { "epoch": 1.476, "eval_loss": 0.2654268741607666, "eval_runtime": 76.2376, "eval_samples_per_second": 7.241, "eval_steps_per_second": 0.905, "step": 750 }, { "epoch": 1.478, "grad_norm": 0.2795911431312561, "learning_rate": 1.4813351856484981e-06, "loss": 0.1859, "step": 751 }, { "epoch": 1.48, "grad_norm": 0.35663336515426636, "learning_rate": 1.470080242744218e-06, "loss": 0.3358, "step": 752 }, { "epoch": 1.482, "grad_norm": 0.23237483203411102, "learning_rate": 1.458860846092705e-06, "loss": 0.2874, "step": 753 }, { "epoch": 1.484, "grad_norm": 0.19958510994911194, "learning_rate": 1.4476771086731567e-06, "loss": 0.3507, "step": 754 }, { "epoch": 1.486, "grad_norm": 0.22077733278274536, "learning_rate": 1.4365291431056871e-06, "loss": 0.3085, "step": 755 }, { "epoch": 1.488, "grad_norm": 0.31041693687438965, "learning_rate": 1.4254170616501828e-06, "loss": 0.3724, "step": 756 }, { "epoch": 1.49, "grad_norm": 0.18345925211906433, "learning_rate": 1.4143409762051829e-06, "loss": 0.1957, "step": 757 }, { "epoch": 1.492, "grad_norm": 0.1973162293434143, "learning_rate": 1.4033009983067454e-06, "loss": 0.2304, "step": 758 }, { "epoch": 1.494, "grad_norm": 0.2636561095714569, "learning_rate": 1.3922972391273226e-06, "loss": 0.3215, "step": 759 }, { "epoch": 1.496, "grad_norm": 0.22231453657150269, "learning_rate": 1.3813298094746491e-06, "loss": 0.2346, "step": 760 }, { "epoch": 1.498, "grad_norm": 0.21096548438072205, "learning_rate": 1.3703988197906209e-06, "loss": 0.297, "step": 761 }, { "epoch": 1.5, "grad_norm": 0.29171353578567505, "learning_rate": 1.3595043801501794e-06, "loss": 0.362, "step": 762 }, { "epoch": 1.502, "grad_norm": 0.2302405834197998, "learning_rate": 1.3486466002602133e-06, "loss": 0.3468, "step": 763 }, { "epoch": 1.504, "grad_norm": 0.1669236272573471, "learning_rate": 1.3378255894584463e-06, "loss": 0.2525, "step": 764 }, { "epoch": 1.506, "grad_norm": 0.22917306423187256, "learning_rate": 1.3270414567123342e-06, "loss": 0.34, "step": 765 }, { "epoch": 1.508, "grad_norm": 0.22837324440479279, "learning_rate": 1.3162943106179748e-06, "loss": 0.516, "step": 766 }, { "epoch": 1.51, "grad_norm": 0.1973070204257965, "learning_rate": 1.305584259399013e-06, "loss": 0.2083, "step": 767 }, { "epoch": 1.512, "grad_norm": 0.25936761498451233, "learning_rate": 1.2949114109055417e-06, "loss": 0.4483, "step": 768 }, { "epoch": 1.514, "grad_norm": 0.23405812680721283, "learning_rate": 1.2842758726130283e-06, "loss": 0.3334, "step": 769 }, { "epoch": 1.516, "grad_norm": 0.2227783501148224, "learning_rate": 1.2736777516212267e-06, "loss": 0.3724, "step": 770 }, { "epoch": 1.518, "grad_norm": 0.23398268222808838, "learning_rate": 1.263117154653097e-06, "loss": 0.2008, 
"step": 771 }, { "epoch": 1.52, "grad_norm": 0.16665144264698029, "learning_rate": 1.2525941880537307e-06, "loss": 0.2177, "step": 772 }, { "epoch": 1.522, "grad_norm": 0.21703177690505981, "learning_rate": 1.242108957789287e-06, "loss": 0.2668, "step": 773 }, { "epoch": 1.524, "grad_norm": 0.3440599739551544, "learning_rate": 1.2316615694459188e-06, "loss": 0.3352, "step": 774 }, { "epoch": 1.526, "grad_norm": 0.2005206048488617, "learning_rate": 1.2212521282287093e-06, "loss": 0.2719, "step": 775 }, { "epoch": 1.528, "grad_norm": 0.2054724395275116, "learning_rate": 1.210880738960616e-06, "loss": 0.3181, "step": 776 }, { "epoch": 1.53, "grad_norm": 0.2903349995613098, "learning_rate": 1.200547506081416e-06, "loss": 0.3382, "step": 777 }, { "epoch": 1.532, "grad_norm": 0.22862407565116882, "learning_rate": 1.1902525336466465e-06, "loss": 0.2544, "step": 778 }, { "epoch": 1.534, "grad_norm": 0.20812873542308807, "learning_rate": 1.1799959253265668e-06, "loss": 0.3118, "step": 779 }, { "epoch": 1.536, "grad_norm": 0.2820591330528259, "learning_rate": 1.1697777844051105e-06, "loss": 0.3646, "step": 780 }, { "epoch": 1.538, "grad_norm": 0.21943072974681854, "learning_rate": 1.1595982137788403e-06, "loss": 0.1957, "step": 781 }, { "epoch": 1.54, "grad_norm": 0.1949055939912796, "learning_rate": 1.1494573159559214e-06, "loss": 0.253, "step": 782 }, { "epoch": 1.542, "grad_norm": 0.20829080045223236, "learning_rate": 1.1393551930550828e-06, "loss": 0.2558, "step": 783 }, { "epoch": 1.544, "grad_norm": 0.20741114020347595, "learning_rate": 1.1292919468045876e-06, "loss": 0.2221, "step": 784 }, { "epoch": 1.546, "grad_norm": 0.24327073991298676, "learning_rate": 1.1192676785412154e-06, "loss": 0.2616, "step": 785 }, { "epoch": 1.548, "grad_norm": 0.2541949152946472, "learning_rate": 1.1092824892092375e-06, "loss": 0.2435, "step": 786 }, { "epoch": 1.55, "grad_norm": 0.2096426635980606, "learning_rate": 1.099336479359398e-06, "loss": 0.2448, "step": 787 }, { "epoch": 1.552, "grad_norm": 0.24535740911960602, "learning_rate": 1.0894297491479044e-06, "loss": 0.2892, "step": 788 }, { "epoch": 1.554, "grad_norm": 0.2067105919122696, "learning_rate": 1.0795623983354214e-06, "loss": 0.2584, "step": 789 }, { "epoch": 1.556, "grad_norm": 0.2478252500295639, "learning_rate": 1.0697345262860638e-06, "loss": 0.3474, "step": 790 }, { "epoch": 1.558, "grad_norm": 0.17269453406333923, "learning_rate": 1.0599462319663906e-06, "loss": 0.2793, "step": 791 }, { "epoch": 1.56, "grad_norm": 0.2102997750043869, "learning_rate": 1.0501976139444191e-06, "loss": 0.3124, "step": 792 }, { "epoch": 1.562, "grad_norm": 0.29494714736938477, "learning_rate": 1.0404887703886252e-06, "loss": 0.2693, "step": 793 }, { "epoch": 1.564, "grad_norm": 0.19094854593276978, "learning_rate": 1.0308197990669538e-06, "loss": 0.3593, "step": 794 }, { "epoch": 1.5659999999999998, "grad_norm": 0.20082080364227295, "learning_rate": 1.0211907973458391e-06, "loss": 0.2296, "step": 795 }, { "epoch": 1.568, "grad_norm": 0.24483440816402435, "learning_rate": 1.0116018621892237e-06, "loss": 0.344, "step": 796 }, { "epoch": 1.5699999999999998, "grad_norm": 0.21700353920459747, "learning_rate": 1.0020530901575754e-06, "loss": 0.2562, "step": 797 }, { "epoch": 1.572, "grad_norm": 0.18885864317417145, "learning_rate": 9.925445774069232e-07, "loss": 0.2155, "step": 798 }, { "epoch": 1.5739999999999998, "grad_norm": 0.2546456754207611, "learning_rate": 9.830764196878872e-07, "loss": 0.3539, "step": 799 }, { "epoch": 1.576, "grad_norm": 0.20347674190998077, 
"learning_rate": 9.73648712344707e-07, "loss": 0.2864, "step": 800 }, { "epoch": 1.5779999999999998, "grad_norm": 0.3315930962562561, "learning_rate": 9.642615503142927e-07, "loss": 0.3753, "step": 801 }, { "epoch": 1.58, "grad_norm": 0.18244577944278717, "learning_rate": 9.549150281252633e-07, "loss": 0.2116, "step": 802 }, { "epoch": 1.5819999999999999, "grad_norm": 0.24047374725341797, "learning_rate": 9.456092398969902e-07, "loss": 0.3352, "step": 803 }, { "epoch": 1.584, "grad_norm": 0.2712211012840271, "learning_rate": 9.363442793386606e-07, "loss": 0.4647, "step": 804 }, { "epoch": 1.5859999999999999, "grad_norm": 0.15284787118434906, "learning_rate": 9.271202397483214e-07, "loss": 0.2296, "step": 805 }, { "epoch": 1.588, "grad_norm": 0.2665194571018219, "learning_rate": 9.179372140119524e-07, "loss": 0.353, "step": 806 }, { "epoch": 1.5899999999999999, "grad_norm": 0.2965538799762726, "learning_rate": 9.087952946025175e-07, "loss": 0.2863, "step": 807 }, { "epoch": 1.592, "grad_norm": 0.19379866123199463, "learning_rate": 8.996945735790447e-07, "loss": 0.3056, "step": 808 }, { "epoch": 1.5939999999999999, "grad_norm": 0.2339809238910675, "learning_rate": 8.906351425856952e-07, "loss": 0.3741, "step": 809 }, { "epoch": 1.596, "grad_norm": 0.2753208577632904, "learning_rate": 8.816170928508367e-07, "loss": 0.2715, "step": 810 }, { "epoch": 1.5979999999999999, "grad_norm": 0.2367635816335678, "learning_rate": 8.7264051518613e-07, "loss": 0.3268, "step": 811 }, { "epoch": 1.6, "grad_norm": 0.2004977911710739, "learning_rate": 8.637054999856148e-07, "loss": 0.2217, "step": 812 }, { "epoch": 1.6019999999999999, "grad_norm": 0.3549105226993561, "learning_rate": 8.54812137224792e-07, "loss": 0.3371, "step": 813 }, { "epoch": 1.604, "grad_norm": 0.27921661734580994, "learning_rate": 8.459605164597268e-07, "loss": 0.3983, "step": 814 }, { "epoch": 1.6059999999999999, "grad_norm": 0.2014499306678772, "learning_rate": 8.371507268261436e-07, "loss": 0.2413, "step": 815 }, { "epoch": 1.608, "grad_norm": 0.20690080523490906, "learning_rate": 8.283828570385239e-07, "loss": 0.2012, "step": 816 }, { "epoch": 1.6099999999999999, "grad_norm": 0.21998871862888336, "learning_rate": 8.196569953892202e-07, "loss": 0.3298, "step": 817 }, { "epoch": 1.612, "grad_norm": 0.3980468511581421, "learning_rate": 8.109732297475637e-07, "loss": 0.3194, "step": 818 }, { "epoch": 1.6139999999999999, "grad_norm": 0.20355728268623352, "learning_rate": 8.023316475589754e-07, "loss": 0.1823, "step": 819 }, { "epoch": 1.616, "grad_norm": 0.17916588485240936, "learning_rate": 7.937323358440935e-07, "loss": 0.2189, "step": 820 }, { "epoch": 1.6179999999999999, "grad_norm": 0.3024926781654358, "learning_rate": 7.851753811978924e-07, "loss": 0.3149, "step": 821 }, { "epoch": 1.62, "grad_norm": 0.20770519971847534, "learning_rate": 7.766608697888095e-07, "loss": 0.2967, "step": 822 }, { "epoch": 1.6219999999999999, "grad_norm": 0.2985385060310364, "learning_rate": 7.681888873578786e-07, "loss": 0.3245, "step": 823 }, { "epoch": 1.624, "grad_norm": 0.238825723528862, "learning_rate": 7.597595192178702e-07, "loss": 0.2024, "step": 824 }, { "epoch": 1.626, "grad_norm": 0.24210689961910248, "learning_rate": 7.513728502524286e-07, "loss": 0.3364, "step": 825 }, { "epoch": 1.6280000000000001, "grad_norm": 0.2465432733297348, "learning_rate": 7.430289649152156e-07, "loss": 0.3643, "step": 826 }, { "epoch": 1.63, "grad_norm": 0.37851664423942566, "learning_rate": 7.347279472290647e-07, "loss": 0.4549, "step": 827 }, { "epoch": 
1.6320000000000001, "grad_norm": 0.29046836495399475, "learning_rate": 7.264698807851328e-07, "loss": 0.3777, "step": 828 }, { "epoch": 1.634, "grad_norm": 0.17954066395759583, "learning_rate": 7.182548487420555e-07, "loss": 0.1817, "step": 829 }, { "epoch": 1.6360000000000001, "grad_norm": 0.21587719023227692, "learning_rate": 7.100829338251147e-07, "loss": 0.3208, "step": 830 }, { "epoch": 1.638, "grad_norm": 0.24211935698986053, "learning_rate": 7.019542183254047e-07, "loss": 0.302, "step": 831 }, { "epoch": 1.6400000000000001, "grad_norm": 0.3430536389350891, "learning_rate": 6.938687840989972e-07, "loss": 0.3358, "step": 832 }, { "epoch": 1.642, "grad_norm": 0.26358646154403687, "learning_rate": 6.858267125661272e-07, "loss": 0.3329, "step": 833 }, { "epoch": 1.6440000000000001, "grad_norm": 0.21013550460338593, "learning_rate": 6.778280847103668e-07, "loss": 0.247, "step": 834 }, { "epoch": 1.646, "grad_norm": 0.17694292962551117, "learning_rate": 6.698729810778065e-07, "loss": 0.2205, "step": 835 }, { "epoch": 1.6480000000000001, "grad_norm": 0.15793128311634064, "learning_rate": 6.619614817762537e-07, "loss": 0.1541, "step": 836 }, { "epoch": 1.65, "grad_norm": 0.18143923580646515, "learning_rate": 6.540936664744197e-07, "loss": 0.2367, "step": 837 }, { "epoch": 1.6520000000000001, "grad_norm": 0.21212640404701233, "learning_rate": 6.462696144011149e-07, "loss": 0.3049, "step": 838 }, { "epoch": 1.654, "grad_norm": 0.21567395329475403, "learning_rate": 6.384894043444568e-07, "loss": 0.2519, "step": 839 }, { "epoch": 1.6560000000000001, "grad_norm": 0.17464697360992432, "learning_rate": 6.307531146510754e-07, "loss": 0.1692, "step": 840 }, { "epoch": 1.658, "grad_norm": 0.23152326047420502, "learning_rate": 6.230608232253227e-07, "loss": 0.2823, "step": 841 }, { "epoch": 1.6600000000000001, "grad_norm": 0.3341864049434662, "learning_rate": 6.154126075284855e-07, "loss": 0.2823, "step": 842 }, { "epoch": 1.662, "grad_norm": 0.24136964976787567, "learning_rate": 6.07808544578013e-07, "loss": 0.3713, "step": 843 }, { "epoch": 1.6640000000000001, "grad_norm": 0.21439406275749207, "learning_rate": 6.002487109467347e-07, "loss": 0.2631, "step": 844 }, { "epoch": 1.666, "grad_norm": 0.3102458715438843, "learning_rate": 5.927331827620902e-07, "loss": 0.3513, "step": 845 }, { "epoch": 1.6680000000000001, "grad_norm": 0.20326466858386993, "learning_rate": 5.852620357053651e-07, "loss": 0.2738, "step": 846 }, { "epoch": 1.67, "grad_norm": 0.185090109705925, "learning_rate": 5.778353450109286e-07, "loss": 0.2665, "step": 847 }, { "epoch": 1.6720000000000002, "grad_norm": 0.17061105370521545, "learning_rate": 5.704531854654721e-07, "loss": 0.2018, "step": 848 }, { "epoch": 1.674, "grad_norm": 0.18026676774024963, "learning_rate": 5.631156314072605e-07, "loss": 0.2182, "step": 849 }, { "epoch": 1.6760000000000002, "grad_norm": 0.24431855976581573, "learning_rate": 5.558227567253832e-07, "loss": 0.3036, "step": 850 }, { "epoch": 1.678, "grad_norm": 0.1817561835050583, "learning_rate": 5.485746348590048e-07, "loss": 0.2786, "step": 851 }, { "epoch": 1.6800000000000002, "grad_norm": 0.20034758746623993, "learning_rate": 5.413713387966329e-07, "loss": 0.2073, "step": 852 }, { "epoch": 1.682, "grad_norm": 0.23046346008777618, "learning_rate": 5.34212941075381e-07, "loss": 0.2456, "step": 853 }, { "epoch": 1.6840000000000002, "grad_norm": 0.28231683373451233, "learning_rate": 5.270995137802315e-07, "loss": 0.2962, "step": 854 }, { "epoch": 1.686, "grad_norm": 0.20535282790660858, "learning_rate": 
5.200311285433213e-07, "loss": 0.2003, "step": 855 }, { "epoch": 1.688, "grad_norm": 0.27334460616111755, "learning_rate": 5.130078565432089e-07, "loss": 0.2784, "step": 856 }, { "epoch": 1.69, "grad_norm": 0.2541443109512329, "learning_rate": 5.06029768504166e-07, "loss": 0.3575, "step": 857 }, { "epoch": 1.692, "grad_norm": 0.20568181574344635, "learning_rate": 4.990969346954611e-07, "loss": 0.3116, "step": 858 }, { "epoch": 1.694, "grad_norm": 0.2725497782230377, "learning_rate": 4.922094249306559e-07, "loss": 0.2698, "step": 859 }, { "epoch": 1.696, "grad_norm": 0.2767050862312317, "learning_rate": 4.853673085668947e-07, "loss": 0.3246, "step": 860 }, { "epoch": 1.698, "grad_norm": 0.27081194519996643, "learning_rate": 4.785706545042141e-07, "loss": 0.3067, "step": 861 }, { "epoch": 1.7, "grad_norm": 0.2148142009973526, "learning_rate": 4.7181953118484556e-07, "loss": 0.335, "step": 862 }, { "epoch": 1.702, "grad_norm": 0.20924992859363556, "learning_rate": 4.651140065925269e-07, "loss": 0.2473, "step": 863 }, { "epoch": 1.704, "grad_norm": 0.1969323456287384, "learning_rate": 4.58454148251814e-07, "loss": 0.2384, "step": 864 }, { "epoch": 1.706, "grad_norm": 0.21272586286067963, "learning_rate": 4.5184002322740784e-07, "loss": 0.1894, "step": 865 }, { "epoch": 1.708, "grad_norm": 0.22230306267738342, "learning_rate": 4.4527169812347446e-07, "loss": 0.2878, "step": 866 }, { "epoch": 1.71, "grad_norm": 0.23957069218158722, "learning_rate": 4.387492390829734e-07, "loss": 0.2608, "step": 867 }, { "epoch": 1.712, "grad_norm": 0.19603803753852844, "learning_rate": 4.322727117869951e-07, "loss": 0.2291, "step": 868 }, { "epoch": 1.714, "grad_norm": 0.19814668595790863, "learning_rate": 4.2584218145409916e-07, "loss": 0.2933, "step": 869 }, { "epoch": 1.716, "grad_norm": 0.2840145230293274, "learning_rate": 4.194577128396521e-07, "loss": 0.2678, "step": 870 }, { "epoch": 1.718, "grad_norm": 0.3841419816017151, "learning_rate": 4.131193702351827e-07, "loss": 0.4492, "step": 871 }, { "epoch": 1.72, "grad_norm": 0.1749158352613449, "learning_rate": 4.0682721746773346e-07, "loss": 0.2205, "step": 872 }, { "epoch": 1.722, "grad_norm": 0.22776730358600616, "learning_rate": 4.005813178992091e-07, "loss": 0.2634, "step": 873 }, { "epoch": 1.724, "grad_norm": 0.20322760939598083, "learning_rate": 3.9438173442575e-07, "loss": 0.3125, "step": 874 }, { "epoch": 1.726, "grad_norm": 0.24371430277824402, "learning_rate": 3.882285294770938e-07, "loss": 0.3223, "step": 875 }, { "epoch": 1.726, "eval_loss": 0.26352861523628235, "eval_runtime": 76.577, "eval_samples_per_second": 7.208, "eval_steps_per_second": 0.901, "step": 875 }, { "epoch": 1.728, "grad_norm": 0.2777194678783417, "learning_rate": 3.821217650159453e-07, "loss": 0.3117, "step": 876 }, { "epoch": 1.73, "grad_norm": 0.21060119569301605, "learning_rate": 3.760615025373543e-07, "loss": 0.2444, "step": 877 }, { "epoch": 1.732, "grad_norm": 0.19364982843399048, "learning_rate": 3.7004780306809873e-07, "loss": 0.2534, "step": 878 }, { "epoch": 1.734, "grad_norm": 0.2388126105070114, "learning_rate": 3.6408072716606346e-07, "loss": 0.5307, "step": 879 }, { "epoch": 1.736, "grad_norm": 0.21501779556274414, "learning_rate": 3.581603349196372e-07, "loss": 0.299, "step": 880 }, { "epoch": 1.738, "grad_norm": 0.2748852074146271, "learning_rate": 3.522866859471047e-07, "loss": 0.4626, "step": 881 }, { "epoch": 1.74, "grad_norm": 0.2657471299171448, "learning_rate": 3.46459839396045e-07, "loss": 0.2947, "step": 882 }, { "epoch": 1.742, "grad_norm": 
0.1825701743364334, "learning_rate": 3.406798539427386e-07, "loss": 0.2525, "step": 883 }, { "epoch": 1.744, "grad_norm": 0.18898171186447144, "learning_rate": 3.3494678779157464e-07, "loss": 0.2188, "step": 884 }, { "epoch": 1.746, "grad_norm": 0.2019154280424118, "learning_rate": 3.2926069867446673e-07, "loss": 0.2575, "step": 885 }, { "epoch": 1.748, "grad_norm": 0.26931118965148926, "learning_rate": 3.2362164385026704e-07, "loss": 0.2867, "step": 886 }, { "epoch": 1.75, "grad_norm": 0.25869134068489075, "learning_rate": 3.180296801041971e-07, "loss": 0.4233, "step": 887 }, { "epoch": 1.752, "grad_norm": 0.24689964950084686, "learning_rate": 3.1248486374726884e-07, "loss": 0.3778, "step": 888 }, { "epoch": 1.754, "grad_norm": 0.2961515486240387, "learning_rate": 3.069872506157212e-07, "loss": 0.3767, "step": 889 }, { "epoch": 1.756, "grad_norm": 0.2758214473724365, "learning_rate": 3.015368960704584e-07, "loss": 0.4107, "step": 890 }, { "epoch": 1.758, "grad_norm": 0.19258597493171692, "learning_rate": 2.9613385499648926e-07, "loss": 0.2285, "step": 891 }, { "epoch": 1.76, "grad_norm": 0.21885156631469727, "learning_rate": 2.9077818180237693e-07, "loss": 0.2726, "step": 892 }, { "epoch": 1.762, "grad_norm": 0.20850767195224762, "learning_rate": 2.8546993041969173e-07, "loss": 0.3443, "step": 893 }, { "epoch": 1.764, "grad_norm": 0.22747254371643066, "learning_rate": 2.802091543024671e-07, "loss": 0.2785, "step": 894 }, { "epoch": 1.766, "grad_norm": 0.18733809888362885, "learning_rate": 2.7499590642665773e-07, "loss": 0.2047, "step": 895 }, { "epoch": 1.768, "grad_norm": 0.230934277176857, "learning_rate": 2.6983023928961406e-07, "loss": 0.2994, "step": 896 }, { "epoch": 1.77, "grad_norm": 0.1833610087633133, "learning_rate": 2.647122049095463e-07, "loss": 0.2064, "step": 897 }, { "epoch": 1.772, "grad_norm": 0.2077609896659851, "learning_rate": 2.596418548250029e-07, "loss": 0.2537, "step": 898 }, { "epoch": 1.774, "grad_norm": 0.163072407245636, "learning_rate": 2.546192400943537e-07, "loss": 0.194, "step": 899 }, { "epoch": 1.776, "grad_norm": 0.1943567395210266, "learning_rate": 2.4964441129527337e-07, "loss": 0.2519, "step": 900 }, { "epoch": 1.778, "grad_norm": 0.18382684886455536, "learning_rate": 2.447174185242324e-07, "loss": 0.1944, "step": 901 }, { "epoch": 1.78, "grad_norm": 0.20981475710868835, "learning_rate": 2.398383113959929e-07, "loss": 0.173, "step": 902 }, { "epoch": 1.782, "grad_norm": 0.1996649205684662, "learning_rate": 2.3500713904311023e-07, "loss": 0.2536, "step": 903 }, { "epoch": 1.784, "grad_norm": 0.2560986578464508, "learning_rate": 2.3022395011543687e-07, "loss": 0.374, "step": 904 }, { "epoch": 1.786, "grad_norm": 0.20811672508716583, "learning_rate": 2.2548879277963065e-07, "loss": 0.3225, "step": 905 }, { "epoch": 1.788, "grad_norm": 0.1996699571609497, "learning_rate": 2.2080171471867362e-07, "loss": 0.2632, "step": 906 }, { "epoch": 1.79, "grad_norm": 0.20678700506687164, "learning_rate": 2.161627631313923e-07, "loss": 0.3513, "step": 907 }, { "epoch": 1.792, "grad_norm": 0.20172181725502014, "learning_rate": 2.1157198473197417e-07, "loss": 0.2117, "step": 908 }, { "epoch": 1.794, "grad_norm": 0.16854679584503174, "learning_rate": 2.0702942574950812e-07, "loss": 0.3006, "step": 909 }, { "epoch": 1.796, "grad_norm": 0.1959567815065384, "learning_rate": 2.0253513192751374e-07, "loss": 0.2695, "step": 910 }, { "epoch": 1.798, "grad_norm": 0.1726803481578827, "learning_rate": 1.9808914852347817e-07, "loss": 0.2635, "step": 911 }, { "epoch": 1.8, 
"grad_norm": 0.22450147569179535, "learning_rate": 1.9369152030840553e-07, "loss": 0.2598, "step": 912 }, { "epoch": 1.802, "grad_norm": 0.26783040165901184, "learning_rate": 1.8934229156636453e-07, "loss": 0.2029, "step": 913 }, { "epoch": 1.804, "grad_norm": 0.2690034508705139, "learning_rate": 1.8504150609403858e-07, "loss": 0.2446, "step": 914 }, { "epoch": 1.806, "grad_norm": 0.23306065797805786, "learning_rate": 1.807892072002898e-07, "loss": 0.3264, "step": 915 }, { "epoch": 1.808, "grad_norm": 0.2681446075439453, "learning_rate": 1.765854377057219e-07, "loss": 0.302, "step": 916 }, { "epoch": 1.81, "grad_norm": 0.19500699639320374, "learning_rate": 1.724302399422456e-07, "loss": 0.2066, "step": 917 }, { "epoch": 1.812, "grad_norm": 0.2524206340312958, "learning_rate": 1.6832365575265742e-07, "loss": 0.3334, "step": 918 }, { "epoch": 1.814, "grad_norm": 0.2076834887266159, "learning_rate": 1.6426572649021477e-07, "loss": 0.2737, "step": 919 }, { "epoch": 1.8159999999999998, "grad_norm": 0.28093916177749634, "learning_rate": 1.6025649301821877e-07, "loss": 0.3558, "step": 920 }, { "epoch": 1.818, "grad_norm": 0.24566200375556946, "learning_rate": 1.562959957096072e-07, "loss": 0.3636, "step": 921 }, { "epoch": 1.8199999999999998, "grad_norm": 0.2996765077114105, "learning_rate": 1.5238427444654368e-07, "loss": 0.3945, "step": 922 }, { "epoch": 1.822, "grad_norm": 0.24855782091617584, "learning_rate": 1.4852136862001766e-07, "loss": 0.1894, "step": 923 }, { "epoch": 1.8239999999999998, "grad_norm": 0.2089153230190277, "learning_rate": 1.4470731712944885e-07, "loss": 0.3297, "step": 924 }, { "epoch": 1.826, "grad_norm": 0.3130733072757721, "learning_rate": 1.4094215838229176e-07, "loss": 0.4001, "step": 925 }, { "epoch": 1.8279999999999998, "grad_norm": 0.2722707688808441, "learning_rate": 1.372259302936546e-07, "loss": 0.356, "step": 926 }, { "epoch": 1.83, "grad_norm": 0.15767575800418854, "learning_rate": 1.3355867028591209e-07, "loss": 0.2161, "step": 927 }, { "epoch": 1.8319999999999999, "grad_norm": 0.18771317601203918, "learning_rate": 1.2994041528833267e-07, "loss": 0.1912, "step": 928 }, { "epoch": 1.834, "grad_norm": 0.15640737116336823, "learning_rate": 1.263712017367036e-07, "loss": 0.2173, "step": 929 }, { "epoch": 1.8359999999999999, "grad_norm": 0.2588789463043213, "learning_rate": 1.2285106557296479e-07, "loss": 0.3506, "step": 930 }, { "epoch": 1.838, "grad_norm": 0.21290963888168335, "learning_rate": 1.193800422448499e-07, "loss": 0.2377, "step": 931 }, { "epoch": 1.8399999999999999, "grad_norm": 0.198676198720932, "learning_rate": 1.1595816670552429e-07, "loss": 0.1823, "step": 932 }, { "epoch": 1.842, "grad_norm": 0.23629765212535858, "learning_rate": 1.12585473413237e-07, "loss": 0.2565, "step": 933 }, { "epoch": 1.8439999999999999, "grad_norm": 0.23395268619060516, "learning_rate": 1.0926199633097156e-07, "loss": 0.2184, "step": 934 }, { "epoch": 1.846, "grad_norm": 0.2589554190635681, "learning_rate": 1.0598776892610685e-07, "loss": 0.369, "step": 935 }, { "epoch": 1.8479999999999999, "grad_norm": 0.22093115746974945, "learning_rate": 1.0276282417007399e-07, "loss": 0.3437, "step": 936 }, { "epoch": 1.85, "grad_norm": 0.23697194457054138, "learning_rate": 9.958719453803278e-08, "loss": 0.3288, "step": 937 }, { "epoch": 1.8519999999999999, "grad_norm": 0.22383596003055573, "learning_rate": 9.646091200853802e-08, "loss": 0.4897, "step": 938 }, { "epoch": 1.854, "grad_norm": 0.20475724339485168, "learning_rate": 9.338400806321979e-08, "loss": 0.257, "step": 939 }, { 
"epoch": 1.8559999999999999, "grad_norm": 0.263615220785141, "learning_rate": 9.035651368646647e-08, "loss": 0.4592, "step": 940 }, { "epoch": 1.858, "grad_norm": 0.24478185176849365, "learning_rate": 8.737845936511335e-08, "loss": 0.4337, "step": 941 }, { "epoch": 1.8599999999999999, "grad_norm": 0.2436402142047882, "learning_rate": 8.444987508813451e-08, "loss": 0.3344, "step": 942 }, { "epoch": 1.862, "grad_norm": 0.23337677121162415, "learning_rate": 8.157079034633974e-08, "loss": 0.2967, "step": 943 }, { "epoch": 1.8639999999999999, "grad_norm": 0.20073962211608887, "learning_rate": 7.874123413208145e-08, "loss": 0.1952, "step": 944 }, { "epoch": 1.866, "grad_norm": 0.2582467496395111, "learning_rate": 7.59612349389599e-08, "loss": 0.372, "step": 945 }, { "epoch": 1.8679999999999999, "grad_norm": 0.2121819704771042, "learning_rate": 7.32308207615351e-08, "loss": 0.2619, "step": 946 }, { "epoch": 1.87, "grad_norm": 0.16836410760879517, "learning_rate": 7.055001909504755e-08, "loss": 0.293, "step": 947 }, { "epoch": 1.8719999999999999, "grad_norm": 0.18819768726825714, "learning_rate": 6.791885693514134e-08, "loss": 0.2476, "step": 948 }, { "epoch": 1.874, "grad_norm": 0.2157561331987381, "learning_rate": 6.533736077758868e-08, "loss": 0.2615, "step": 949 }, { "epoch": 1.876, "grad_norm": 0.24670301377773285, "learning_rate": 6.280555661802857e-08, "loss": 0.371, "step": 950 }, { "epoch": 1.8780000000000001, "grad_norm": 0.21483668684959412, "learning_rate": 6.032346995169968e-08, "loss": 0.2231, "step": 951 }, { "epoch": 1.88, "grad_norm": 0.1763847917318344, "learning_rate": 5.7891125773187896e-08, "loss": 0.2074, "step": 952 }, { "epoch": 1.8820000000000001, "grad_norm": 0.20190970599651337, "learning_rate": 5.550854857617194e-08, "loss": 0.3226, "step": 953 }, { "epoch": 1.884, "grad_norm": 0.23266001045703888, "learning_rate": 5.3175762353177563e-08, "loss": 0.3055, "step": 954 }, { "epoch": 1.8860000000000001, "grad_norm": 0.26426488161087036, "learning_rate": 5.089279059533658e-08, "loss": 0.3319, "step": 955 }, { "epoch": 1.888, "grad_norm": 0.24322916567325592, "learning_rate": 4.865965629214819e-08, "loss": 0.2372, "step": 956 }, { "epoch": 1.8900000000000001, "grad_norm": 0.23628686368465424, "learning_rate": 4.6476381931251366e-08, "loss": 0.3808, "step": 957 }, { "epoch": 1.892, "grad_norm": 0.16934725642204285, "learning_rate": 4.434298949819449e-08, "loss": 0.1737, "step": 958 }, { "epoch": 1.8940000000000001, "grad_norm": 0.30660754442214966, "learning_rate": 4.225950047621441e-08, "loss": 0.3483, "step": 959 }, { "epoch": 1.896, "grad_norm": 0.27640894055366516, "learning_rate": 4.02259358460233e-08, "loss": 0.3264, "step": 960 }, { "epoch": 1.8980000000000001, "grad_norm": 0.2123912125825882, "learning_rate": 3.8242316085594923e-08, "loss": 0.3876, "step": 961 }, { "epoch": 1.9, "grad_norm": 0.2987152636051178, "learning_rate": 3.630866116995757e-08, "loss": 0.4525, "step": 962 }, { "epoch": 1.9020000000000001, "grad_norm": 0.22001074254512787, "learning_rate": 3.44249905709948e-08, "loss": 0.1842, "step": 963 }, { "epoch": 1.904, "grad_norm": 0.20775096118450165, "learning_rate": 3.25913232572489e-08, "loss": 0.3012, "step": 964 }, { "epoch": 1.9060000000000001, "grad_norm": 0.19180834293365479, "learning_rate": 3.080767769372939e-08, "loss": 0.2681, "step": 965 }, { "epoch": 1.908, "grad_norm": 0.22222468256950378, "learning_rate": 2.907407184172706e-08, "loss": 0.1809, "step": 966 }, { "epoch": 1.9100000000000001, "grad_norm": 0.20555076003074646, "learning_rate": 
2.7390523158633552e-08, "loss": 0.1482, "step": 967 }, { "epoch": 1.912, "grad_norm": 0.29668375849723816, "learning_rate": 2.57570485977654e-08, "loss": 0.2179, "step": 968 }, { "epoch": 1.914, "grad_norm": 0.19830183684825897, "learning_rate": 2.4173664608193592e-08, "loss": 0.2677, "step": 969 }, { "epoch": 1.916, "grad_norm": 0.23050029575824738, "learning_rate": 2.264038713457706e-08, "loss": 0.3348, "step": 970 }, { "epoch": 1.918, "grad_norm": 0.36921679973602295, "learning_rate": 2.1157231617002783e-08, "loss": 0.4821, "step": 971 }, { "epoch": 1.92, "grad_norm": 0.16172367334365845, "learning_rate": 1.9724212990830938e-08, "loss": 0.2348, "step": 972 }, { "epoch": 1.922, "grad_norm": 0.18016183376312256, "learning_rate": 1.834134568654333e-08, "loss": 0.2486, "step": 973 }, { "epoch": 1.924, "grad_norm": 0.32527899742126465, "learning_rate": 1.7008643629596866e-08, "loss": 0.3623, "step": 974 }, { "epoch": 1.926, "grad_norm": 0.21802493929862976, "learning_rate": 1.5726120240288632e-08, "loss": 0.2155, "step": 975 }, { "epoch": 1.928, "grad_norm": 0.23393763601779938, "learning_rate": 1.449378843361271e-08, "loss": 0.284, "step": 976 }, { "epoch": 1.93, "grad_norm": 0.2498655915260315, "learning_rate": 1.3311660619138578e-08, "loss": 0.2816, "step": 977 }, { "epoch": 1.932, "grad_norm": 0.20273719727993011, "learning_rate": 1.2179748700879013e-08, "loss": 0.2945, "step": 978 }, { "epoch": 1.934, "grad_norm": 0.16979333758354187, "learning_rate": 1.109806407717462e-08, "loss": 0.1949, "step": 979 }, { "epoch": 1.936, "grad_norm": 0.18881943821907043, "learning_rate": 1.006661764057837e-08, "loss": 0.2681, "step": 980 }, { "epoch": 1.938, "grad_norm": 0.23016507923603058, "learning_rate": 9.085419777743465e-09, "loss": 0.4162, "step": 981 }, { "epoch": 1.94, "grad_norm": 0.21829769015312195, "learning_rate": 8.15448036932176e-09, "loss": 0.3911, "step": 982 }, { "epoch": 1.942, "grad_norm": 0.192356139421463, "learning_rate": 7.273808789862724e-09, "loss": 0.3076, "step": 983 }, { "epoch": 1.944, "grad_norm": 0.20806097984313965, "learning_rate": 6.4434139077201865e-09, "loss": 0.2808, "step": 984 }, { "epoch": 1.946, "grad_norm": 0.2533554434776306, "learning_rate": 5.6633040849601865e-09, "loss": 0.264, "step": 985 }, { "epoch": 1.948, "grad_norm": 0.25440603494644165, "learning_rate": 4.933487177280483e-09, "loss": 0.386, "step": 986 }, { "epoch": 1.95, "grad_norm": 0.2403300553560257, "learning_rate": 4.253970533929508e-09, "loss": 0.2665, "step": 987 }, { "epoch": 1.952, "grad_norm": 0.18095187842845917, "learning_rate": 3.6247609976319818e-09, "loss": 0.2414, "step": 988 }, { "epoch": 1.954, "grad_norm": 0.43698740005493164, "learning_rate": 3.0458649045211897e-09, "loss": 0.4131, "step": 989 }, { "epoch": 1.956, "grad_norm": 0.2908496856689453, "learning_rate": 2.5172880840745873e-09, "loss": 0.2955, "step": 990 }, { "epoch": 1.958, "grad_norm": 0.19435322284698486, "learning_rate": 2.0390358590538507e-09, "loss": 0.1839, "step": 991 }, { "epoch": 1.96, "grad_norm": 0.20639224350452423, "learning_rate": 1.61111304545436e-09, "loss": 0.336, "step": 992 }, { "epoch": 1.962, "grad_norm": 0.18591168522834778, "learning_rate": 1.2335239524541298e-09, "loss": 0.2653, "step": 993 }, { "epoch": 1.964, "grad_norm": 0.2295517921447754, "learning_rate": 9.062723823710651e-10, "loss": 0.3478, "step": 994 }, { "epoch": 1.966, "grad_norm": 0.2810915410518646, "learning_rate": 6.293616306246586e-10, "loss": 
0.3266, "step": 995 }, { "epoch": 1.968, "grad_norm": 0.19316555559635162, "learning_rate": 4.027944857032395e-10, "loss": 0.2753, "step": 996 }, { "epoch": 1.97, "grad_norm": 0.24243375658988953, "learning_rate": 2.265732291356626e-10, "loss": 0.2786, "step": 997 }, { "epoch": 1.972, "grad_norm": 0.27688726782798767, "learning_rate": 1.0069963546743833e-10, "loss": 0.2615, "step": 998 }, { "epoch": 1.974, "grad_norm": 0.18696589767932892, "learning_rate": 2.5174972244634834e-11, "loss": 0.2866, "step": 999 }, { "epoch": 1.976, "grad_norm": 0.21791526675224304, "learning_rate": 0.0, "loss": 0.2074, "step": 1000 }, { "epoch": 1.976, "eval_loss": 0.26330506801605225, "eval_runtime": 76.7272, "eval_samples_per_second": 7.194, "eval_steps_per_second": 0.899, "step": 1000 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.531674674724864e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }