diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,6155 +3,3089 @@ "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, - "global_step": 43800, + "global_step": 21900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.00228310502283105, - "grad_norm": 33.85030746459961, - "learning_rate": 2.9965753424657534e-05, - "loss": 4.7256, + "epoch": 0.0045662100456621, + "grad_norm": 22.38092803955078, + "learning_rate": 2.993150684931507e-05, + "loss": 5.0082, "step": 50 }, { - "epoch": 0.0045662100456621, - "grad_norm": 28.36233139038086, - "learning_rate": 2.993150684931507e-05, - "loss": 4.0093, + "epoch": 0.0091324200913242, + "grad_norm": 19.525426864624023, + "learning_rate": 2.9863013698630136e-05, + "loss": 4.0853, "step": 100 }, { - "epoch": 0.00684931506849315, - "grad_norm": 40.79988098144531, - "learning_rate": 2.9897260273972603e-05, - "loss": 3.3789, + "epoch": 0.0136986301369863, + "grad_norm": 29.21298599243164, + "learning_rate": 2.9794520547945206e-05, + "loss": 3.8879, "step": 150 }, { - "epoch": 0.0091324200913242, - "grad_norm": 38.37752914428711, - "learning_rate": 2.9863013698630136e-05, - "loss": 2.5531, + "epoch": 0.0182648401826484, + "grad_norm": 29.60974884033203, + "learning_rate": 2.9726027397260275e-05, + "loss": 3.5946, "step": 200 }, { - "epoch": 0.01141552511415525, - "grad_norm": 46.2108268737793, - "learning_rate": 2.9828767123287673e-05, - "loss": 2.4412, + "epoch": 0.0228310502283105, + "grad_norm": 20.75732421875, + "learning_rate": 2.9657534246575345e-05, + "loss": 3.1165, "step": 250 }, { - "epoch": 0.0136986301369863, - "grad_norm": 22.16556739807129, - "learning_rate": 2.9794520547945206e-05, - "loss": 2.1968, + "epoch": 0.0273972602739726, + "grad_norm": 31.49699592590332, + "learning_rate": 2.958904109589041e-05, + "loss": 2.7346, "step": 300 }, { - "epoch": 0.01598173515981735, - "grad_norm": 38.14749526977539, - "learning_rate": 2.9760273972602742e-05, - "loss": 2.0913, + "epoch": 0.0319634703196347, + "grad_norm": 43.215335845947266, + "learning_rate": 2.952054794520548e-05, + "loss": 2.3509, "step": 350 }, { - "epoch": 0.0182648401826484, - "grad_norm": 38.938079833984375, - "learning_rate": 2.9726027397260275e-05, - "loss": 2.0564, + "epoch": 0.0365296803652968, + "grad_norm": 29.553388595581055, + "learning_rate": 2.945205479452055e-05, + "loss": 2.3374, "step": 400 }, { - "epoch": 0.02054794520547945, - "grad_norm": 51.680259704589844, - "learning_rate": 2.969178082191781e-05, - "loss": 1.9041, + "epoch": 0.0410958904109589, + "grad_norm": 30.272083282470703, + "learning_rate": 2.938356164383562e-05, + "loss": 2.0551, "step": 450 }, { - "epoch": 0.0228310502283105, - "grad_norm": 23.21047019958496, - "learning_rate": 2.9657534246575345e-05, - "loss": 1.7974, + "epoch": 0.045662100456621, + "grad_norm": 23.157560348510742, + "learning_rate": 2.9315068493150685e-05, + "loss": 1.9659, "step": 500 }, { - "epoch": 0.02511415525114155, - "grad_norm": 27.936336517333984, - "learning_rate": 2.9623287671232878e-05, - "loss": 1.6824, + "epoch": 0.0502283105022831, + "grad_norm": 39.72032165527344, + "learning_rate": 2.9246575342465755e-05, + "loss": 1.7552, "step": 550 }, { - "epoch": 0.0273972602739726, - "grad_norm": 121.84676361083984, - "learning_rate": 2.958904109589041e-05, - "loss": 2.0853, + "epoch": 0.0547945205479452, + "grad_norm": 24.441036224365234, + "learning_rate": 2.9178082191780824e-05, + "loss": 1.7992, "step": 600 }, { - "epoch": 0.02968036529680365, - "grad_norm": 51.72910690307617, - "learning_rate": 2.9554794520547947e-05, - "loss": 1.8569, + "epoch": 0.0593607305936073, + "grad_norm": 32.07455825805664, + "learning_rate": 2.910958904109589e-05, + "loss": 1.6968, "step": 650 }, { - "epoch": 0.0319634703196347, - "grad_norm": 27.697223663330078, - "learning_rate": 2.952054794520548e-05, - "loss": 1.7805, + "epoch": 0.0639269406392694, + "grad_norm": 25.034423828125, + "learning_rate": 2.9041095890410956e-05, + "loss": 1.788, "step": 700 }, { - "epoch": 0.03424657534246575, - "grad_norm": 59.87869644165039, - "learning_rate": 2.9486301369863017e-05, - "loss": 1.8421, + "epoch": 0.0684931506849315, + "grad_norm": 33.06129455566406, + "learning_rate": 2.8972602739726026e-05, + "loss": 1.8157, "step": 750 }, { - "epoch": 0.0365296803652968, - "grad_norm": 20.744386672973633, - "learning_rate": 2.945205479452055e-05, - "loss": 1.8149, + "epoch": 0.0730593607305936, + "grad_norm": 32.716251373291016, + "learning_rate": 2.8904109589041095e-05, + "loss": 1.7502, "step": 800 }, { - "epoch": 0.03881278538812785, - "grad_norm": 22.272050857543945, - "learning_rate": 2.9417808219178083e-05, - "loss": 1.6892, + "epoch": 0.0776255707762557, + "grad_norm": 27.18515968322754, + "learning_rate": 2.8835616438356165e-05, + "loss": 1.5629, "step": 850 }, { - "epoch": 0.0410958904109589, - "grad_norm": 34.34733963012695, - "learning_rate": 2.938356164383562e-05, - "loss": 1.6013, + "epoch": 0.0821917808219178, + "grad_norm": 19.844303131103516, + "learning_rate": 2.876712328767123e-05, + "loss": 1.6421, "step": 900 }, { - "epoch": 0.04337899543378995, - "grad_norm": 19.482553482055664, - "learning_rate": 2.9349315068493152e-05, - "loss": 1.6772, + "epoch": 0.0867579908675799, + "grad_norm": 160.48770141601562, + "learning_rate": 2.86986301369863e-05, + "loss": 1.6035, "step": 950 }, { - "epoch": 0.045662100456621, - "grad_norm": 19.61932945251465, - "learning_rate": 2.9315068493150685e-05, - "loss": 1.7372, + "epoch": 0.091324200913242, + "grad_norm": 35.13261032104492, + "learning_rate": 2.863013698630137e-05, + "loss": 1.4939, "step": 1000 }, { - "epoch": 0.04794520547945205, - "grad_norm": 47.28583526611328, - "learning_rate": 2.9280821917808222e-05, - "loss": 1.6459, + "epoch": 0.0958904109589041, + "grad_norm": 24.28845977783203, + "learning_rate": 2.856164383561644e-05, + "loss": 1.5611, "step": 1050 }, { - "epoch": 0.0502283105022831, - "grad_norm": 74.12429809570312, - "learning_rate": 2.9246575342465755e-05, - "loss": 1.4242, + "epoch": 0.1004566210045662, + "grad_norm": 30.07063865661621, + "learning_rate": 2.8493150684931505e-05, + "loss": 1.4715, "step": 1100 }, { - "epoch": 0.05251141552511415, - "grad_norm": 28.49104118347168, - "learning_rate": 2.921232876712329e-05, - "loss": 1.4958, + "epoch": 0.1050228310502283, + "grad_norm": 23.672300338745117, + "learning_rate": 2.8424657534246575e-05, + "loss": 1.534, "step": 1150 }, { - "epoch": 0.0547945205479452, - "grad_norm": 69.21525573730469, - "learning_rate": 2.9178082191780824e-05, - "loss": 1.4848, + "epoch": 0.1095890410958904, + "grad_norm": 27.45283317565918, + "learning_rate": 2.8356164383561644e-05, + "loss": 1.5268, "step": 1200 }, { - "epoch": 0.05707762557077625, - "grad_norm": 28.827421188354492, - "learning_rate": 2.9143835616438357e-05, - "loss": 1.451, + "epoch": 0.1141552511415525, + "grad_norm": 26.83701515197754, + "learning_rate": 2.8287671232876714e-05, + "loss": 1.4089, "step": 1250 }, { - "epoch": 0.0593607305936073, - "grad_norm": 69.60295867919922, - "learning_rate": 2.910958904109589e-05, - "loss": 1.6166, + "epoch": 0.1187214611872146, + "grad_norm": 19.307844161987305, + "learning_rate": 2.821917808219178e-05, + "loss": 1.4929, "step": 1300 }, { - "epoch": 0.06164383561643835, - "grad_norm": 43.529075622558594, - "learning_rate": 2.9075342465753423e-05, - "loss": 1.5593, + "epoch": 0.1232876712328767, + "grad_norm": 14.790838241577148, + "learning_rate": 2.815068493150685e-05, + "loss": 1.4687, "step": 1350 }, { - "epoch": 0.0639269406392694, - "grad_norm": 21.382102966308594, - "learning_rate": 2.9041095890410956e-05, - "loss": 1.5083, + "epoch": 0.1278538812785388, + "grad_norm": 16.656993865966797, + "learning_rate": 2.808219178082192e-05, + "loss": 1.3456, "step": 1400 }, { - "epoch": 0.06621004566210045, - "grad_norm": 38.85325622558594, - "learning_rate": 2.9006849315068493e-05, - "loss": 1.6731, + "epoch": 0.1324200913242009, + "grad_norm": 14.602429389953613, + "learning_rate": 2.801369863013699e-05, + "loss": 1.2725, "step": 1450 }, { - "epoch": 0.0684931506849315, - "grad_norm": 31.006072998046875, - "learning_rate": 2.8972602739726026e-05, - "loss": 1.4695, + "epoch": 0.136986301369863, + "grad_norm": 23.853199005126953, + "learning_rate": 2.7945205479452054e-05, + "loss": 1.2684, "step": 1500 }, { - "epoch": 0.07077625570776255, - "grad_norm": 26.75164222717285, - "learning_rate": 2.8938356164383562e-05, - "loss": 1.574, + "epoch": 0.1415525114155251, + "grad_norm": 40.28663635253906, + "learning_rate": 2.7876712328767124e-05, + "loss": 1.2728, "step": 1550 }, { - "epoch": 0.0730593607305936, - "grad_norm": 38.76344299316406, - "learning_rate": 2.8904109589041095e-05, - "loss": 1.5674, + "epoch": 0.1461187214611872, + "grad_norm": 11.038789749145508, + "learning_rate": 2.7808219178082193e-05, + "loss": 1.3578, "step": 1600 }, { - "epoch": 0.07534246575342465, - "grad_norm": 34.11776351928711, - "learning_rate": 2.886986301369863e-05, - "loss": 1.3804, + "epoch": 0.1506849315068493, + "grad_norm": 22.868593215942383, + "learning_rate": 2.7739726027397263e-05, + "loss": 1.3941, "step": 1650 }, { - "epoch": 0.0776255707762557, - "grad_norm": 45.00776290893555, - "learning_rate": 2.8835616438356165e-05, - "loss": 1.5021, + "epoch": 0.1552511415525114, + "grad_norm": 23.89753532409668, + "learning_rate": 2.767123287671233e-05, + "loss": 1.3071, "step": 1700 }, { - "epoch": 0.07990867579908675, - "grad_norm": 85.93338775634766, - "learning_rate": 2.8801369863013698e-05, - "loss": 1.4673, + "epoch": 0.1598173515981735, + "grad_norm": 40.02177429199219, + "learning_rate": 2.76027397260274e-05, + "loss": 1.3711, "step": 1750 }, { - "epoch": 0.0821917808219178, - "grad_norm": 20.37348747253418, - "learning_rate": 2.876712328767123e-05, - "loss": 1.5315, + "epoch": 0.1643835616438356, + "grad_norm": 40.53044128417969, + "learning_rate": 2.7534246575342468e-05, + "loss": 1.1694, "step": 1800 }, { - "epoch": 0.08447488584474885, - "grad_norm": 18.73834800720215, - "learning_rate": 2.8732876712328767e-05, - "loss": 1.4074, + "epoch": 0.1689497716894977, + "grad_norm": 38.70663070678711, + "learning_rate": 2.7465753424657537e-05, + "loss": 1.2284, "step": 1850 }, { - "epoch": 0.0867579908675799, - "grad_norm": 38.19795608520508, - "learning_rate": 2.86986301369863e-05, - "loss": 1.4442, + "epoch": 0.1735159817351598, + "grad_norm": 114.71417236328125, + "learning_rate": 2.7397260273972603e-05, + "loss": 1.1634, "step": 1900 }, { - "epoch": 0.08904109589041095, - "grad_norm": 18.31586265563965, - "learning_rate": 2.8664383561643837e-05, - "loss": 1.4918, + "epoch": 0.1780821917808219, + "grad_norm": 34.596923828125, + "learning_rate": 2.732876712328767e-05, + "loss": 1.2858, "step": 1950 }, { - "epoch": 0.091324200913242, - "grad_norm": 44.28574752807617, - "learning_rate": 2.863013698630137e-05, - "loss": 1.3071, + "epoch": 0.182648401826484, + "grad_norm": 23.52661895751953, + "learning_rate": 2.726027397260274e-05, + "loss": 1.2273, "step": 2000 }, { - "epoch": 0.09360730593607305, - "grad_norm": 51.20829391479492, - "learning_rate": 2.8595890410958903e-05, - "loss": 1.6107, + "epoch": 0.1872146118721461, + "grad_norm": 22.8360652923584, + "learning_rate": 2.719178082191781e-05, + "loss": 1.2622, "step": 2050 }, { - "epoch": 0.0958904109589041, - "grad_norm": 45.16651153564453, - "learning_rate": 2.856164383561644e-05, - "loss": 1.1947, + "epoch": 0.1917808219178082, + "grad_norm": 17.08322525024414, + "learning_rate": 2.7123287671232875e-05, + "loss": 1.3012, "step": 2100 }, { - "epoch": 0.09817351598173515, - "grad_norm": 3.6301872730255127, - "learning_rate": 2.8527397260273972e-05, - "loss": 1.4133, + "epoch": 0.1963470319634703, + "grad_norm": 12.923150062561035, + "learning_rate": 2.7054794520547944e-05, + "loss": 1.3847, "step": 2150 }, { - "epoch": 0.1004566210045662, - "grad_norm": 19.277009963989258, - "learning_rate": 2.8493150684931505e-05, - "loss": 1.2874, + "epoch": 0.2009132420091324, + "grad_norm": 73.03246307373047, + "learning_rate": 2.6986301369863014e-05, + "loss": 1.3173, "step": 2200 }, { - "epoch": 0.10273972602739725, - "grad_norm": 36.330936431884766, - "learning_rate": 2.8458904109589042e-05, - "loss": 1.4955, + "epoch": 0.2054794520547945, + "grad_norm": 16.656768798828125, + "learning_rate": 2.6917808219178083e-05, + "loss": 1.1779, "step": 2250 }, { - "epoch": 0.1050228310502283, - "grad_norm": 15.057435035705566, - "learning_rate": 2.8424657534246575e-05, - "loss": 1.3309, + "epoch": 0.2100456621004566, + "grad_norm": 22.919973373413086, + "learning_rate": 2.684931506849315e-05, + "loss": 1.2497, "step": 2300 }, { - "epoch": 0.10730593607305935, - "grad_norm": 31.65122413635254, - "learning_rate": 2.839041095890411e-05, - "loss": 1.3836, + "epoch": 0.2146118721461187, + "grad_norm": 20.0987491607666, + "learning_rate": 2.678082191780822e-05, + "loss": 1.1787, "step": 2350 }, { - "epoch": 0.1095890410958904, - "grad_norm": 26.276851654052734, - "learning_rate": 2.8356164383561644e-05, - "loss": 1.4872, + "epoch": 0.2191780821917808, + "grad_norm": 84.04356384277344, + "learning_rate": 2.6712328767123288e-05, + "loss": 1.1989, "step": 2400 }, { - "epoch": 0.11187214611872145, - "grad_norm": 80.19303131103516, - "learning_rate": 2.8321917808219177e-05, - "loss": 1.4708, + "epoch": 0.2237442922374429, + "grad_norm": 12.585476875305176, + "learning_rate": 2.6643835616438358e-05, + "loss": 1.2324, "step": 2450 }, { - "epoch": 0.1141552511415525, - "grad_norm": 23.674320220947266, - "learning_rate": 2.8287671232876714e-05, - "loss": 1.1475, + "epoch": 0.228310502283105, + "grad_norm": 18.206226348876953, + "learning_rate": 2.6575342465753424e-05, + "loss": 1.2541, "step": 2500 }, { - "epoch": 0.11643835616438356, - "grad_norm": 29.5810604095459, - "learning_rate": 2.8253424657534247e-05, - "loss": 1.5903, + "epoch": 0.2328767123287671, + "grad_norm": 24.236295700073242, + "learning_rate": 2.6506849315068493e-05, + "loss": 1.1755, "step": 2550 }, { - "epoch": 0.1187214611872146, - "grad_norm": 32.49135208129883, - "learning_rate": 2.821917808219178e-05, - "loss": 1.3248, + "epoch": 0.2374429223744292, + "grad_norm": 21.91484260559082, + "learning_rate": 2.6438356164383563e-05, + "loss": 1.1436, "step": 2600 }, { - "epoch": 0.12100456621004566, - "grad_norm": 33.26783752441406, - "learning_rate": 2.8184931506849316e-05, - "loss": 1.5587, + "epoch": 0.2420091324200913, + "grad_norm": 25.741945266723633, + "learning_rate": 2.6369863013698632e-05, + "loss": 1.146, "step": 2650 }, { - "epoch": 0.1232876712328767, - "grad_norm": 34.50761795043945, - "learning_rate": 2.815068493150685e-05, - "loss": 1.3713, + "epoch": 0.2465753424657534, + "grad_norm": 21.776227951049805, + "learning_rate": 2.6301369863013698e-05, + "loss": 1.1521, "step": 2700 }, { - "epoch": 0.12557077625570776, - "grad_norm": 21.493181228637695, - "learning_rate": 2.8116438356164386e-05, - "loss": 1.2665, + "epoch": 0.2511415525114155, + "grad_norm": 30.281368255615234, + "learning_rate": 2.6232876712328768e-05, + "loss": 1.0747, "step": 2750 }, { - "epoch": 0.1278538812785388, - "grad_norm": 10.42421817779541, - "learning_rate": 2.808219178082192e-05, - "loss": 1.3406, + "epoch": 0.2557077625570776, + "grad_norm": 16.21628189086914, + "learning_rate": 2.6164383561643837e-05, + "loss": 1.2503, "step": 2800 }, { - "epoch": 0.13013698630136986, - "grad_norm": 18.451990127563477, - "learning_rate": 2.8047945205479452e-05, - "loss": 1.2502, + "epoch": 0.2602739726027397, + "grad_norm": 17.022933959960938, + "learning_rate": 2.6095890410958907e-05, + "loss": 1.1719, "step": 2850 }, { - "epoch": 0.1324200913242009, - "grad_norm": 27.949146270751953, - "learning_rate": 2.801369863013699e-05, - "loss": 1.2492, + "epoch": 0.2648401826484018, + "grad_norm": 17.665477752685547, + "learning_rate": 2.6027397260273973e-05, + "loss": 1.091, "step": 2900 }, { - "epoch": 0.13470319634703196, - "grad_norm": 19.943056106567383, - "learning_rate": 2.797945205479452e-05, - "loss": 1.4071, + "epoch": 0.2694063926940639, + "grad_norm": 12.710230827331543, + "learning_rate": 2.5958904109589042e-05, + "loss": 1.1828, "step": 2950 }, { - "epoch": 0.136986301369863, - "grad_norm": 30.464204788208008, - "learning_rate": 2.7945205479452054e-05, - "loss": 1.2135, + "epoch": 0.273972602739726, + "grad_norm": 23.11914825439453, + "learning_rate": 2.589041095890411e-05, + "loss": 1.0025, "step": 3000 }, { - "epoch": 0.13926940639269406, - "grad_norm": 20.832355499267578, - "learning_rate": 2.791095890410959e-05, - "loss": 1.2872, + "epoch": 0.2785388127853881, + "grad_norm": 17.97935676574707, + "learning_rate": 2.582191780821918e-05, + "loss": 1.2145, "step": 3050 }, { - "epoch": 0.1415525114155251, - "grad_norm": 13.37592601776123, - "learning_rate": 2.7876712328767124e-05, - "loss": 1.1165, + "epoch": 0.2831050228310502, + "grad_norm": 28.123554229736328, + "learning_rate": 2.5753424657534247e-05, + "loss": 1.2629, "step": 3100 }, { - "epoch": 0.14383561643835616, - "grad_norm": 36.62568664550781, - "learning_rate": 2.784246575342466e-05, - "loss": 1.2849, + "epoch": 0.2876712328767123, + "grad_norm": 17.924665451049805, + "learning_rate": 2.5684931506849317e-05, + "loss": 1.0439, "step": 3150 }, { - "epoch": 0.1461187214611872, - "grad_norm": 11.884871482849121, - "learning_rate": 2.7808219178082193e-05, - "loss": 1.241, + "epoch": 0.2922374429223744, + "grad_norm": 24.03557777404785, + "learning_rate": 2.5616438356164386e-05, + "loss": 1.127, "step": 3200 }, { - "epoch": 0.14840182648401826, - "grad_norm": 22.552181243896484, - "learning_rate": 2.7773972602739726e-05, - "loss": 1.1695, + "epoch": 0.2968036529680365, + "grad_norm": 30.70620346069336, + "learning_rate": 2.5547945205479452e-05, + "loss": 1.1464, "step": 3250 }, { - "epoch": 0.1506849315068493, - "grad_norm": 29.05255126953125, - "learning_rate": 2.7739726027397263e-05, - "loss": 1.3266, + "epoch": 0.3013698630136986, + "grad_norm": 21.80393409729004, + "learning_rate": 2.5479452054794518e-05, + "loss": 0.9603, "step": 3300 }, { - "epoch": 0.15296803652968036, - "grad_norm": 27.039731979370117, - "learning_rate": 2.7705479452054796e-05, - "loss": 1.4528, + "epoch": 0.3059360730593607, + "grad_norm": 23.686079025268555, + "learning_rate": 2.5410958904109588e-05, + "loss": 1.0745, "step": 3350 }, { - "epoch": 0.1552511415525114, - "grad_norm": 12.077860832214355, - "learning_rate": 2.767123287671233e-05, - "loss": 1.0587, + "epoch": 0.3105022831050228, + "grad_norm": 10.258773803710938, + "learning_rate": 2.5342465753424657e-05, + "loss": 1.0776, "step": 3400 }, { - "epoch": 0.15753424657534246, - "grad_norm": 33.64236068725586, - "learning_rate": 2.7636986301369865e-05, - "loss": 1.3053, + "epoch": 0.3150684931506849, + "grad_norm": 28.77837562561035, + "learning_rate": 2.5273972602739727e-05, + "loss": 1.1781, "step": 3450 }, { - "epoch": 0.1598173515981735, - "grad_norm": 7.781187534332275, - "learning_rate": 2.76027397260274e-05, - "loss": 1.5692, + "epoch": 0.319634703196347, + "grad_norm": 39.38608932495117, + "learning_rate": 2.5205479452054793e-05, + "loss": 0.9749, "step": 3500 }, { - "epoch": 0.16210045662100456, - "grad_norm": 58.28102111816406, - "learning_rate": 2.7568493150684935e-05, - "loss": 1.0579, + "epoch": 0.3242009132420091, + "grad_norm": 29.596742630004883, + "learning_rate": 2.5136986301369862e-05, + "loss": 1.1099, "step": 3550 }, { - "epoch": 0.1643835616438356, - "grad_norm": 46.713260650634766, - "learning_rate": 2.7534246575342468e-05, - "loss": 1.1327, + "epoch": 0.3287671232876712, + "grad_norm": 15.371848106384277, + "learning_rate": 2.5068493150684932e-05, + "loss": 1.1363, "step": 3600 }, { - "epoch": 0.16666666666666666, - "grad_norm": 38.657493591308594, - "learning_rate": 2.75e-05, - "loss": 1.299, + "epoch": 0.3333333333333333, + "grad_norm": 27.008398056030273, + "learning_rate": 2.5e-05, + "loss": 1.2275, "step": 3650 }, { - "epoch": 0.1689497716894977, - "grad_norm": 7.117087364196777, - "learning_rate": 2.7465753424657537e-05, - "loss": 1.0367, + "epoch": 0.3378995433789954, + "grad_norm": 54.85331726074219, + "learning_rate": 2.4931506849315067e-05, + "loss": 1.2297, "step": 3700 }, { - "epoch": 0.17123287671232876, - "grad_norm": 33.10891342163086, - "learning_rate": 2.743150684931507e-05, - "loss": 1.1318, + "epoch": 0.3424657534246575, + "grad_norm": 10.650010108947754, + "learning_rate": 2.4863013698630137e-05, + "loss": 1.1716, "step": 3750 }, { - "epoch": 0.1735159817351598, - "grad_norm": 38.68271255493164, - "learning_rate": 2.7397260273972603e-05, - "loss": 1.1906, + "epoch": 0.3470319634703196, + "grad_norm": 21.537841796875, + "learning_rate": 2.4794520547945206e-05, + "loss": 1.2375, "step": 3800 }, { - "epoch": 0.17579908675799086, - "grad_norm": 45.01628494262695, - "learning_rate": 2.736301369863014e-05, - "loss": 1.1285, + "epoch": 0.3515981735159817, + "grad_norm": 31.040218353271484, + "learning_rate": 2.4726027397260276e-05, + "loss": 1.0511, "step": 3850 }, { - "epoch": 0.1780821917808219, - "grad_norm": 20.761018753051758, - "learning_rate": 2.732876712328767e-05, - "loss": 1.4117, + "epoch": 0.3561643835616438, + "grad_norm": 13.31843090057373, + "learning_rate": 2.4657534246575342e-05, + "loss": 1.1581, "step": 3900 }, { - "epoch": 0.18036529680365296, - "grad_norm": 46.919952392578125, - "learning_rate": 2.7294520547945206e-05, - "loss": 1.0591, + "epoch": 0.3607305936073059, + "grad_norm": 22.53802490234375, + "learning_rate": 2.458904109589041e-05, + "loss": 1.0627, "step": 3950 }, { - "epoch": 0.182648401826484, - "grad_norm": 78.8492431640625, - "learning_rate": 2.726027397260274e-05, - "loss": 1.3741, + "epoch": 0.365296803652968, + "grad_norm": 20.266468048095703, + "learning_rate": 2.452054794520548e-05, + "loss": 1.1103, "step": 4000 }, { - "epoch": 0.18493150684931506, - "grad_norm": 25.57372283935547, - "learning_rate": 2.7226027397260272e-05, - "loss": 1.1287, + "epoch": 0.3698630136986301, + "grad_norm": 9.198668479919434, + "learning_rate": 2.445205479452055e-05, + "loss": 1.136, "step": 4050 }, { - "epoch": 0.1872146118721461, - "grad_norm": 16.47197723388672, - "learning_rate": 2.719178082191781e-05, - "loss": 1.3216, + "epoch": 0.3744292237442922, + "grad_norm": 13.770018577575684, + "learning_rate": 2.4383561643835616e-05, + "loss": 1.0984, "step": 4100 }, { - "epoch": 0.18949771689497716, - "grad_norm": 36.24203109741211, - "learning_rate": 2.715753424657534e-05, - "loss": 1.3095, + "epoch": 0.3789954337899543, + "grad_norm": 25.558441162109375, + "learning_rate": 2.4315068493150686e-05, + "loss": 1.062, "step": 4150 }, { - "epoch": 0.1917808219178082, - "grad_norm": 8.002535820007324, - "learning_rate": 2.7123287671232875e-05, - "loss": 1.1426, + "epoch": 0.3835616438356164, + "grad_norm": 25.940107345581055, + "learning_rate": 2.4246575342465755e-05, + "loss": 1.2237, "step": 4200 }, { - "epoch": 0.19406392694063926, - "grad_norm": 9.112885475158691, - "learning_rate": 2.708904109589041e-05, - "loss": 1.4292, + "epoch": 0.3881278538812785, + "grad_norm": 11.530945777893066, + "learning_rate": 2.4178082191780825e-05, + "loss": 1.0717, "step": 4250 }, { - "epoch": 0.1963470319634703, - "grad_norm": 7.080036640167236, - "learning_rate": 2.7054794520547944e-05, - "loss": 1.2665, + "epoch": 0.3926940639269406, + "grad_norm": 13.686588287353516, + "learning_rate": 2.410958904109589e-05, + "loss": 1.0011, "step": 4300 }, { - "epoch": 0.19863013698630136, - "grad_norm": 8.403115272521973, - "learning_rate": 2.702054794520548e-05, - "loss": 1.3686, + "epoch": 0.3972602739726027, + "grad_norm": 24.325557708740234, + "learning_rate": 2.404109589041096e-05, + "loss": 1.2033, "step": 4350 }, { - "epoch": 0.2009132420091324, - "grad_norm": 29.843015670776367, - "learning_rate": 2.6986301369863014e-05, - "loss": 1.0842, + "epoch": 0.4018264840182648, + "grad_norm": 24.21858787536621, + "learning_rate": 2.397260273972603e-05, + "loss": 1.01, "step": 4400 }, { - "epoch": 0.20319634703196346, - "grad_norm": 16.962310791015625, - "learning_rate": 2.6952054794520547e-05, - "loss": 1.1983, + "epoch": 0.4063926940639269, + "grad_norm": 14.875985145568848, + "learning_rate": 2.39041095890411e-05, + "loss": 1.0378, "step": 4450 }, { - "epoch": 0.2054794520547945, - "grad_norm": 19.98363494873047, - "learning_rate": 2.6917808219178083e-05, - "loss": 1.136, + "epoch": 0.410958904109589, + "grad_norm": 38.33986282348633, + "learning_rate": 2.3835616438356165e-05, + "loss": 1.0722, "step": 4500 }, { - "epoch": 0.20776255707762556, - "grad_norm": 23.507699966430664, - "learning_rate": 2.6883561643835616e-05, - "loss": 1.4167, + "epoch": 0.4155251141552511, + "grad_norm": 10.655511856079102, + "learning_rate": 2.376712328767123e-05, + "loss": 1.1182, "step": 4550 }, { - "epoch": 0.2100456621004566, - "grad_norm": 85.11144256591797, - "learning_rate": 2.684931506849315e-05, - "loss": 1.0667, + "epoch": 0.4200913242009132, + "grad_norm": 26.336122512817383, + "learning_rate": 2.36986301369863e-05, + "loss": 1.0772, "step": 4600 }, { - "epoch": 0.21232876712328766, - "grad_norm": 25.13855743408203, - "learning_rate": 2.6815068493150686e-05, - "loss": 1.1755, + "epoch": 0.4246575342465753, + "grad_norm": 26.254093170166016, + "learning_rate": 2.363013698630137e-05, + "loss": 0.9908, "step": 4650 }, { - "epoch": 0.2146118721461187, - "grad_norm": 21.816020965576172, - "learning_rate": 2.678082191780822e-05, - "loss": 1.1357, + "epoch": 0.4292237442922374, + "grad_norm": 7.081357002258301, + "learning_rate": 2.3561643835616436e-05, + "loss": 1.0079, "step": 4700 }, { - "epoch": 0.21689497716894976, - "grad_norm": 27.655017852783203, - "learning_rate": 2.6746575342465755e-05, - "loss": 1.2578, + "epoch": 0.4337899543378995, + "grad_norm": 20.019088745117188, + "learning_rate": 2.3493150684931506e-05, + "loss": 1.1188, "step": 4750 }, { - "epoch": 0.2191780821917808, - "grad_norm": 24.329933166503906, - "learning_rate": 2.6712328767123288e-05, - "loss": 1.1243, + "epoch": 0.4383561643835616, + "grad_norm": 30.188098907470703, + "learning_rate": 2.3424657534246575e-05, + "loss": 1.0616, "step": 4800 }, { - "epoch": 0.22146118721461186, - "grad_norm": 9.320807456970215, - "learning_rate": 2.667808219178082e-05, - "loss": 1.2211, + "epoch": 0.4429223744292237, + "grad_norm": 6.621674537658691, + "learning_rate": 2.3356164383561645e-05, + "loss": 1.0935, "step": 4850 }, { - "epoch": 0.2237442922374429, - "grad_norm": 14.245413780212402, - "learning_rate": 2.6643835616438358e-05, - "loss": 1.2804, + "epoch": 0.4474885844748858, + "grad_norm": 21.145673751831055, + "learning_rate": 2.328767123287671e-05, + "loss": 1.0178, "step": 4900 }, { - "epoch": 0.22602739726027396, - "grad_norm": 46.82653045654297, - "learning_rate": 2.660958904109589e-05, - "loss": 1.2949, + "epoch": 0.4520547945205479, + "grad_norm": 26.22977066040039, + "learning_rate": 2.321917808219178e-05, + "loss": 1.1739, "step": 4950 }, { - "epoch": 0.228310502283105, - "grad_norm": 43.06188201904297, - "learning_rate": 2.6575342465753424e-05, - "loss": 1.2012, + "epoch": 0.45662100456621, + "grad_norm": 22.71933364868164, + "learning_rate": 2.315068493150685e-05, + "loss": 1.122, "step": 5000 }, { - "epoch": 0.23059360730593606, - "grad_norm": 50.084381103515625, - "learning_rate": 2.654109589041096e-05, - "loss": 1.2965, + "epoch": 0.4611872146118721, + "grad_norm": 16.020483016967773, + "learning_rate": 2.308219178082192e-05, + "loss": 1.0494, "step": 5050 }, { - "epoch": 0.2328767123287671, - "grad_norm": 32.71940994262695, - "learning_rate": 2.6506849315068493e-05, - "loss": 1.0847, + "epoch": 0.4657534246575342, + "grad_norm": 10.217668533325195, + "learning_rate": 2.3013698630136985e-05, + "loss": 1.1577, "step": 5100 }, { - "epoch": 0.23515981735159816, - "grad_norm": 53.56361389160156, - "learning_rate": 2.647260273972603e-05, - "loss": 1.2125, + "epoch": 0.4703196347031963, + "grad_norm": 13.561128616333008, + "learning_rate": 2.2945205479452055e-05, + "loss": 1.1065, "step": 5150 }, { - "epoch": 0.2374429223744292, - "grad_norm": 57.06380081176758, - "learning_rate": 2.6438356164383563e-05, - "loss": 1.06, + "epoch": 0.4748858447488584, + "grad_norm": 24.07544708251953, + "learning_rate": 2.2876712328767124e-05, + "loss": 1.1721, "step": 5200 }, { - "epoch": 0.23972602739726026, - "grad_norm": 41.62690734863281, - "learning_rate": 2.6404109589041096e-05, - "loss": 1.1048, + "epoch": 0.4794520547945205, + "grad_norm": 27.471532821655273, + "learning_rate": 2.2808219178082194e-05, + "loss": 1.0425, "step": 5250 }, { - "epoch": 0.2420091324200913, - "grad_norm": 44.06789016723633, - "learning_rate": 2.6369863013698632e-05, - "loss": 1.2219, + "epoch": 0.4840182648401826, + "grad_norm": 39.17247772216797, + "learning_rate": 2.273972602739726e-05, + "loss": 1.0182, "step": 5300 }, { - "epoch": 0.24429223744292236, - "grad_norm": 14.321037292480469, - "learning_rate": 2.6335616438356165e-05, - "loss": 1.2137, + "epoch": 0.4885844748858447, + "grad_norm": 11.253133773803711, + "learning_rate": 2.267123287671233e-05, + "loss": 1.0516, "step": 5350 }, { - "epoch": 0.2465753424657534, - "grad_norm": 31.366552352905273, - "learning_rate": 2.6301369863013698e-05, - "loss": 1.2281, + "epoch": 0.4931506849315068, + "grad_norm": 19.816268920898438, + "learning_rate": 2.26027397260274e-05, + "loss": 0.9359, "step": 5400 }, { - "epoch": 0.24885844748858446, - "grad_norm": 19.642141342163086, - "learning_rate": 2.6267123287671235e-05, - "loss": 1.0646, + "epoch": 0.4977168949771689, + "grad_norm": 21.89589500427246, + "learning_rate": 2.253424657534247e-05, + "loss": 1.0047, "step": 5450 }, { - "epoch": 0.2511415525114155, - "grad_norm": 24.77635955810547, - "learning_rate": 2.6232876712328768e-05, - "loss": 1.0827, + "epoch": 0.502283105022831, + "grad_norm": 18.662717819213867, + "learning_rate": 2.2465753424657534e-05, + "loss": 1.1092, "step": 5500 }, { - "epoch": 0.2534246575342466, - "grad_norm": 21.257320404052734, - "learning_rate": 2.6198630136986304e-05, - "loss": 1.1798, + "epoch": 0.5068493150684932, + "grad_norm": 13.65588092803955, + "learning_rate": 2.2397260273972604e-05, + "loss": 1.2777, "step": 5550 }, { - "epoch": 0.2557077625570776, - "grad_norm": 7.442146301269531, - "learning_rate": 2.6164383561643837e-05, - "loss": 1.2409, + "epoch": 0.5114155251141552, + "grad_norm": 22.897354125976562, + "learning_rate": 2.2328767123287673e-05, + "loss": 1.0557, "step": 5600 }, { - "epoch": 0.2579908675799087, - "grad_norm": 40.682579040527344, - "learning_rate": 2.613013698630137e-05, - "loss": 1.2001, + "epoch": 0.5159817351598174, + "grad_norm": 22.30970573425293, + "learning_rate": 2.2260273972602743e-05, + "loss": 1.0701, "step": 5650 }, { - "epoch": 0.2602739726027397, - "grad_norm": 22.984914779663086, - "learning_rate": 2.6095890410958907e-05, - "loss": 1.0922, + "epoch": 0.5205479452054794, + "grad_norm": 55.639892578125, + "learning_rate": 2.219178082191781e-05, + "loss": 1.1001, "step": 5700 }, { - "epoch": 0.2625570776255708, - "grad_norm": 14.088912010192871, - "learning_rate": 2.606164383561644e-05, - "loss": 1.0361, + "epoch": 0.5251141552511416, + "grad_norm": 11.327408790588379, + "learning_rate": 2.212328767123288e-05, + "loss": 1.1506, "step": 5750 }, { - "epoch": 0.2648401826484018, - "grad_norm": 26.859743118286133, - "learning_rate": 2.6027397260273973e-05, - "loss": 1.3059, + "epoch": 0.5296803652968036, + "grad_norm": 31.288894653320312, + "learning_rate": 2.2054794520547945e-05, + "loss": 1.0774, "step": 5800 }, { - "epoch": 0.2671232876712329, - "grad_norm": 27.274446487426758, - "learning_rate": 2.599315068493151e-05, - "loss": 1.2146, + "epoch": 0.5342465753424658, + "grad_norm": 11.587857246398926, + "learning_rate": 2.1986301369863014e-05, + "loss": 1.0175, "step": 5850 }, { - "epoch": 0.2694063926940639, - "grad_norm": 15.877359390258789, - "learning_rate": 2.5958904109589042e-05, - "loss": 1.047, + "epoch": 0.5388127853881278, + "grad_norm": 27.174907684326172, + "learning_rate": 2.191780821917808e-05, + "loss": 1.1566, "step": 5900 }, { - "epoch": 0.271689497716895, - "grad_norm": 27.11214256286621, - "learning_rate": 2.592465753424658e-05, - "loss": 1.1303, + "epoch": 0.54337899543379, + "grad_norm": 28.325786590576172, + "learning_rate": 2.184931506849315e-05, + "loss": 1.1089, "step": 5950 }, { - "epoch": 0.273972602739726, - "grad_norm": 25.463417053222656, - "learning_rate": 2.589041095890411e-05, - "loss": 0.9916, + "epoch": 0.547945205479452, + "grad_norm": 21.70798683166504, + "learning_rate": 2.178082191780822e-05, + "loss": 1.0575, "step": 6000 }, { - "epoch": 0.2762557077625571, - "grad_norm": 5.419765472412109, - "learning_rate": 2.5856164383561645e-05, - "loss": 1.3228, + "epoch": 0.5525114155251142, + "grad_norm": 18.79369354248047, + "learning_rate": 2.171232876712329e-05, + "loss": 1.1327, "step": 6050 }, { - "epoch": 0.2785388127853881, - "grad_norm": 26.771987915039062, - "learning_rate": 2.582191780821918e-05, - "loss": 1.1163, + "epoch": 0.5570776255707762, + "grad_norm": 31.291170120239258, + "learning_rate": 2.1643835616438355e-05, + "loss": 1.1412, "step": 6100 }, { - "epoch": 0.2808219178082192, - "grad_norm": 40.04314422607422, - "learning_rate": 2.5787671232876714e-05, - "loss": 1.2227, + "epoch": 0.5616438356164384, + "grad_norm": 15.10459041595459, + "learning_rate": 2.1575342465753424e-05, + "loss": 1.1289, "step": 6150 }, { - "epoch": 0.2831050228310502, - "grad_norm": 31.262903213500977, - "learning_rate": 2.5753424657534247e-05, - "loss": 1.0956, + "epoch": 0.5662100456621004, + "grad_norm": 23.595136642456055, + "learning_rate": 2.1506849315068494e-05, + "loss": 0.9419, "step": 6200 }, { - "epoch": 0.2853881278538813, - "grad_norm": 8.211708068847656, - "learning_rate": 2.5719178082191784e-05, - "loss": 0.8686, + "epoch": 0.5707762557077626, + "grad_norm": 9.336952209472656, + "learning_rate": 2.1438356164383563e-05, + "loss": 0.9357, "step": 6250 }, { - "epoch": 0.2876712328767123, - "grad_norm": 7.24912166595459, - "learning_rate": 2.5684931506849317e-05, - "loss": 1.2297, + "epoch": 0.5753424657534246, + "grad_norm": 34.959354400634766, + "learning_rate": 2.136986301369863e-05, + "loss": 1.1525, "step": 6300 }, { - "epoch": 0.2899543378995434, - "grad_norm": 11.469908714294434, - "learning_rate": 2.5650684931506853e-05, - "loss": 1.1037, + "epoch": 0.5799086757990868, + "grad_norm": 25.974088668823242, + "learning_rate": 2.13013698630137e-05, + "loss": 1.0915, "step": 6350 }, { - "epoch": 0.2922374429223744, - "grad_norm": 11.30005168914795, - "learning_rate": 2.5616438356164386e-05, - "loss": 1.1043, + "epoch": 0.5844748858447488, + "grad_norm": 26.807876586914062, + "learning_rate": 2.1232876712328768e-05, + "loss": 1.0783, "step": 6400 }, { - "epoch": 0.2945205479452055, - "grad_norm": 9.771041870117188, - "learning_rate": 2.558219178082192e-05, - "loss": 1.09, + "epoch": 0.589041095890411, + "grad_norm": 11.836869239807129, + "learning_rate": 2.1164383561643838e-05, + "loss": 1.1106, "step": 6450 }, { - "epoch": 0.2968036529680365, - "grad_norm": 22.025493621826172, - "learning_rate": 2.5547945205479452e-05, - "loss": 1.1252, + "epoch": 0.593607305936073, + "grad_norm": 20.5205078125, + "learning_rate": 2.1095890410958904e-05, + "loss": 1.0021, "step": 6500 }, { - "epoch": 0.2990867579908676, - "grad_norm": 22.842763900756836, - "learning_rate": 2.5513698630136985e-05, - "loss": 1.0219, + "epoch": 0.5981735159817352, + "grad_norm": 63.918914794921875, + "learning_rate": 2.1027397260273973e-05, + "loss": 0.9942, "step": 6550 }, { - "epoch": 0.3013698630136986, - "grad_norm": 49.29541015625, - "learning_rate": 2.5479452054794518e-05, - "loss": 0.8537, + "epoch": 0.6027397260273972, + "grad_norm": 15.824874877929688, + "learning_rate": 2.0958904109589043e-05, + "loss": 1.0382, "step": 6600 }, { - "epoch": 0.3036529680365297, - "grad_norm": 32.22005844116211, - "learning_rate": 2.5445205479452055e-05, - "loss": 1.1211, + "epoch": 0.6073059360730594, + "grad_norm": 23.539623260498047, + "learning_rate": 2.0890410958904112e-05, + "loss": 1.1323, "step": 6650 }, { - "epoch": 0.3059360730593607, - "grad_norm": 49.50102615356445, - "learning_rate": 2.5410958904109588e-05, - "loss": 1.0462, + "epoch": 0.6118721461187214, + "grad_norm": 22.72905921936035, + "learning_rate": 2.0821917808219178e-05, + "loss": 1.096, "step": 6700 }, { - "epoch": 0.3082191780821918, - "grad_norm": 69.23637390136719, - "learning_rate": 2.5376712328767124e-05, - "loss": 0.9964, + "epoch": 0.6164383561643836, + "grad_norm": 21.393850326538086, + "learning_rate": 2.0753424657534248e-05, + "loss": 1.0491, "step": 6750 }, { - "epoch": 0.3105022831050228, - "grad_norm": 15.762770652770996, - "learning_rate": 2.5342465753424657e-05, - "loss": 1.3032, + "epoch": 0.6210045662100456, + "grad_norm": 10.425342559814453, + "learning_rate": 2.0684931506849317e-05, + "loss": 0.9894, "step": 6800 }, { - "epoch": 0.3127853881278539, - "grad_norm": 36.22660827636719, - "learning_rate": 2.530821917808219e-05, - "loss": 1.1776, + "epoch": 0.6255707762557078, + "grad_norm": 31.147842407226562, + "learning_rate": 2.0616438356164387e-05, + "loss": 1.0229, "step": 6850 }, { - "epoch": 0.3150684931506849, - "grad_norm": 6.89371395111084, - "learning_rate": 2.5273972602739727e-05, - "loss": 1.1224, + "epoch": 0.6301369863013698, + "grad_norm": 13.265896797180176, + "learning_rate": 2.0547945205479453e-05, + "loss": 1.0372, "step": 6900 }, { - "epoch": 0.317351598173516, - "grad_norm": 76.80335235595703, - "learning_rate": 2.523972602739726e-05, - "loss": 0.9386, + "epoch": 0.634703196347032, + "grad_norm": 19.299884796142578, + "learning_rate": 2.0479452054794522e-05, + "loss": 0.9861, "step": 6950 }, { - "epoch": 0.319634703196347, - "grad_norm": 50.3594970703125, - "learning_rate": 2.5205479452054793e-05, - "loss": 1.0721, + "epoch": 0.639269406392694, + "grad_norm": 15.215560913085938, + "learning_rate": 2.041095890410959e-05, + "loss": 1.0685, "step": 7000 }, { - "epoch": 0.3219178082191781, - "grad_norm": 26.805213928222656, - "learning_rate": 2.517123287671233e-05, - "loss": 1.2563, + "epoch": 0.6438356164383562, + "grad_norm": 12.113781929016113, + "learning_rate": 2.034246575342466e-05, + "loss": 1.0452, "step": 7050 }, { - "epoch": 0.3242009132420091, - "grad_norm": 39.83240509033203, - "learning_rate": 2.5136986301369862e-05, - "loss": 1.0924, + "epoch": 0.6484018264840182, + "grad_norm": 53.0004768371582, + "learning_rate": 2.0273972602739724e-05, + "loss": 1.0177, "step": 7100 }, { - "epoch": 0.3264840182648402, - "grad_norm": 41.219818115234375, - "learning_rate": 2.51027397260274e-05, - "loss": 1.0771, + "epoch": 0.6529680365296804, + "grad_norm": 10.609742164611816, + "learning_rate": 2.0205479452054793e-05, + "loss": 0.9896, "step": 7150 }, { - "epoch": 0.3287671232876712, - "grad_norm": 63.68681335449219, - "learning_rate": 2.5068493150684932e-05, - "loss": 1.1793, + "epoch": 0.6575342465753424, + "grad_norm": 13.45383071899414, + "learning_rate": 2.0136986301369863e-05, + "loss": 0.7549, "step": 7200 }, { - "epoch": 0.3310502283105023, - "grad_norm": 96.54421997070312, - "learning_rate": 2.5034246575342465e-05, - "loss": 1.185, + "epoch": 0.6621004566210046, + "grad_norm": 21.812686920166016, + "learning_rate": 2.0068493150684932e-05, + "loss": 1.0402, "step": 7250 }, { - "epoch": 0.3333333333333333, - "grad_norm": 35.43101119995117, - "learning_rate": 2.5e-05, - "loss": 1.3505, + "epoch": 0.6666666666666666, + "grad_norm": 7.622537612915039, + "learning_rate": 1.9999999999999998e-05, + "loss": 0.9468, "step": 7300 }, { - "epoch": 0.3356164383561644, - "grad_norm": 26.726581573486328, - "learning_rate": 2.4965753424657534e-05, - "loss": 1.0832, + "epoch": 0.6712328767123288, + "grad_norm": 14.563289642333984, + "learning_rate": 1.9931506849315068e-05, + "loss": 1.0154, "step": 7350 }, { - "epoch": 0.3378995433789954, - "grad_norm": 22.066526412963867, - "learning_rate": 2.4931506849315067e-05, - "loss": 1.2593, + "epoch": 0.6757990867579908, + "grad_norm": 29.9089298248291, + "learning_rate": 1.9863013698630137e-05, + "loss": 0.9872, "step": 7400 }, { - "epoch": 0.3401826484018265, - "grad_norm": 9.076361656188965, - "learning_rate": 2.4897260273972604e-05, - "loss": 0.9935, + "epoch": 0.680365296803653, + "grad_norm": 7.758576393127441, + "learning_rate": 1.9794520547945207e-05, + "loss": 0.8618, "step": 7450 }, { - "epoch": 0.3424657534246575, - "grad_norm": 32.710853576660156, - "learning_rate": 2.4863013698630137e-05, - "loss": 1.2397, + "epoch": 0.684931506849315, + "grad_norm": 15.849843978881836, + "learning_rate": 1.9726027397260273e-05, + "loss": 0.9981, "step": 7500 }, { - "epoch": 0.3447488584474886, - "grad_norm": 24.153223037719727, - "learning_rate": 2.4828767123287673e-05, - "loss": 1.1828, + "epoch": 0.6894977168949772, + "grad_norm": 13.798691749572754, + "learning_rate": 1.9657534246575342e-05, + "loss": 1.0578, "step": 7550 }, { - "epoch": 0.3470319634703196, - "grad_norm": 21.1640682220459, - "learning_rate": 2.4794520547945206e-05, - "loss": 1.2872, + "epoch": 0.6940639269406392, + "grad_norm": 12.977076530456543, + "learning_rate": 1.9589041095890412e-05, + "loss": 1.0081, "step": 7600 }, { - "epoch": 0.3493150684931507, - "grad_norm": 8.183709144592285, - "learning_rate": 2.476027397260274e-05, - "loss": 1.0255, + "epoch": 0.6986301369863014, + "grad_norm": 25.823368072509766, + "learning_rate": 1.952054794520548e-05, + "loss": 0.9483, "step": 7650 }, { - "epoch": 0.3515981735159817, - "grad_norm": 32.05094528198242, - "learning_rate": 2.4726027397260276e-05, - "loss": 1.1408, + "epoch": 0.7031963470319634, + "grad_norm": 1.7260483503341675, + "learning_rate": 1.9452054794520547e-05, + "loss": 0.9396, "step": 7700 }, { - "epoch": 0.3538812785388128, - "grad_norm": 42.780330657958984, - "learning_rate": 2.469178082191781e-05, - "loss": 1.0889, + "epoch": 0.7077625570776256, + "grad_norm": 28.217559814453125, + "learning_rate": 1.9383561643835617e-05, + "loss": 1.0159, "step": 7750 }, { - "epoch": 0.3561643835616438, - "grad_norm": 12.330619812011719, - "learning_rate": 2.4657534246575342e-05, - "loss": 1.1555, + "epoch": 0.7123287671232876, + "grad_norm": 15.566514015197754, + "learning_rate": 1.9315068493150686e-05, + "loss": 1.0673, "step": 7800 }, { - "epoch": 0.3584474885844749, - "grad_norm": 52.48133850097656, - "learning_rate": 2.4623287671232878e-05, - "loss": 0.9871, + "epoch": 0.7168949771689498, + "grad_norm": 49.06792449951172, + "learning_rate": 1.9246575342465756e-05, + "loss": 0.9817, "step": 7850 }, { - "epoch": 0.3607305936073059, - "grad_norm": 17.115123748779297, - "learning_rate": 2.458904109589041e-05, - "loss": 1.2171, + "epoch": 0.7214611872146118, + "grad_norm": 35.709224700927734, + "learning_rate": 1.9178082191780822e-05, + "loss": 0.9818, "step": 7900 }, { - "epoch": 0.363013698630137, - "grad_norm": 24.72197914123535, - "learning_rate": 2.4554794520547948e-05, - "loss": 0.9396, + "epoch": 0.726027397260274, + "grad_norm": 20.94626808166504, + "learning_rate": 1.910958904109589e-05, + "loss": 0.9113, "step": 7950 }, { - "epoch": 0.365296803652968, - "grad_norm": 59.79161834716797, - "learning_rate": 2.452054794520548e-05, - "loss": 1.161, + "epoch": 0.730593607305936, + "grad_norm": 25.5698184967041, + "learning_rate": 1.904109589041096e-05, + "loss": 1.0525, "step": 8000 }, { - "epoch": 0.3675799086757991, - "grad_norm": 27.012365341186523, - "learning_rate": 2.4486301369863014e-05, - "loss": 1.2269, + "epoch": 0.7351598173515982, + "grad_norm": 36.41669845581055, + "learning_rate": 1.897260273972603e-05, + "loss": 1.007, "step": 8050 }, { - "epoch": 0.3698630136986301, - "grad_norm": 10.282275199890137, - "learning_rate": 2.445205479452055e-05, - "loss": 1.0582, + "epoch": 0.7397260273972602, + "grad_norm": 17.15513038635254, + "learning_rate": 1.8904109589041096e-05, + "loss": 0.9568, "step": 8100 }, { - "epoch": 0.3721461187214612, - "grad_norm": 16.697723388671875, - "learning_rate": 2.4417808219178083e-05, - "loss": 1.1504, + "epoch": 0.7442922374429224, + "grad_norm": 24.435543060302734, + "learning_rate": 1.8835616438356166e-05, + "loss": 1.0073, "step": 8150 }, { - "epoch": 0.3744292237442922, - "grad_norm": 19.30514144897461, - "learning_rate": 2.4383561643835616e-05, - "loss": 1.0279, + "epoch": 0.7488584474885844, + "grad_norm": 26.5279483795166, + "learning_rate": 1.8767123287671235e-05, + "loss": 0.9768, "step": 8200 }, - { - "epoch": 0.3767123287671233, - "grad_norm": 49.45153045654297, - "learning_rate": 2.4349315068493153e-05, - "loss": 1.0799, - "step": 8250 - }, - { - "epoch": 0.3789954337899543, - "grad_norm": 33.181060791015625, - "learning_rate": 2.4315068493150686e-05, - "loss": 1.1191, - "step": 8300 - }, - { - "epoch": 0.3812785388127854, - "grad_norm": 79.8065185546875, - "learning_rate": 2.4280821917808222e-05, - "loss": 1.1442, - "step": 8350 - }, - { - "epoch": 0.3835616438356164, - "grad_norm": 48.79990005493164, - "learning_rate": 2.4246575342465755e-05, - "loss": 1.349, - "step": 8400 - }, - { - "epoch": 0.3858447488584475, - "grad_norm": 56.24677658081055, - "learning_rate": 2.4212328767123288e-05, - "loss": 1.0238, - "step": 8450 - }, - { - "epoch": 0.3881278538812785, - "grad_norm": 12.791101455688477, - "learning_rate": 2.4178082191780825e-05, - "loss": 1.0045, - "step": 8500 - }, - { - "epoch": 0.3904109589041096, - "grad_norm": 13.716742515563965, - "learning_rate": 2.4143835616438358e-05, - "loss": 1.0964, - "step": 8550 - }, - { - "epoch": 0.3926940639269406, - "grad_norm": 6.694267272949219, - "learning_rate": 2.410958904109589e-05, - "loss": 0.9396, - "step": 8600 - }, - { - "epoch": 0.3949771689497717, - "grad_norm": 19.237689971923828, - "learning_rate": 2.4075342465753427e-05, - "loss": 1.0931, - "step": 8650 - }, - { - "epoch": 0.3972602739726027, - "grad_norm": 23.344566345214844, - "learning_rate": 2.404109589041096e-05, - "loss": 1.3482, - "step": 8700 - }, - { - "epoch": 0.3995433789954338, - "grad_norm": 9.369404792785645, - "learning_rate": 2.4006849315068497e-05, - "loss": 1.0205, - "step": 8750 - }, - { - "epoch": 0.4018264840182648, - "grad_norm": 20.9238224029541, - "learning_rate": 2.397260273972603e-05, - "loss": 1.0606, - "step": 8800 - }, - { - "epoch": 0.4041095890410959, - "grad_norm": 66.00227355957031, - "learning_rate": 2.3938356164383563e-05, - "loss": 1.103, - "step": 8850 - }, - { - "epoch": 0.4063926940639269, - "grad_norm": 5.158641815185547, - "learning_rate": 2.39041095890411e-05, - "loss": 1.0207, - "step": 8900 - }, - { - "epoch": 0.408675799086758, - "grad_norm": 19.67837142944336, - "learning_rate": 2.3869863013698632e-05, - "loss": 1.108, - "step": 8950 - }, - { - "epoch": 0.410958904109589, - "grad_norm": 48.31447219848633, - "learning_rate": 2.3835616438356165e-05, - "loss": 1.0708, - "step": 9000 - }, - { - "epoch": 0.4132420091324201, - "grad_norm": 18.173908233642578, - "learning_rate": 2.38013698630137e-05, - "loss": 1.0449, - "step": 9050 - }, - { - "epoch": 0.4155251141552511, - "grad_norm": 57.505226135253906, - "learning_rate": 2.376712328767123e-05, - "loss": 1.1911, - "step": 9100 - }, - { - "epoch": 0.4178082191780822, - "grad_norm": 39.649169921875, - "learning_rate": 2.3732876712328768e-05, - "loss": 1.1708, - "step": 9150 - }, - { - "epoch": 0.4200913242009132, - "grad_norm": 7.622274398803711, - "learning_rate": 2.36986301369863e-05, - "loss": 0.9928, - "step": 9200 - }, - { - "epoch": 0.4223744292237443, - "grad_norm": 17.452634811401367, - "learning_rate": 2.3664383561643834e-05, - "loss": 0.9558, - "step": 9250 - }, - { - "epoch": 0.4246575342465753, - "grad_norm": 59.85329055786133, - "learning_rate": 2.363013698630137e-05, - "loss": 1.0805, - "step": 9300 - }, - { - "epoch": 0.4269406392694064, - "grad_norm": 9.634559631347656, - "learning_rate": 2.3595890410958903e-05, - "loss": 0.9829, - "step": 9350 - }, - { - "epoch": 0.4292237442922374, - "grad_norm": 1.924180507659912, - "learning_rate": 2.3561643835616436e-05, - "loss": 1.0737, - "step": 9400 - }, - { - "epoch": 0.4315068493150685, - "grad_norm": 31.276365280151367, - "learning_rate": 2.3527397260273973e-05, - "loss": 1.0905, - "step": 9450 - }, - { - "epoch": 0.4337899543378995, - "grad_norm": 29.218658447265625, - "learning_rate": 2.3493150684931506e-05, - "loss": 1.2167, - "step": 9500 - }, - { - "epoch": 0.4360730593607306, - "grad_norm": 42.0178108215332, - "learning_rate": 2.3458904109589042e-05, - "loss": 1.0009, - "step": 9550 - }, - { - "epoch": 0.4383561643835616, - "grad_norm": 36.67307662963867, - "learning_rate": 2.3424657534246575e-05, - "loss": 1.1016, - "step": 9600 - }, - { - "epoch": 0.4406392694063927, - "grad_norm": 21.634462356567383, - "learning_rate": 2.339041095890411e-05, - "loss": 1.0895, - "step": 9650 - }, - { - "epoch": 0.4429223744292237, - "grad_norm": 20.32236671447754, - "learning_rate": 2.3356164383561645e-05, - "loss": 1.1135, - "step": 9700 - }, - { - "epoch": 0.4452054794520548, - "grad_norm": 5.574302673339844, - "learning_rate": 2.3321917808219178e-05, - "loss": 1.1313, - "step": 9750 - }, - { - "epoch": 0.4474885844748858, - "grad_norm": 5.582201957702637, - "learning_rate": 2.328767123287671e-05, - "loss": 1.0072, - "step": 9800 - }, - { - "epoch": 0.4497716894977169, - "grad_norm": 58.499603271484375, - "learning_rate": 2.3253424657534247e-05, - "loss": 1.2044, - "step": 9850 - }, - { - "epoch": 0.4520547945205479, - "grad_norm": 18.887069702148438, - "learning_rate": 2.321917808219178e-05, - "loss": 1.0175, - "step": 9900 - }, - { - "epoch": 0.454337899543379, - "grad_norm": 33.383880615234375, - "learning_rate": 2.3184931506849317e-05, - "loss": 1.1733, - "step": 9950 - }, - { - "epoch": 0.45662100456621, - "grad_norm": 85.34075927734375, - "learning_rate": 2.315068493150685e-05, - "loss": 1.1876, - "step": 10000 - }, - { - "epoch": 0.4589041095890411, - "grad_norm": 22.957134246826172, - "learning_rate": 2.3116438356164383e-05, - "loss": 1.074, - "step": 10050 - }, - { - "epoch": 0.4611872146118721, - "grad_norm": 57.47038269042969, - "learning_rate": 2.308219178082192e-05, - "loss": 0.9007, - "step": 10100 - }, - { - "epoch": 0.4634703196347032, - "grad_norm": 11.614636421203613, - "learning_rate": 2.3047945205479452e-05, - "loss": 1.0877, - "step": 10150 - }, - { - "epoch": 0.4657534246575342, - "grad_norm": 8.970778465270996, - "learning_rate": 2.3013698630136985e-05, - "loss": 1.3286, - "step": 10200 - }, - { - "epoch": 0.4680365296803653, - "grad_norm": 43.425514221191406, - "learning_rate": 2.2979452054794522e-05, - "loss": 1.1883, - "step": 10250 - }, - { - "epoch": 0.4703196347031963, - "grad_norm": 12.85531234741211, - "learning_rate": 2.2945205479452055e-05, - "loss": 1.0921, - "step": 10300 - }, - { - "epoch": 0.4726027397260274, - "grad_norm": 6.603804111480713, - "learning_rate": 2.291095890410959e-05, - "loss": 1.1883, - "step": 10350 - }, - { - "epoch": 0.4748858447488584, - "grad_norm": 47.8962516784668, - "learning_rate": 2.2876712328767124e-05, - "loss": 1.1814, - "step": 10400 - }, - { - "epoch": 0.4771689497716895, - "grad_norm": 59.406280517578125, - "learning_rate": 2.2842465753424657e-05, - "loss": 1.115, - "step": 10450 - }, - { - "epoch": 0.4794520547945205, - "grad_norm": 43.54423522949219, - "learning_rate": 2.2808219178082194e-05, - "loss": 0.998, - "step": 10500 - }, - { - "epoch": 0.4817351598173516, - "grad_norm": 17.37261962890625, - "learning_rate": 2.2773972602739727e-05, - "loss": 0.9542, - "step": 10550 - }, - { - "epoch": 0.4840182648401826, - "grad_norm": 38.04684829711914, - "learning_rate": 2.273972602739726e-05, - "loss": 1.0195, - "step": 10600 - }, - { - "epoch": 0.4863013698630137, - "grad_norm": 17.202608108520508, - "learning_rate": 2.2705479452054796e-05, - "loss": 1.109, - "step": 10650 - }, - { - "epoch": 0.4885844748858447, - "grad_norm": 4.1588897705078125, - "learning_rate": 2.267123287671233e-05, - "loss": 1.0125, - "step": 10700 - }, - { - "epoch": 0.4908675799086758, - "grad_norm": 10.435503005981445, - "learning_rate": 2.2636986301369866e-05, - "loss": 1.0044, - "step": 10750 - }, - { - "epoch": 0.4931506849315068, - "grad_norm": 62.7014045715332, - "learning_rate": 2.26027397260274e-05, - "loss": 1.0457, - "step": 10800 - }, - { - "epoch": 0.4954337899543379, - "grad_norm": 19.47702407836914, - "learning_rate": 2.2568493150684932e-05, - "loss": 1.0468, - "step": 10850 - }, - { - "epoch": 0.4977168949771689, - "grad_norm": 42.62397766113281, - "learning_rate": 2.253424657534247e-05, - "loss": 1.0374, - "step": 10900 - }, - { - "epoch": 0.5, - "grad_norm": 52.27951431274414, - "learning_rate": 2.25e-05, - "loss": 1.0967, - "step": 10950 - }, - { - "epoch": 0.502283105022831, - "grad_norm": 8.714099884033203, - "learning_rate": 2.2465753424657534e-05, - "loss": 1.2492, - "step": 11000 - }, - { - "epoch": 0.5045662100456622, - "grad_norm": 10.670646667480469, - "learning_rate": 2.243150684931507e-05, - "loss": 1.23, - "step": 11050 - }, - { - "epoch": 0.5068493150684932, - "grad_norm": 30.391632080078125, - "learning_rate": 2.2397260273972604e-05, - "loss": 1.3576, - "step": 11100 - }, - { - "epoch": 0.5091324200913242, - "grad_norm": 36.597408294677734, - "learning_rate": 2.2363013698630137e-05, - "loss": 0.9834, - "step": 11150 - }, - { - "epoch": 0.5114155251141552, - "grad_norm": 58.925941467285156, - "learning_rate": 2.2328767123287673e-05, - "loss": 1.139, - "step": 11200 - }, - { - "epoch": 0.5136986301369864, - "grad_norm": 52.20880126953125, - "learning_rate": 2.2294520547945206e-05, - "loss": 1.1037, - "step": 11250 - }, - { - "epoch": 0.5159817351598174, - "grad_norm": 104.13248443603516, - "learning_rate": 2.2260273972602743e-05, - "loss": 1.0124, - "step": 11300 - }, - { - "epoch": 0.5182648401826484, - "grad_norm": 37.718406677246094, - "learning_rate": 2.2226027397260276e-05, - "loss": 1.1873, - "step": 11350 - }, - { - "epoch": 0.5205479452054794, - "grad_norm": 31.402559280395508, - "learning_rate": 2.219178082191781e-05, - "loss": 1.1506, - "step": 11400 - }, - { - "epoch": 0.5228310502283106, - "grad_norm": 7.707183361053467, - "learning_rate": 2.2157534246575345e-05, - "loss": 1.2423, - "step": 11450 - }, - { - "epoch": 0.5251141552511416, - "grad_norm": 45.80585861206055, - "learning_rate": 2.212328767123288e-05, - "loss": 1.1457, - "step": 11500 - }, - { - "epoch": 0.5273972602739726, - "grad_norm": 58.7805061340332, - "learning_rate": 2.208904109589041e-05, - "loss": 1.0302, - "step": 11550 - }, - { - "epoch": 0.5296803652968036, - "grad_norm": 29.0611515045166, - "learning_rate": 2.2054794520547945e-05, - "loss": 1.1852, - "step": 11600 - }, - { - "epoch": 0.5319634703196348, - "grad_norm": 17.17108917236328, - "learning_rate": 2.2020547945205478e-05, - "loss": 1.0164, - "step": 11650 - }, - { - "epoch": 0.5342465753424658, - "grad_norm": 28.332836151123047, - "learning_rate": 2.1986301369863014e-05, - "loss": 0.968, - "step": 11700 - }, - { - "epoch": 0.5365296803652968, - "grad_norm": 7.278358459472656, - "learning_rate": 2.1952054794520547e-05, - "loss": 1.2058, - "step": 11750 - }, - { - "epoch": 0.5388127853881278, - "grad_norm": 15.584250450134277, - "learning_rate": 2.191780821917808e-05, - "loss": 1.0892, - "step": 11800 - }, - { - "epoch": 0.541095890410959, - "grad_norm": 22.4052791595459, - "learning_rate": 2.1883561643835617e-05, - "loss": 0.9984, - "step": 11850 - }, - { - "epoch": 0.54337899543379, - "grad_norm": 31.585800170898438, - "learning_rate": 2.184931506849315e-05, - "loss": 1.2338, - "step": 11900 - }, - { - "epoch": 0.545662100456621, - "grad_norm": 19.224178314208984, - "learning_rate": 2.1815068493150683e-05, - "loss": 1.0994, - "step": 11950 - }, - { - "epoch": 0.547945205479452, - "grad_norm": 15.44568157196045, - "learning_rate": 2.178082191780822e-05, - "loss": 1.0938, - "step": 12000 - }, - { - "epoch": 0.5502283105022832, - "grad_norm": 9.826760292053223, - "learning_rate": 2.1746575342465752e-05, - "loss": 1.1607, - "step": 12050 - }, - { - "epoch": 0.5525114155251142, - "grad_norm": 8.327254295349121, - "learning_rate": 2.171232876712329e-05, - "loss": 1.0973, - "step": 12100 - }, - { - "epoch": 0.5547945205479452, - "grad_norm": 39.339962005615234, - "learning_rate": 2.167808219178082e-05, - "loss": 1.1066, - "step": 12150 - }, - { - "epoch": 0.5570776255707762, - "grad_norm": 27.773502349853516, - "learning_rate": 2.1643835616438355e-05, - "loss": 0.9664, - "step": 12200 - }, - { - "epoch": 0.5593607305936074, - "grad_norm": 34.17534637451172, - "learning_rate": 2.160958904109589e-05, - "loss": 1.2806, - "step": 12250 - }, - { - "epoch": 0.5616438356164384, - "grad_norm": 14.751806259155273, - "learning_rate": 2.1575342465753424e-05, - "loss": 1.0973, - "step": 12300 - }, - { - "epoch": 0.5639269406392694, - "grad_norm": 39.10814666748047, - "learning_rate": 2.1541095890410957e-05, - "loss": 1.0821, - "step": 12350 - }, - { - "epoch": 0.5662100456621004, - "grad_norm": 14.468631744384766, - "learning_rate": 2.1506849315068494e-05, - "loss": 0.8218, - "step": 12400 - }, - { - "epoch": 0.5684931506849316, - "grad_norm": 30.599267959594727, - "learning_rate": 2.1472602739726027e-05, - "loss": 1.0412, - "step": 12450 - }, - { - "epoch": 0.5707762557077626, - "grad_norm": 15.382964134216309, - "learning_rate": 2.1438356164383563e-05, - "loss": 0.9351, - "step": 12500 - }, - { - "epoch": 0.5730593607305936, - "grad_norm": 44.646480560302734, - "learning_rate": 2.1404109589041096e-05, - "loss": 1.1423, - "step": 12550 - }, - { - "epoch": 0.5753424657534246, - "grad_norm": 40.618309020996094, - "learning_rate": 2.136986301369863e-05, - "loss": 1.0657, - "step": 12600 - }, - { - "epoch": 0.5776255707762558, - "grad_norm": 50.3376579284668, - "learning_rate": 2.1335616438356166e-05, - "loss": 0.9775, - "step": 12650 - }, - { - "epoch": 0.5799086757990868, - "grad_norm": 60.17688751220703, - "learning_rate": 2.13013698630137e-05, - "loss": 1.1291, - "step": 12700 - }, - { - "epoch": 0.5821917808219178, - "grad_norm": 11.148223876953125, - "learning_rate": 2.126712328767123e-05, - "loss": 1.1556, - "step": 12750 - }, - { - "epoch": 0.5844748858447488, - "grad_norm": 20.931018829345703, - "learning_rate": 2.1232876712328768e-05, - "loss": 1.0614, - "step": 12800 - }, - { - "epoch": 0.58675799086758, - "grad_norm": 29.79435157775879, - "learning_rate": 2.11986301369863e-05, - "loss": 1.2341, - "step": 12850 - }, - { - "epoch": 0.589041095890411, - "grad_norm": 27.72561264038086, - "learning_rate": 2.1164383561643838e-05, - "loss": 0.9783, - "step": 12900 - }, - { - "epoch": 0.591324200913242, - "grad_norm": 6.564560890197754, - "learning_rate": 2.113013698630137e-05, - "loss": 1.0293, - "step": 12950 - }, - { - "epoch": 0.593607305936073, - "grad_norm": 68.2934799194336, - "learning_rate": 2.1095890410958904e-05, - "loss": 1.1324, - "step": 13000 - }, - { - "epoch": 0.5958904109589042, - "grad_norm": 8.194302558898926, - "learning_rate": 2.106164383561644e-05, - "loss": 0.9099, - "step": 13050 - }, - { - "epoch": 0.5981735159817352, - "grad_norm": 25.813316345214844, - "learning_rate": 2.1027397260273973e-05, - "loss": 1.0368, - "step": 13100 - }, - { - "epoch": 0.6004566210045662, - "grad_norm": 19.83176612854004, - "learning_rate": 2.0993150684931506e-05, - "loss": 0.9496, - "step": 13150 - }, - { - "epoch": 0.6027397260273972, - "grad_norm": 17.05171012878418, - "learning_rate": 2.0958904109589043e-05, - "loss": 1.1386, - "step": 13200 - }, - { - "epoch": 0.6050228310502284, - "grad_norm": 12.28995418548584, - "learning_rate": 2.0924657534246576e-05, - "loss": 1.1897, - "step": 13250 - }, - { - "epoch": 0.6073059360730594, - "grad_norm": 22.48783302307129, - "learning_rate": 2.0890410958904112e-05, - "loss": 1.1413, - "step": 13300 - }, - { - "epoch": 0.6095890410958904, - "grad_norm": 37.44598388671875, - "learning_rate": 2.0856164383561645e-05, - "loss": 1.1138, - "step": 13350 - }, - { - "epoch": 0.6118721461187214, - "grad_norm": 19.184656143188477, - "learning_rate": 2.0821917808219178e-05, - "loss": 1.0924, - "step": 13400 - }, - { - "epoch": 0.6141552511415526, - "grad_norm": 30.34543228149414, - "learning_rate": 2.0787671232876715e-05, - "loss": 1.0382, - "step": 13450 - }, - { - "epoch": 0.6164383561643836, - "grad_norm": 18.820228576660156, - "learning_rate": 2.0753424657534248e-05, - "loss": 1.0618, - "step": 13500 - }, - { - "epoch": 0.6187214611872146, - "grad_norm": 3.0790977478027344, - "learning_rate": 2.071917808219178e-05, - "loss": 1.0412, - "step": 13550 - }, - { - "epoch": 0.6210045662100456, - "grad_norm": 42.972923278808594, - "learning_rate": 2.0684931506849317e-05, - "loss": 0.9701, - "step": 13600 - }, - { - "epoch": 0.6232876712328768, - "grad_norm": 13.229798316955566, - "learning_rate": 2.065068493150685e-05, - "loss": 0.9746, - "step": 13650 - }, - { - "epoch": 0.6255707762557078, - "grad_norm": 87.67366790771484, - "learning_rate": 2.0616438356164387e-05, - "loss": 1.1411, - "step": 13700 - }, - { - "epoch": 0.6278538812785388, - "grad_norm": 12.436323165893555, - "learning_rate": 2.058219178082192e-05, - "loss": 1.208, - "step": 13750 - }, - { - "epoch": 0.6301369863013698, - "grad_norm": 19.009395599365234, - "learning_rate": 2.0547945205479453e-05, - "loss": 1.0635, - "step": 13800 - }, - { - "epoch": 0.632420091324201, - "grad_norm": 6.680107593536377, - "learning_rate": 2.051369863013699e-05, - "loss": 1.0463, - "step": 13850 - }, - { - "epoch": 0.634703196347032, - "grad_norm": 11.398260116577148, - "learning_rate": 2.0479452054794522e-05, - "loss": 0.9179, - "step": 13900 - }, - { - "epoch": 0.636986301369863, - "grad_norm": 4.983315467834473, - "learning_rate": 2.0445205479452055e-05, - "loss": 1.0699, - "step": 13950 - }, - { - "epoch": 0.639269406392694, - "grad_norm": 29.882274627685547, - "learning_rate": 2.041095890410959e-05, - "loss": 1.1485, - "step": 14000 - }, - { - "epoch": 0.6415525114155252, - "grad_norm": 9.104654312133789, - "learning_rate": 2.0376712328767125e-05, - "loss": 0.9783, - "step": 14050 - }, - { - "epoch": 0.6438356164383562, - "grad_norm": 8.866716384887695, - "learning_rate": 2.034246575342466e-05, - "loss": 0.9843, - "step": 14100 - }, - { - "epoch": 0.6461187214611872, - "grad_norm": 20.7504825592041, - "learning_rate": 2.0308219178082194e-05, - "loss": 0.9999, - "step": 14150 - }, - { - "epoch": 0.6484018264840182, - "grad_norm": 16.239980697631836, - "learning_rate": 2.0273972602739724e-05, - "loss": 0.9305, - "step": 14200 - }, - { - "epoch": 0.6506849315068494, - "grad_norm": 52.316673278808594, - "learning_rate": 2.023972602739726e-05, - "loss": 0.9185, - "step": 14250 - }, - { - "epoch": 0.6529680365296804, - "grad_norm": 13.330124855041504, - "learning_rate": 2.0205479452054793e-05, - "loss": 0.9943, - "step": 14300 - }, - { - "epoch": 0.6552511415525114, - "grad_norm": 13.177003860473633, - "learning_rate": 2.0171232876712326e-05, - "loss": 0.7542, - "step": 14350 - }, - { - "epoch": 0.6575342465753424, - "grad_norm": 36.442108154296875, - "learning_rate": 2.0136986301369863e-05, - "loss": 0.8722, - "step": 14400 - }, - { - "epoch": 0.6598173515981736, - "grad_norm": 26.251983642578125, - "learning_rate": 2.0102739726027396e-05, - "loss": 0.9657, - "step": 14450 - }, - { - "epoch": 0.6621004566210046, - "grad_norm": 35.95085144042969, - "learning_rate": 2.0068493150684932e-05, - "loss": 1.0656, - "step": 14500 - }, - { - "epoch": 0.6643835616438356, - "grad_norm": 29.169719696044922, - "learning_rate": 2.0034246575342465e-05, - "loss": 1.076, - "step": 14550 - }, - { - "epoch": 0.6666666666666666, - "grad_norm": 49.64678955078125, - "learning_rate": 1.9999999999999998e-05, - "loss": 0.9267, - "step": 14600 - }, - { - "epoch": 0.6689497716894978, - "grad_norm": 25.249004364013672, - "learning_rate": 1.9965753424657535e-05, - "loss": 1.1205, - "step": 14650 - }, - { - "epoch": 0.6712328767123288, - "grad_norm": 21.96654510498047, - "learning_rate": 1.9931506849315068e-05, - "loss": 0.8928, - "step": 14700 - }, - { - "epoch": 0.6735159817351598, - "grad_norm": 78.21800994873047, - "learning_rate": 1.98972602739726e-05, - "loss": 0.9646, - "step": 14750 - }, - { - "epoch": 0.6757990867579908, - "grad_norm": 49.2692756652832, - "learning_rate": 1.9863013698630137e-05, - "loss": 1.0115, - "step": 14800 - }, - { - "epoch": 0.678082191780822, - "grad_norm": 35.43125915527344, - "learning_rate": 1.982876712328767e-05, - "loss": 1.0473, - "step": 14850 - }, - { - "epoch": 0.680365296803653, - "grad_norm": 18.292205810546875, - "learning_rate": 1.9794520547945207e-05, - "loss": 0.7609, - "step": 14900 - }, - { - "epoch": 0.682648401826484, - "grad_norm": 18.42896842956543, - "learning_rate": 1.976027397260274e-05, - "loss": 0.989, - "step": 14950 - }, - { - "epoch": 0.684931506849315, - "grad_norm": 25.584640502929688, - "learning_rate": 1.9726027397260273e-05, - "loss": 1.1391, - "step": 15000 - }, - { - "epoch": 0.6872146118721462, - "grad_norm": 12.116101264953613, - "learning_rate": 1.969178082191781e-05, - "loss": 1.09, - "step": 15050 - }, - { - "epoch": 0.6894977168949772, - "grad_norm": 4.520615100860596, - "learning_rate": 1.9657534246575342e-05, - "loss": 0.9834, - "step": 15100 - }, - { - "epoch": 0.6917808219178082, - "grad_norm": 16.461069107055664, - "learning_rate": 1.9623287671232875e-05, - "loss": 0.8777, - "step": 15150 - }, - { - "epoch": 0.6940639269406392, - "grad_norm": 41.71174240112305, - "learning_rate": 1.9589041095890412e-05, - "loss": 1.2619, - "step": 15200 - }, - { - "epoch": 0.6963470319634704, - "grad_norm": 10.733747482299805, - "learning_rate": 1.9554794520547945e-05, - "loss": 1.0843, - "step": 15250 - }, - { - "epoch": 0.6986301369863014, - "grad_norm": 34.77432632446289, - "learning_rate": 1.952054794520548e-05, - "loss": 0.9292, - "step": 15300 - }, - { - "epoch": 0.7009132420091324, - "grad_norm": 46.95319747924805, - "learning_rate": 1.9486301369863014e-05, - "loss": 1.0058, - "step": 15350 - }, - { - "epoch": 0.7031963470319634, - "grad_norm": 1.2955800294876099, - "learning_rate": 1.9452054794520547e-05, - "loss": 0.9672, - "step": 15400 - }, - { - "epoch": 0.7054794520547946, - "grad_norm": 16.37827491760254, - "learning_rate": 1.9417808219178084e-05, - "loss": 0.9386, - "step": 15450 - }, - { - "epoch": 0.7077625570776256, - "grad_norm": 41.87287139892578, - "learning_rate": 1.9383561643835617e-05, - "loss": 1.0772, - "step": 15500 - }, - { - "epoch": 0.7100456621004566, - "grad_norm": 8.469085693359375, - "learning_rate": 1.934931506849315e-05, - "loss": 0.9201, - "step": 15550 - }, - { - "epoch": 0.7123287671232876, - "grad_norm": 5.279160976409912, - "learning_rate": 1.9315068493150686e-05, - "loss": 1.2076, - "step": 15600 - }, - { - "epoch": 0.7146118721461188, - "grad_norm": 24.052440643310547, - "learning_rate": 1.928082191780822e-05, - "loss": 1.079, - "step": 15650 - }, - { - "epoch": 0.7168949771689498, - "grad_norm": 43.76991271972656, - "learning_rate": 1.9246575342465756e-05, - "loss": 0.9909, - "step": 15700 - }, - { - "epoch": 0.7191780821917808, - "grad_norm": 0.4643230438232422, - "learning_rate": 1.921232876712329e-05, - "loss": 1.0314, - "step": 15750 - }, - { - "epoch": 0.7214611872146118, - "grad_norm": 31.16683006286621, - "learning_rate": 1.9178082191780822e-05, - "loss": 1.0446, - "step": 15800 - }, - { - "epoch": 0.723744292237443, - "grad_norm": 2.468339443206787, - "learning_rate": 1.9143835616438358e-05, - "loss": 0.9801, - "step": 15850 - }, - { - "epoch": 0.726027397260274, - "grad_norm": 18.055511474609375, - "learning_rate": 1.910958904109589e-05, - "loss": 0.9679, - "step": 15900 - }, - { - "epoch": 0.728310502283105, - "grad_norm": 11.843624114990234, - "learning_rate": 1.9075342465753424e-05, - "loss": 1.049, - "step": 15950 - }, - { - "epoch": 0.730593607305936, - "grad_norm": 28.83637237548828, - "learning_rate": 1.904109589041096e-05, - "loss": 1.0862, - "step": 16000 - }, - { - "epoch": 0.7328767123287672, - "grad_norm": 41.827579498291016, - "learning_rate": 1.9006849315068494e-05, - "loss": 0.9812, - "step": 16050 - }, - { - "epoch": 0.7351598173515982, - "grad_norm": 19.110126495361328, - "learning_rate": 1.897260273972603e-05, - "loss": 1.2085, - "step": 16100 - }, - { - "epoch": 0.7374429223744292, - "grad_norm": 7.440555572509766, - "learning_rate": 1.8938356164383563e-05, - "loss": 1.0045, - "step": 16150 - }, - { - "epoch": 0.7397260273972602, - "grad_norm": 14.894591331481934, - "learning_rate": 1.8904109589041096e-05, - "loss": 1.0214, - "step": 16200 - }, - { - "epoch": 0.7420091324200914, - "grad_norm": 50.265140533447266, - "learning_rate": 1.8869863013698633e-05, - "loss": 1.0115, - "step": 16250 - }, - { - "epoch": 0.7442922374429224, - "grad_norm": 48.24840545654297, - "learning_rate": 1.8835616438356166e-05, - "loss": 1.022, - "step": 16300 - }, - { - "epoch": 0.7465753424657534, - "grad_norm": 23.087678909301758, - "learning_rate": 1.88013698630137e-05, - "loss": 0.9043, - "step": 16350 - }, - { - "epoch": 0.7488584474885844, - "grad_norm": 41.646461486816406, - "learning_rate": 1.8767123287671235e-05, - "loss": 1.0964, - "step": 16400 - }, - { - "epoch": 0.7511415525114156, - "grad_norm": 18.607566833496094, - "learning_rate": 1.8732876712328768e-05, - "loss": 1.1307, - "step": 16450 - }, { "epoch": 0.7534246575342466, - "grad_norm": 19.668004989624023, + "grad_norm": 18.786693572998047, "learning_rate": 1.8698630136986305e-05, - "loss": 0.9917, - "step": 16500 - }, - { - "epoch": 0.7557077625570776, - "grad_norm": 81.84849548339844, - "learning_rate": 1.8664383561643838e-05, - "loss": 1.1245, - "step": 16550 + "loss": 1.0579, + "step": 8250 }, { "epoch": 0.7579908675799086, - "grad_norm": 17.170801162719727, + "grad_norm": 22.201034545898438, "learning_rate": 1.863013698630137e-05, - "loss": 0.8943, - "step": 16600 - }, - { - "epoch": 0.7602739726027398, - "grad_norm": 20.587200164794922, - "learning_rate": 1.8595890410958907e-05, - "loss": 0.9715, - "step": 16650 + "loss": 1.003, + "step": 8300 }, { "epoch": 0.7625570776255708, - "grad_norm": 65.86396026611328, + "grad_norm": 28.934587478637695, "learning_rate": 1.856164383561644e-05, - "loss": 1.0448, - "step": 16700 - }, - { - "epoch": 0.7648401826484018, - "grad_norm": 16.04901695251465, - "learning_rate": 1.8527397260273973e-05, - "loss": 1.0682, - "step": 16750 + "loss": 0.9753, + "step": 8350 }, { "epoch": 0.7671232876712328, - "grad_norm": 19.407167434692383, + "grad_norm": 17.935937881469727, "learning_rate": 1.8493150684931506e-05, - "loss": 0.9607, - "step": 16800 - }, - { - "epoch": 0.769406392694064, - "grad_norm": 13.71429443359375, - "learning_rate": 1.845890410958904e-05, - "loss": 1.0604, - "step": 16850 + "loss": 0.9038, + "step": 8400 }, { "epoch": 0.771689497716895, - "grad_norm": 51.34487533569336, + "grad_norm": 29.648881912231445, "learning_rate": 1.8424657534246576e-05, - "loss": 0.8779, - "step": 16900 - }, - { - "epoch": 0.773972602739726, - "grad_norm": 13.411955833435059, - "learning_rate": 1.839041095890411e-05, - "loss": 1.0547, - "step": 16950 + "loss": 1.0109, + "step": 8450 }, { "epoch": 0.776255707762557, - "grad_norm": 60.41900634765625, + "grad_norm": 32.65835189819336, "learning_rate": 1.8356164383561642e-05, - "loss": 1.0141, - "step": 17000 - }, - { - "epoch": 0.7785388127853882, - "grad_norm": 65.25275421142578, - "learning_rate": 1.832191780821918e-05, - "loss": 1.0541, - "step": 17050 + "loss": 1.0207, + "step": 8500 }, { "epoch": 0.7808219178082192, - "grad_norm": 64.50421905517578, + "grad_norm": 21.674053192138672, "learning_rate": 1.828767123287671e-05, - "loss": 1.0467, - "step": 17100 - }, - { - "epoch": 0.7831050228310502, - "grad_norm": 35.63545608520508, - "learning_rate": 1.8253424657534244e-05, - "loss": 1.0431, - "step": 17150 + "loss": 1.0573, + "step": 8550 }, { "epoch": 0.7853881278538812, - "grad_norm": 8.089126586914062, + "grad_norm": 19.038619995117188, "learning_rate": 1.821917808219178e-05, - "loss": 1.0258, - "step": 17200 - }, - { - "epoch": 0.7876712328767124, - "grad_norm": 33.44828414916992, - "learning_rate": 1.8184931506849314e-05, - "loss": 0.9516, - "step": 17250 + "loss": 1.042, + "step": 8600 }, { "epoch": 0.7899543378995434, - "grad_norm": 52.08647537231445, + "grad_norm": 23.994884490966797, "learning_rate": 1.815068493150685e-05, - "loss": 1.0529, - "step": 17300 - }, - { - "epoch": 0.7922374429223744, - "grad_norm": 33.359886169433594, - "learning_rate": 1.8116438356164383e-05, - "loss": 1.0985, - "step": 17350 + "loss": 1.0316, + "step": 8650 }, { "epoch": 0.7945205479452054, - "grad_norm": 43.1573486328125, + "grad_norm": 26.702028274536133, "learning_rate": 1.8082191780821916e-05, - "loss": 0.987, - "step": 17400 - }, - { - "epoch": 0.7968036529680366, - "grad_norm": 41.887718200683594, - "learning_rate": 1.8047945205479453e-05, - "loss": 0.9562, - "step": 17450 + "loss": 0.9592, + "step": 8700 }, { "epoch": 0.7990867579908676, - "grad_norm": 4.935914993286133, + "grad_norm": 7.485787391662598, "learning_rate": 1.8013698630136986e-05, - "loss": 1.011, - "step": 17500 - }, - { - "epoch": 0.8013698630136986, - "grad_norm": 17.437244415283203, - "learning_rate": 1.797945205479452e-05, - "loss": 1.1939, - "step": 17550 + "loss": 1.0175, + "step": 8750 }, { "epoch": 0.8036529680365296, - "grad_norm": 38.83684158325195, + "grad_norm": 24.249893188476562, "learning_rate": 1.7945205479452055e-05, - "loss": 1.0831, - "step": 17600 - }, - { - "epoch": 0.8059360730593608, - "grad_norm": 389.5946960449219, - "learning_rate": 1.791095890410959e-05, - "loss": 0.9442, - "step": 17650 + "loss": 1.0643, + "step": 8800 }, { "epoch": 0.8082191780821918, - "grad_norm": 55.63192367553711, + "grad_norm": 31.136962890625, "learning_rate": 1.7876712328767125e-05, - "loss": 0.8581, - "step": 17700 - }, - { - "epoch": 0.8105022831050228, - "grad_norm": 12.910749435424805, - "learning_rate": 1.7842465753424658e-05, - "loss": 0.9023, - "step": 17750 + "loss": 0.8719, + "step": 8850 }, { "epoch": 0.8127853881278538, - "grad_norm": 25.35328483581543, + "grad_norm": 18.02696418762207, "learning_rate": 1.780821917808219e-05, - "loss": 1.0642, - "step": 17800 - }, - { - "epoch": 0.815068493150685, - "grad_norm": 14.35108470916748, - "learning_rate": 1.7773972602739727e-05, - "loss": 0.9517, - "step": 17850 + "loss": 0.9169, + "step": 8900 }, { "epoch": 0.817351598173516, - "grad_norm": 16.878704071044922, + "grad_norm": 7.9297261238098145, "learning_rate": 1.773972602739726e-05, - "loss": 1.0333, - "step": 17900 - }, - { - "epoch": 0.819634703196347, - "grad_norm": 4.217007160186768, - "learning_rate": 1.7705479452054793e-05, - "loss": 1.048, - "step": 17950 + "loss": 0.9241, + "step": 8950 }, { "epoch": 0.821917808219178, - "grad_norm": 7.6629767417907715, + "grad_norm": 15.147187232971191, "learning_rate": 1.767123287671233e-05, - "loss": 0.9847, - "step": 18000 - }, - { - "epoch": 0.8242009132420092, - "grad_norm": 8.383101463317871, - "learning_rate": 1.7636986301369863e-05, - "loss": 0.8269, - "step": 18050 + "loss": 0.9451, + "step": 9000 }, { "epoch": 0.8264840182648402, - "grad_norm": 21.882862091064453, + "grad_norm": 26.369773864746094, "learning_rate": 1.76027397260274e-05, - "loss": 1.0138, - "step": 18100 - }, - { - "epoch": 0.8287671232876712, - "grad_norm": 15.08014965057373, - "learning_rate": 1.7568493150684932e-05, - "loss": 1.0072, - "step": 18150 + "loss": 0.9198, + "step": 9050 }, { "epoch": 0.8310502283105022, - "grad_norm": 9.188797950744629, + "grad_norm": 10.24289321899414, "learning_rate": 1.7534246575342465e-05, - "loss": 0.998, - "step": 18200 - }, - { - "epoch": 0.8333333333333334, - "grad_norm": 8.108163833618164, - "learning_rate": 1.7500000000000002e-05, - "loss": 0.8591, - "step": 18250 + "loss": 0.9423, + "step": 9100 }, { "epoch": 0.8356164383561644, - "grad_norm": 8.082106590270996, + "grad_norm": 17.285388946533203, "learning_rate": 1.7465753424657535e-05, - "loss": 0.9486, - "step": 18300 - }, - { - "epoch": 0.8378995433789954, - "grad_norm": 9.28232479095459, - "learning_rate": 1.7431506849315068e-05, - "loss": 1.1733, - "step": 18350 + "loss": 0.8636, + "step": 9150 }, { "epoch": 0.8401826484018264, - "grad_norm": 42.009498596191406, + "grad_norm": 26.444520950317383, "learning_rate": 1.7397260273972604e-05, - "loss": 1.0863, - "step": 18400 - }, - { - "epoch": 0.8424657534246576, - "grad_norm": 16.338623046875, - "learning_rate": 1.7363013698630137e-05, - "loss": 0.8428, - "step": 18450 + "loss": 1.0647, + "step": 9200 }, { "epoch": 0.8447488584474886, - "grad_norm": 22.031343460083008, + "grad_norm": 37.353084564208984, "learning_rate": 1.7328767123287674e-05, - "loss": 0.9678, - "step": 18500 - }, - { - "epoch": 0.8470319634703196, - "grad_norm": 60.101600646972656, - "learning_rate": 1.7294520547945207e-05, - "loss": 1.0961, - "step": 18550 + "loss": 0.9484, + "step": 9250 }, { "epoch": 0.8493150684931506, - "grad_norm": 17.335065841674805, + "grad_norm": 9.881476402282715, "learning_rate": 1.726027397260274e-05, - "loss": 0.9917, - "step": 18600 - }, - { - "epoch": 0.8515981735159818, - "grad_norm": 1.8705586194992065, - "learning_rate": 1.7226027397260276e-05, - "loss": 0.9031, - "step": 18650 + "loss": 1.014, + "step": 9300 }, { "epoch": 0.8538812785388128, - "grad_norm": 8.452926635742188, + "grad_norm": 27.257709503173828, "learning_rate": 1.719178082191781e-05, - "loss": 0.9161, - "step": 18700 - }, - { - "epoch": 0.8561643835616438, - "grad_norm": 22.091020584106445, - "learning_rate": 1.7157534246575342e-05, - "loss": 0.9697, - "step": 18750 + "loss": 0.9169, + "step": 9350 }, { "epoch": 0.8584474885844748, - "grad_norm": 14.892813682556152, + "grad_norm": 20.704347610473633, "learning_rate": 1.712328767123288e-05, - "loss": 1.1043, - "step": 18800 - }, - { - "epoch": 0.860730593607306, - "grad_norm": 5.6726274490356445, - "learning_rate": 1.7089041095890412e-05, - "loss": 0.9272, - "step": 18850 + "loss": 0.9597, + "step": 9400 }, { "epoch": 0.863013698630137, - "grad_norm": 4.598392963409424, + "grad_norm": 20.994293212890625, "learning_rate": 1.705479452054795e-05, - "loss": 1.0438, - "step": 18900 - }, - { - "epoch": 0.865296803652968, - "grad_norm": 10.54848575592041, - "learning_rate": 1.702054794520548e-05, - "loss": 1.068, - "step": 18950 + "loss": 0.9328, + "step": 9450 }, { "epoch": 0.867579908675799, - "grad_norm": 43.7364616394043, + "grad_norm": 31.617778778076172, "learning_rate": 1.6986301369863014e-05, - "loss": 0.9575, - "step": 19000 - }, - { - "epoch": 0.8698630136986302, - "grad_norm": 17.5739803314209, - "learning_rate": 1.695205479452055e-05, - "loss": 0.933, - "step": 19050 + "loss": 0.9988, + "step": 9500 }, { "epoch": 0.8721461187214612, - "grad_norm": 2.4345619678497314, + "grad_norm": 24.76189422607422, "learning_rate": 1.6917808219178084e-05, - "loss": 0.8702, - "step": 19100 - }, - { - "epoch": 0.8744292237442922, - "grad_norm": 27.58513069152832, - "learning_rate": 1.6883561643835617e-05, - "loss": 0.9842, - "step": 19150 + "loss": 0.9338, + "step": 9550 }, { "epoch": 0.8767123287671232, - "grad_norm": 44.58627700805664, + "grad_norm": 27.70781135559082, "learning_rate": 1.6849315068493153e-05, - "loss": 0.9117, - "step": 19200 - }, - { - "epoch": 0.8789954337899544, - "grad_norm": 23.16618537902832, - "learning_rate": 1.6815068493150686e-05, - "loss": 0.9699, - "step": 19250 + "loss": 0.8994, + "step": 9600 }, { "epoch": 0.8812785388127854, - "grad_norm": 10.979578971862793, + "grad_norm": 6.332718372344971, "learning_rate": 1.6780821917808223e-05, - "loss": 0.9397, - "step": 19300 - }, - { - "epoch": 0.8835616438356164, - "grad_norm": 12.799473762512207, - "learning_rate": 1.6746575342465753e-05, - "loss": 1.0797, - "step": 19350 + "loss": 0.8815, + "step": 9650 }, { "epoch": 0.8858447488584474, - "grad_norm": 6.065539836883545, + "grad_norm": 25.22089195251465, "learning_rate": 1.6712328767123286e-05, - "loss": 1.1177, - "step": 19400 - }, - { - "epoch": 0.8881278538812786, - "grad_norm": 14.423439979553223, - "learning_rate": 1.6678082191780822e-05, - "loss": 0.8818, - "step": 19450 + "loss": 1.0243, + "step": 9700 }, { "epoch": 0.8904109589041096, - "grad_norm": 1.1675946712493896, + "grad_norm": 6.610196590423584, "learning_rate": 1.6643835616438355e-05, - "loss": 0.8819, - "step": 19500 - }, - { - "epoch": 0.8926940639269406, - "grad_norm": 18.09140396118164, - "learning_rate": 1.6609589041095888e-05, - "loss": 0.9183, - "step": 19550 + "loss": 0.856, + "step": 9750 }, { "epoch": 0.8949771689497716, - "grad_norm": 11.98416805267334, + "grad_norm": 9.7604341506958, "learning_rate": 1.6575342465753425e-05, - "loss": 1.0715, - "step": 19600 - }, - { - "epoch": 0.8972602739726028, - "grad_norm": 16.66376495361328, - "learning_rate": 1.6541095890410958e-05, - "loss": 0.7896, - "step": 19650 + "loss": 0.9275, + "step": 9800 }, { "epoch": 0.8995433789954338, - "grad_norm": 17.945493698120117, + "grad_norm": 13.222633361816406, "learning_rate": 1.6506849315068494e-05, - "loss": 0.8961, - "step": 19700 - }, - { - "epoch": 0.9018264840182648, - "grad_norm": 34.556827545166016, - "learning_rate": 1.6472602739726027e-05, - "loss": 0.9543, - "step": 19750 + "loss": 0.7912, + "step": 9850 }, { "epoch": 0.9041095890410958, - "grad_norm": 16.06093406677246, + "grad_norm": 8.07390022277832, "learning_rate": 1.643835616438356e-05, - "loss": 0.913, - "step": 19800 - }, - { - "epoch": 0.906392694063927, - "grad_norm": 28.835208892822266, - "learning_rate": 1.6404109589041096e-05, - "loss": 1.068, - "step": 19850 + "loss": 0.9296, + "step": 9900 }, { "epoch": 0.908675799086758, - "grad_norm": 3.6414895057678223, + "grad_norm": 8.25124740600586, "learning_rate": 1.636986301369863e-05, - "loss": 0.9695, - "step": 19900 - }, - { - "epoch": 0.910958904109589, - "grad_norm": 16.142841339111328, - "learning_rate": 1.6335616438356163e-05, - "loss": 1.0057, - "step": 19950 + "loss": 0.9269, + "step": 9950 }, { "epoch": 0.91324200913242, - "grad_norm": 15.585680961608887, + "grad_norm": 22.882007598876953, "learning_rate": 1.63013698630137e-05, - "loss": 0.964, - "step": 20000 - }, - { - "epoch": 0.9155251141552512, - "grad_norm": 23.79058265686035, - "learning_rate": 1.6267123287671232e-05, - "loss": 0.8243, - "step": 20050 + "loss": 0.9694, + "step": 10000 }, { "epoch": 0.9178082191780822, - "grad_norm": 14.258800506591797, + "grad_norm": 8.665299415588379, "learning_rate": 1.623287671232877e-05, - "loss": 0.9247, - "step": 20100 - }, - { - "epoch": 0.9200913242009132, - "grad_norm": 16.076623916625977, - "learning_rate": 1.61986301369863e-05, - "loss": 0.8446, - "step": 20150 + "loss": 0.8643, + "step": 10050 }, { "epoch": 0.9223744292237442, - "grad_norm": 19.330589294433594, + "grad_norm": 13.816045761108398, "learning_rate": 1.6164383561643835e-05, - "loss": 1.0295, - "step": 20200 - }, - { - "epoch": 0.9246575342465754, - "grad_norm": 29.074434280395508, - "learning_rate": 1.613013698630137e-05, - "loss": 1.0043, - "step": 20250 + "loss": 0.9476, + "step": 10100 }, { "epoch": 0.9269406392694064, - "grad_norm": 19.37755012512207, + "grad_norm": 11.227286338806152, "learning_rate": 1.6095890410958904e-05, - "loss": 0.8284, - "step": 20300 - }, - { - "epoch": 0.9292237442922374, - "grad_norm": 17.540924072265625, - "learning_rate": 1.6061643835616437e-05, - "loss": 0.9585, - "step": 20350 + "loss": 0.8805, + "step": 10150 }, { "epoch": 0.9315068493150684, - "grad_norm": 15.943336486816406, + "grad_norm": 7.756823539733887, "learning_rate": 1.6027397260273974e-05, - "loss": 0.9534, - "step": 20400 - }, - { - "epoch": 0.9337899543378996, - "grad_norm": 14.471035957336426, - "learning_rate": 1.5993150684931507e-05, - "loss": 0.9346, - "step": 20450 + "loss": 0.9185, + "step": 10200 }, { "epoch": 0.9360730593607306, - "grad_norm": 2.5866785049438477, + "grad_norm": 3.170640230178833, "learning_rate": 1.5958904109589043e-05, - "loss": 0.874, - "step": 20500 - }, - { - "epoch": 0.9383561643835616, - "grad_norm": 23.115089416503906, - "learning_rate": 1.5924657534246576e-05, - "loss": 1.0657, - "step": 20550 + "loss": 0.8761, + "step": 10250 }, { "epoch": 0.9406392694063926, - "grad_norm": 37.99090576171875, + "grad_norm": 11.567336082458496, "learning_rate": 1.589041095890411e-05, - "loss": 0.8484, - "step": 20600 - }, - { - "epoch": 0.9429223744292238, - "grad_norm": 8.818378448486328, - "learning_rate": 1.5856164383561646e-05, - "loss": 0.8682, - "step": 20650 + "loss": 0.9298, + "step": 10300 }, { "epoch": 0.9452054794520548, - "grad_norm": 64.82261657714844, + "grad_norm": 23.299612045288086, "learning_rate": 1.582191780821918e-05, - "loss": 0.9238, - "step": 20700 - }, - { - "epoch": 0.9474885844748858, - "grad_norm": 4.82705545425415, - "learning_rate": 1.578767123287671e-05, - "loss": 1.0183, - "step": 20750 + "loss": 0.8361, + "step": 10350 }, { "epoch": 0.9497716894977168, - "grad_norm": 35.37221908569336, + "grad_norm": 14.57529067993164, "learning_rate": 1.5753424657534248e-05, - "loss": 0.9661, - "step": 20800 - }, - { - "epoch": 0.952054794520548, - "grad_norm": 59.23238754272461, - "learning_rate": 1.571917808219178e-05, - "loss": 1.1756, - "step": 20850 + "loss": 0.9669, + "step": 10400 }, { "epoch": 0.954337899543379, - "grad_norm": 5.600244045257568, + "grad_norm": 10.777979850769043, "learning_rate": 1.5684931506849318e-05, - "loss": 0.9528, - "step": 20900 - }, - { - "epoch": 0.95662100456621, - "grad_norm": 36.54751968383789, - "learning_rate": 1.565068493150685e-05, - "loss": 0.8113, - "step": 20950 + "loss": 0.9632, + "step": 10450 }, { "epoch": 0.958904109589041, - "grad_norm": 21.206743240356445, + "grad_norm": 25.46247100830078, "learning_rate": 1.5616438356164384e-05, - "loss": 1.0184, - "step": 21000 - }, - { - "epoch": 0.9611872146118722, - "grad_norm": 5.483078479766846, - "learning_rate": 1.558219178082192e-05, - "loss": 1.0164, - "step": 21050 + "loss": 0.8625, + "step": 10500 }, { "epoch": 0.9634703196347032, - "grad_norm": 10.294635772705078, - "learning_rate": 1.5547945205479453e-05, - "loss": 0.8765, - "step": 21100 - }, - { - "epoch": 0.9657534246575342, - "grad_norm": 15.811431884765625, - "learning_rate": 1.5513698630136986e-05, - "loss": 1.0893, - "step": 21150 + "grad_norm": 8.925729751586914, + "learning_rate": 1.5547945205479453e-05, + "loss": 0.903, + "step": 10550 }, { "epoch": 0.9680365296803652, - "grad_norm": 12.22139835357666, + "grad_norm": 11.709217071533203, "learning_rate": 1.5479452054794523e-05, - "loss": 0.8607, - "step": 21200 - }, - { - "epoch": 0.9703196347031964, - "grad_norm": 32.24748229980469, - "learning_rate": 1.5445205479452056e-05, - "loss": 0.9461, - "step": 21250 + "loss": 0.9446, + "step": 10600 }, { "epoch": 0.9726027397260274, - "grad_norm": 105.09614562988281, + "grad_norm": 36.75910568237305, "learning_rate": 1.5410958904109592e-05, - "loss": 1.0007, - "step": 21300 - }, - { - "epoch": 0.9748858447488584, - "grad_norm": 39.1617546081543, - "learning_rate": 1.5376712328767125e-05, - "loss": 0.8074, - "step": 21350 + "loss": 0.9568, + "step": 10650 }, { "epoch": 0.9771689497716894, - "grad_norm": 54.6515007019043, + "grad_norm": 30.059825897216797, "learning_rate": 1.5342465753424658e-05, - "loss": 1.0728, - "step": 21400 - }, - { - "epoch": 0.9794520547945206, - "grad_norm": 11.429344177246094, - "learning_rate": 1.5308219178082195e-05, - "loss": 1.0846, - "step": 21450 + "loss": 0.9603, + "step": 10700 }, { "epoch": 0.9817351598173516, - "grad_norm": 35.74282455444336, + "grad_norm": 23.478477478027344, "learning_rate": 1.5273972602739728e-05, - "loss": 0.9805, - "step": 21500 - }, - { - "epoch": 0.9840182648401826, - "grad_norm": 28.15215301513672, - "learning_rate": 1.5239726027397259e-05, - "loss": 1.057, - "step": 21550 + "loss": 0.9828, + "step": 10750 }, { "epoch": 0.9863013698630136, - "grad_norm": 24.915042877197266, + "grad_norm": 34.233699798583984, "learning_rate": 1.5205479452054795e-05, - "loss": 0.9953, - "step": 21600 - }, - { - "epoch": 0.9885844748858448, - "grad_norm": 54.70051574707031, - "learning_rate": 1.5171232876712328e-05, - "loss": 0.8781, - "step": 21650 + "loss": 0.9867, + "step": 10800 }, { "epoch": 0.9908675799086758, - "grad_norm": 57.645572662353516, + "grad_norm": 19.399288177490234, "learning_rate": 1.5136986301369865e-05, - "loss": 0.9031, - "step": 21700 - }, - { - "epoch": 0.9931506849315068, - "grad_norm": 49.95241165161133, - "learning_rate": 1.5102739726027398e-05, - "loss": 1.1313, - "step": 21750 + "loss": 0.8618, + "step": 10850 }, { "epoch": 0.9954337899543378, - "grad_norm": 11.169427871704102, + "grad_norm": 9.319437026977539, "learning_rate": 1.5068493150684931e-05, - "loss": 0.7831, - "step": 21800 - }, - { - "epoch": 0.997716894977169, - "grad_norm": 1.9000864028930664, - "learning_rate": 1.5034246575342467e-05, - "loss": 0.9135, - "step": 21850 + "loss": 0.9352, + "step": 10900 }, { "epoch": 1.0, - "grad_norm": 8.12338638305664, + "grad_norm": 8.767410278320312, "learning_rate": 1.5e-05, - "loss": 0.8517, - "step": 21900 - }, - { - "epoch": 1.0022831050228311, - "grad_norm": 9.123151779174805, - "learning_rate": 1.4965753424657535e-05, - "loss": 0.8109, - "step": 21950 + "loss": 0.8511, + "step": 10950 }, { "epoch": 1.004566210045662, - "grad_norm": 1.5693093538284302, + "grad_norm": 15.091998100280762, "learning_rate": 1.4931506849315068e-05, - "loss": 0.637, - "step": 22000 - }, - { - "epoch": 1.0068493150684932, - "grad_norm": 57.09550094604492, - "learning_rate": 1.4897260273972603e-05, - "loss": 0.7138, - "step": 22050 + "loss": 0.6653, + "step": 11000 }, { "epoch": 1.009132420091324, - "grad_norm": 22.930618286132812, + "grad_norm": 23.510337829589844, "learning_rate": 1.4863013698630138e-05, - "loss": 0.5267, - "step": 22100 - }, - { - "epoch": 1.0114155251141552, - "grad_norm": 6.480524063110352, - "learning_rate": 1.4828767123287672e-05, - "loss": 0.583, - "step": 22150 + "loss": 0.6163, + "step": 11050 }, { "epoch": 1.0136986301369864, - "grad_norm": 8.956890106201172, + "grad_norm": 8.980317115783691, "learning_rate": 1.4794520547945205e-05, - "loss": 0.838, - "step": 22200 - }, - { - "epoch": 1.0159817351598173, - "grad_norm": 28.80997657775879, - "learning_rate": 1.476027397260274e-05, - "loss": 0.7791, - "step": 22250 + "loss": 0.7783, + "step": 11100 }, { "epoch": 1.0182648401826484, - "grad_norm": 51.440643310546875, + "grad_norm": 36.67283248901367, "learning_rate": 1.4726027397260275e-05, - "loss": 0.6699, - "step": 22300 - }, - { - "epoch": 1.0205479452054795, - "grad_norm": 16.62401580810547, - "learning_rate": 1.469178082191781e-05, - "loss": 0.6056, - "step": 22350 + "loss": 0.7681, + "step": 11150 }, { "epoch": 1.0228310502283104, - "grad_norm": 6.103922367095947, + "grad_norm": 4.734091758728027, "learning_rate": 1.4657534246575343e-05, - "loss": 0.7163, - "step": 22400 - }, - { - "epoch": 1.0251141552511416, - "grad_norm": 13.347216606140137, - "learning_rate": 1.4623287671232877e-05, - "loss": 0.8158, - "step": 22450 + "loss": 0.6181, + "step": 11200 }, { "epoch": 1.0273972602739727, - "grad_norm": 10.218559265136719, + "grad_norm": 7.513982772827148, "learning_rate": 1.4589041095890412e-05, - "loss": 0.7067, - "step": 22500 - }, - { - "epoch": 1.0296803652968036, - "grad_norm": 20.360658645629883, - "learning_rate": 1.4554794520547945e-05, - "loss": 0.5413, - "step": 22550 + "loss": 0.7297, + "step": 11250 }, { "epoch": 1.0319634703196348, - "grad_norm": 14.813282012939453, + "grad_norm": 16.1832275390625, "learning_rate": 1.4520547945205478e-05, - "loss": 0.7121, - "step": 22600 - }, - { - "epoch": 1.0342465753424657, - "grad_norm": 4.041282653808594, - "learning_rate": 1.4486301369863013e-05, - "loss": 0.6259, - "step": 22650 + "loss": 0.6487, + "step": 11300 }, { "epoch": 1.0365296803652968, - "grad_norm": 8.718100547790527, + "grad_norm": 25.557220458984375, "learning_rate": 1.4452054794520548e-05, - "loss": 0.679, - "step": 22700 - }, - { - "epoch": 1.038812785388128, - "grad_norm": 24.674726486206055, - "learning_rate": 1.4417808219178082e-05, - "loss": 0.5784, - "step": 22750 + "loss": 0.7089, + "step": 11350 }, { "epoch": 1.0410958904109588, - "grad_norm": 18.543848037719727, + "grad_norm": 11.07717227935791, "learning_rate": 1.4383561643835615e-05, - "loss": 0.6259, - "step": 22800 - }, - { - "epoch": 1.04337899543379, - "grad_norm": 11.60615348815918, - "learning_rate": 1.434931506849315e-05, - "loss": 0.6859, - "step": 22850 + "loss": 0.6266, + "step": 11400 }, { "epoch": 1.045662100456621, - "grad_norm": 19.409120559692383, + "grad_norm": 11.381722450256348, "learning_rate": 1.4315068493150685e-05, - "loss": 0.724, - "step": 22900 - }, - { - "epoch": 1.047945205479452, - "grad_norm": 18.087451934814453, - "learning_rate": 1.428082191780822e-05, - "loss": 0.6719, - "step": 22950 + "loss": 0.6758, + "step": 11450 }, { "epoch": 1.0502283105022832, - "grad_norm": 37.11878967285156, + "grad_norm": 9.080512046813965, "learning_rate": 1.4246575342465753e-05, - "loss": 0.5994, - "step": 23000 - }, - { - "epoch": 1.052511415525114, - "grad_norm": 35.999149322509766, - "learning_rate": 1.4212328767123287e-05, - "loss": 0.8059, - "step": 23050 + "loss": 0.6865, + "step": 11500 }, { "epoch": 1.0547945205479452, - "grad_norm": 23.739049911499023, + "grad_norm": 10.727092742919922, "learning_rate": 1.4178082191780822e-05, - "loss": 0.4798, - "step": 23100 - }, - { - "epoch": 1.0570776255707763, - "grad_norm": 4.680600166320801, - "learning_rate": 1.4143835616438357e-05, - "loss": 0.6557, - "step": 23150 + "loss": 0.6494, + "step": 11550 }, { "epoch": 1.0593607305936072, - "grad_norm": 31.171585083007812, + "grad_norm": 4.969435691833496, "learning_rate": 1.410958904109589e-05, - "loss": 0.6122, - "step": 23200 - }, - { - "epoch": 1.0616438356164384, - "grad_norm": 10.616646766662598, - "learning_rate": 1.4075342465753425e-05, - "loss": 0.7256, - "step": 23250 + "loss": 0.5874, + "step": 11600 }, { "epoch": 1.0639269406392695, - "grad_norm": 5.910991191864014, + "grad_norm": 12.03378963470459, "learning_rate": 1.404109589041096e-05, - "loss": 0.5245, - "step": 23300 - }, - { - "epoch": 1.0662100456621004, - "grad_norm": 6.659880638122559, - "learning_rate": 1.4006849315068494e-05, - "loss": 0.6245, - "step": 23350 + "loss": 0.6301, + "step": 11650 }, { "epoch": 1.0684931506849316, - "grad_norm": 28.779626846313477, + "grad_norm": 8.797894477844238, "learning_rate": 1.3972602739726027e-05, - "loss": 0.7664, - "step": 23400 - }, - { - "epoch": 1.0707762557077625, - "grad_norm": 42.86344909667969, - "learning_rate": 1.3938356164383562e-05, - "loss": 0.7317, - "step": 23450 + "loss": 0.6839, + "step": 11700 }, { "epoch": 1.0730593607305936, - "grad_norm": 17.784263610839844, + "grad_norm": 9.395676612854004, "learning_rate": 1.3904109589041097e-05, - "loss": 0.7032, - "step": 23500 - }, - { - "epoch": 1.0753424657534247, - "grad_norm": 33.24246597290039, - "learning_rate": 1.3869863013698631e-05, - "loss": 0.6477, - "step": 23550 + "loss": 0.6672, + "step": 11750 }, { "epoch": 1.0776255707762556, - "grad_norm": 23.703388214111328, + "grad_norm": 26.268247604370117, "learning_rate": 1.3835616438356164e-05, - "loss": 0.5103, - "step": 23600 - }, - { - "epoch": 1.0799086757990868, - "grad_norm": 27.350101470947266, - "learning_rate": 1.38013698630137e-05, - "loss": 0.7417, - "step": 23650 + "loss": 0.6398, + "step": 11800 }, { "epoch": 1.0821917808219177, - "grad_norm": 32.2100715637207, + "grad_norm": 19.63582992553711, "learning_rate": 1.3767123287671234e-05, - "loss": 0.9106, - "step": 23700 - }, - { - "epoch": 1.0844748858447488, - "grad_norm": 8.949542999267578, - "learning_rate": 1.3732876712328769e-05, - "loss": 0.6949, - "step": 23750 + "loss": 0.7825, + "step": 11850 }, { "epoch": 1.08675799086758, - "grad_norm": 31.792240142822266, + "grad_norm": 23.42495346069336, "learning_rate": 1.3698630136986302e-05, - "loss": 0.725, - "step": 23800 - }, - { - "epoch": 1.0890410958904109, - "grad_norm": 51.95265579223633, - "learning_rate": 1.3664383561643835e-05, - "loss": 0.73, - "step": 23850 + "loss": 0.7078, + "step": 11900 }, { "epoch": 1.091324200913242, - "grad_norm": 19.113056182861328, + "grad_norm": 29.21826934814453, "learning_rate": 1.363013698630137e-05, - "loss": 0.6004, - "step": 23900 - }, - { - "epoch": 1.0936073059360731, - "grad_norm": 16.85398292541504, - "learning_rate": 1.3595890410958904e-05, - "loss": 0.5843, - "step": 23950 + "loss": 0.6973, + "step": 11950 }, { "epoch": 1.095890410958904, - "grad_norm": 20.591157913208008, + "grad_norm": 13.69019603729248, "learning_rate": 1.3561643835616437e-05, - "loss": 0.6938, - "step": 24000 - }, - { - "epoch": 1.0981735159817352, - "grad_norm": 22.592805862426758, - "learning_rate": 1.3527397260273972e-05, - "loss": 0.6366, - "step": 24050 + "loss": 0.6325, + "step": 12000 }, { "epoch": 1.1004566210045663, - "grad_norm": 77.25360870361328, + "grad_norm": 22.293821334838867, "learning_rate": 1.3493150684931507e-05, - "loss": 0.7245, - "step": 24100 - }, - { - "epoch": 1.1027397260273972, - "grad_norm": 35.447452545166016, - "learning_rate": 1.3458904109589042e-05, - "loss": 0.7001, - "step": 24150 + "loss": 0.6833, + "step": 12050 }, { "epoch": 1.1050228310502284, - "grad_norm": 50.135650634765625, + "grad_norm": 9.701814651489258, "learning_rate": 1.3424657534246575e-05, - "loss": 0.6473, - "step": 24200 - }, - { - "epoch": 1.1073059360730593, - "grad_norm": 9.847333908081055, - "learning_rate": 1.339041095890411e-05, - "loss": 0.6206, - "step": 24250 + "loss": 0.6518, + "step": 12100 }, { "epoch": 1.1095890410958904, - "grad_norm": 16.841964721679688, + "grad_norm": 25.30548667907715, "learning_rate": 1.3356164383561644e-05, - "loss": 0.754, - "step": 24300 - }, - { - "epoch": 1.1118721461187215, - "grad_norm": 25.09712028503418, - "learning_rate": 1.3321917808219179e-05, - "loss": 0.7678, - "step": 24350 + "loss": 0.7458, + "step": 12150 }, { "epoch": 1.1141552511415524, - "grad_norm": 20.83018684387207, + "grad_norm": 11.800670623779297, "learning_rate": 1.3287671232876712e-05, - "loss": 0.7375, - "step": 24400 - }, - { - "epoch": 1.1164383561643836, - "grad_norm": 1.3051903247833252, - "learning_rate": 1.3253424657534247e-05, - "loss": 0.73, - "step": 24450 + "loss": 0.755, + "step": 12200 }, { "epoch": 1.1187214611872145, - "grad_norm": 21.558069229125977, + "grad_norm": 15.019549369812012, "learning_rate": 1.3219178082191781e-05, - "loss": 0.6535, - "step": 24500 - }, - { - "epoch": 1.1210045662100456, - "grad_norm": 53.48995590209961, - "learning_rate": 1.3184931506849316e-05, - "loss": 0.7592, - "step": 24550 + "loss": 0.6871, + "step": 12250 }, { "epoch": 1.1232876712328768, - "grad_norm": 6.8114166259765625, + "grad_norm": 8.90066909790039, "learning_rate": 1.3150684931506849e-05, - "loss": 0.6735, - "step": 24600 - }, - { - "epoch": 1.1255707762557077, - "grad_norm": 16.559179306030273, - "learning_rate": 1.3116438356164384e-05, - "loss": 0.4556, - "step": 24650 + "loss": 0.738, + "step": 12300 }, { "epoch": 1.1278538812785388, - "grad_norm": 2.124957323074341, + "grad_norm": 6.738426685333252, "learning_rate": 1.3082191780821919e-05, - "loss": 0.92, - "step": 24700 - }, - { - "epoch": 1.13013698630137, - "grad_norm": 34.67999267578125, - "learning_rate": 1.3047945205479453e-05, - "loss": 0.6309, - "step": 24750 + "loss": 0.6828, + "step": 12350 }, { "epoch": 1.1324200913242009, - "grad_norm": 9.184309005737305, + "grad_norm": 7.866949558258057, "learning_rate": 1.3013698630136986e-05, - "loss": 0.7584, - "step": 24800 - }, - { - "epoch": 1.134703196347032, - "grad_norm": 17.552547454833984, - "learning_rate": 1.2979452054794521e-05, - "loss": 0.6098, - "step": 24850 + "loss": 0.708, + "step": 12400 }, { "epoch": 1.1369863013698631, - "grad_norm": 37.24542999267578, + "grad_norm": 26.066892623901367, "learning_rate": 1.2945205479452056e-05, - "loss": 0.8336, - "step": 24900 - }, - { - "epoch": 1.139269406392694, - "grad_norm": 52.120880126953125, - "learning_rate": 1.291095890410959e-05, - "loss": 0.6372, - "step": 24950 + "loss": 0.7078, + "step": 12450 }, { "epoch": 1.1415525114155252, - "grad_norm": 13.773005485534668, + "grad_norm": 7.540081024169922, "learning_rate": 1.2876712328767124e-05, - "loss": 0.6734, - "step": 25000 - }, - { - "epoch": 1.143835616438356, - "grad_norm": 12.726426124572754, - "learning_rate": 1.2842465753424658e-05, - "loss": 0.5249, - "step": 25050 + "loss": 0.6958, + "step": 12500 }, { "epoch": 1.1461187214611872, - "grad_norm": 8.800257682800293, + "grad_norm": 13.667712211608887, "learning_rate": 1.2808219178082193e-05, - "loss": 0.5094, - "step": 25100 - }, - { - "epoch": 1.1484018264840183, - "grad_norm": 77.74162292480469, - "learning_rate": 1.2773972602739726e-05, - "loss": 0.7281, - "step": 25150 + "loss": 0.5233, + "step": 12550 }, { "epoch": 1.1506849315068493, - "grad_norm": 8.394207954406738, + "grad_norm": 17.8378849029541, "learning_rate": 1.2739726027397259e-05, - "loss": 0.615, - "step": 25200 - }, - { - "epoch": 1.1529680365296804, - "grad_norm": 62.93704605102539, - "learning_rate": 1.2705479452054794e-05, - "loss": 0.5909, - "step": 25250 + "loss": 0.669, + "step": 12600 }, { "epoch": 1.1552511415525113, - "grad_norm": 18.461288452148438, + "grad_norm": 25.038570404052734, "learning_rate": 1.2671232876712329e-05, - "loss": 0.6578, - "step": 25300 - }, - { - "epoch": 1.1575342465753424, - "grad_norm": 17.798723220825195, - "learning_rate": 1.2636986301369863e-05, - "loss": 0.7566, - "step": 25350 + "loss": 0.5989, + "step": 12650 }, { "epoch": 1.1598173515981736, - "grad_norm": 0.32706037163734436, + "grad_norm": 2.4400246143341064, "learning_rate": 1.2602739726027396e-05, - "loss": 0.6193, - "step": 25400 - }, - { - "epoch": 1.1621004566210045, - "grad_norm": 19.030363082885742, - "learning_rate": 1.2568493150684931e-05, - "loss": 0.6249, - "step": 25450 + "loss": 0.745, + "step": 12700 }, { "epoch": 1.1643835616438356, - "grad_norm": 0.1700681447982788, + "grad_norm": 3.209836006164551, "learning_rate": 1.2534246575342466e-05, - "loss": 0.7782, - "step": 25500 - }, - { - "epoch": 1.1666666666666667, - "grad_norm": 147.6623077392578, - "learning_rate": 1.25e-05, - "loss": 0.6307, - "step": 25550 + "loss": 0.6943, + "step": 12750 }, { "epoch": 1.1689497716894977, - "grad_norm": 13.00705623626709, + "grad_norm": 12.491949081420898, "learning_rate": 1.2465753424657534e-05, - "loss": 0.6638, - "step": 25600 - }, - { - "epoch": 1.1712328767123288, - "grad_norm": 10.626334190368652, - "learning_rate": 1.2431506849315068e-05, - "loss": 0.7118, - "step": 25650 + "loss": 0.6115, + "step": 12800 }, { "epoch": 1.17351598173516, - "grad_norm": 5.709634304046631, + "grad_norm": 11.220638275146484, "learning_rate": 1.2397260273972603e-05, - "loss": 0.6299, - "step": 25700 - }, - { - "epoch": 1.1757990867579908, - "grad_norm": 30.29009246826172, - "learning_rate": 1.2363013698630138e-05, - "loss": 0.7486, - "step": 25750 + "loss": 0.5898, + "step": 12850 }, { "epoch": 1.178082191780822, - "grad_norm": 18.62900733947754, + "grad_norm": 16.73185920715332, "learning_rate": 1.2328767123287671e-05, - "loss": 0.5936, - "step": 25800 - }, - { - "epoch": 1.1803652968036529, - "grad_norm": 14.033234596252441, - "learning_rate": 1.2294520547945206e-05, - "loss": 0.7004, - "step": 25850 + "loss": 0.6734, + "step": 12900 }, { "epoch": 1.182648401826484, - "grad_norm": 38.063873291015625, + "grad_norm": 14.307918548583984, "learning_rate": 1.226027397260274e-05, - "loss": 0.7239, - "step": 25900 - }, - { - "epoch": 1.1849315068493151, - "grad_norm": 40.94581985473633, - "learning_rate": 1.2226027397260275e-05, - "loss": 0.6232, - "step": 25950 + "loss": 0.7266, + "step": 12950 }, { "epoch": 1.187214611872146, - "grad_norm": 10.695091247558594, + "grad_norm": 14.836384773254395, "learning_rate": 1.2191780821917808e-05, - "loss": 0.8526, - "step": 26000 - }, - { - "epoch": 1.1894977168949772, - "grad_norm": 19.597412109375, - "learning_rate": 1.2157534246575343e-05, - "loss": 0.6878, - "step": 26050 + "loss": 0.7586, + "step": 13000 }, { "epoch": 1.191780821917808, - "grad_norm": 57.820003509521484, + "grad_norm": 29.282228469848633, "learning_rate": 1.2123287671232878e-05, - "loss": 0.7689, - "step": 26100 - }, - { - "epoch": 1.1940639269406392, - "grad_norm": 19.449691772460938, - "learning_rate": 1.2089041095890412e-05, - "loss": 0.6267, - "step": 26150 + "loss": 0.7917, + "step": 13050 }, { "epoch": 1.1963470319634704, - "grad_norm": 30.782194137573242, + "grad_norm": 26.005468368530273, "learning_rate": 1.2054794520547945e-05, - "loss": 0.6165, - "step": 26200 - }, - { - "epoch": 1.1986301369863013, - "grad_norm": 27.298837661743164, - "learning_rate": 1.202054794520548e-05, - "loss": 0.8057, - "step": 26250 + "loss": 0.6413, + "step": 13100 }, { "epoch": 1.2009132420091324, - "grad_norm": 185.9980010986328, + "grad_norm": 8.961703300476074, "learning_rate": 1.1986301369863015e-05, - "loss": 0.7694, - "step": 26300 - }, - { - "epoch": 1.2031963470319635, - "grad_norm": 3.1694886684417725, - "learning_rate": 1.195205479452055e-05, - "loss": 0.6598, - "step": 26350 + "loss": 0.792, + "step": 13150 }, { "epoch": 1.2054794520547945, - "grad_norm": 6.1167097091674805, + "grad_norm": 10.702567100524902, "learning_rate": 1.1917808219178083e-05, - "loss": 0.6464, - "step": 26400 - }, - { - "epoch": 1.2077625570776256, - "grad_norm": 15.646038055419922, - "learning_rate": 1.1883561643835616e-05, - "loss": 0.9328, - "step": 26450 + "loss": 0.6594, + "step": 13200 }, { "epoch": 1.2100456621004567, - "grad_norm": 7.978808403015137, + "grad_norm": 13.418671607971191, "learning_rate": 1.184931506849315e-05, - "loss": 0.6384, - "step": 26500 - }, - { - "epoch": 1.2123287671232876, - "grad_norm": 21.067642211914062, - "learning_rate": 1.1815068493150685e-05, - "loss": 0.8629, - "step": 26550 + "loss": 0.7547, + "step": 13250 }, { "epoch": 1.2146118721461188, - "grad_norm": 27.418428421020508, + "grad_norm": 17.413429260253906, "learning_rate": 1.1780821917808218e-05, - "loss": 0.6208, - "step": 26600 - }, - { - "epoch": 1.2168949771689497, - "grad_norm": 10.672882080078125, - "learning_rate": 1.1746575342465753e-05, - "loss": 0.5777, - "step": 26650 + "loss": 0.7908, + "step": 13300 }, { "epoch": 1.2191780821917808, - "grad_norm": 82.25353240966797, + "grad_norm": 18.375572204589844, "learning_rate": 1.1712328767123288e-05, - "loss": 0.9331, - "step": 26700 - }, - { - "epoch": 1.221461187214612, - "grad_norm": 12.071101188659668, - "learning_rate": 1.1678082191780822e-05, - "loss": 0.7766, - "step": 26750 + "loss": 0.7298, + "step": 13350 }, { "epoch": 1.2237442922374429, - "grad_norm": 23.268993377685547, + "grad_norm": 12.126824378967285, "learning_rate": 1.1643835616438355e-05, - "loss": 0.6875, - "step": 26800 - }, - { - "epoch": 1.226027397260274, - "grad_norm": 39.18050003051758, - "learning_rate": 1.160958904109589e-05, - "loss": 0.7285, - "step": 26850 + "loss": 0.7482, + "step": 13400 }, { "epoch": 1.228310502283105, - "grad_norm": 3.805318832397461, + "grad_norm": 6.292917728424072, "learning_rate": 1.1575342465753425e-05, - "loss": 0.6275, - "step": 26900 - }, - { - "epoch": 1.230593607305936, - "grad_norm": 39.39872741699219, - "learning_rate": 1.154109589041096e-05, - "loss": 0.6889, - "step": 26950 + "loss": 0.7049, + "step": 13450 }, { "epoch": 1.2328767123287672, - "grad_norm": 16.79233169555664, + "grad_norm": 48.449954986572266, "learning_rate": 1.1506849315068493e-05, - "loss": 0.5247, - "step": 27000 - }, - { - "epoch": 1.235159817351598, - "grad_norm": 30.6214656829834, - "learning_rate": 1.1472602739726027e-05, - "loss": 0.5567, - "step": 27050 + "loss": 0.5808, + "step": 13500 }, { "epoch": 1.2374429223744292, - "grad_norm": 5.248475551605225, + "grad_norm": 11.724007606506348, "learning_rate": 1.1438356164383562e-05, - "loss": 0.7133, - "step": 27100 - }, - { - "epoch": 1.2397260273972603, - "grad_norm": 16.61432456970215, - "learning_rate": 1.1404109589041097e-05, - "loss": 0.6672, - "step": 27150 + "loss": 0.6694, + "step": 13550 }, { "epoch": 1.2420091324200913, - "grad_norm": 16.011980056762695, + "grad_norm": 11.694669723510742, "learning_rate": 1.136986301369863e-05, - "loss": 0.7677, - "step": 27200 - }, - { - "epoch": 1.2442922374429224, - "grad_norm": 34.770938873291016, - "learning_rate": 1.1335616438356165e-05, - "loss": 0.6866, - "step": 27250 + "loss": 0.7223, + "step": 13600 }, { "epoch": 1.2465753424657535, - "grad_norm": 39.278892517089844, + "grad_norm": 18.402746200561523, "learning_rate": 1.13013698630137e-05, - "loss": 0.7042, - "step": 27300 - }, - { - "epoch": 1.2488584474885844, - "grad_norm": 21.77368927001953, - "learning_rate": 1.1267123287671234e-05, - "loss": 0.6923, - "step": 27350 + "loss": 0.6794, + "step": 13650 }, { "epoch": 1.2511415525114156, - "grad_norm": 8.305987358093262, + "grad_norm": 10.988481521606445, "learning_rate": 1.1232876712328767e-05, - "loss": 0.6222, - "step": 27400 - }, - { - "epoch": 1.2534246575342465, - "grad_norm": 27.238414764404297, - "learning_rate": 1.1198630136986302e-05, - "loss": 0.6661, - "step": 27450 + "loss": 0.618, + "step": 13700 }, { "epoch": 1.2557077625570776, - "grad_norm": 25.92155647277832, + "grad_norm": 8.175172805786133, "learning_rate": 1.1164383561643837e-05, - "loss": 0.7408, - "step": 27500 - }, - { - "epoch": 1.2579908675799087, - "grad_norm": 16.504167556762695, - "learning_rate": 1.1130136986301371e-05, - "loss": 0.5134, - "step": 27550 + "loss": 0.6552, + "step": 13750 }, { "epoch": 1.2602739726027397, - "grad_norm": 33.571617126464844, + "grad_norm": 17.85883903503418, "learning_rate": 1.1095890410958904e-05, - "loss": 0.7309, - "step": 27600 - }, - { - "epoch": 1.2625570776255708, - "grad_norm": 10.731171607971191, - "learning_rate": 1.106164383561644e-05, - "loss": 0.7008, - "step": 27650 + "loss": 0.6645, + "step": 13800 }, { "epoch": 1.2648401826484017, - "grad_norm": 9.138223648071289, + "grad_norm": 6.0199198722839355, "learning_rate": 1.1027397260273972e-05, - "loss": 0.7206, - "step": 27700 - }, - { - "epoch": 1.2671232876712328, - "grad_norm": 19.61602210998535, - "learning_rate": 1.0993150684931507e-05, - "loss": 0.7169, - "step": 27750 + "loss": 0.7725, + "step": 13850 }, { "epoch": 1.269406392694064, - "grad_norm": 25.721527099609375, + "grad_norm": 12.200461387634277, "learning_rate": 1.095890410958904e-05, - "loss": 0.6631, - "step": 27800 - }, - { - "epoch": 1.271689497716895, - "grad_norm": 15.044477462768555, - "learning_rate": 1.0924657534246575e-05, - "loss": 0.6167, - "step": 27850 + "loss": 0.6685, + "step": 13900 }, { "epoch": 1.273972602739726, - "grad_norm": 13.365096092224121, + "grad_norm": 11.29808235168457, "learning_rate": 1.089041095890411e-05, - "loss": 0.5629, - "step": 27900 - }, - { - "epoch": 1.2762557077625571, - "grad_norm": 26.571229934692383, - "learning_rate": 1.0856164383561644e-05, - "loss": 0.6986, - "step": 27950 + "loss": 0.5974, + "step": 13950 }, { "epoch": 1.278538812785388, - "grad_norm": 23.07392692565918, + "grad_norm": 20.522750854492188, "learning_rate": 1.0821917808219177e-05, - "loss": 0.6209, - "step": 28000 - }, - { - "epoch": 1.2808219178082192, - "grad_norm": 1.8312214612960815, - "learning_rate": 1.0787671232876712e-05, - "loss": 0.7247, - "step": 28050 + "loss": 0.6497, + "step": 14000 }, { "epoch": 1.2831050228310503, - "grad_norm": 3.6338565349578857, + "grad_norm": 4.903714656829834, "learning_rate": 1.0753424657534247e-05, - "loss": 0.896, - "step": 28100 - }, - { - "epoch": 1.2853881278538812, - "grad_norm": 22.483678817749023, - "learning_rate": 1.0719178082191782e-05, - "loss": 0.7087, - "step": 28150 + "loss": 0.805, + "step": 14050 }, { "epoch": 1.2876712328767124, - "grad_norm": 29.26844024658203, + "grad_norm": 14.09408950805664, "learning_rate": 1.0684931506849315e-05, - "loss": 0.7095, - "step": 28200 - }, - { - "epoch": 1.2899543378995433, - "grad_norm": 0.29415565729141235, - "learning_rate": 1.065068493150685e-05, - "loss": 0.5849, - "step": 28250 + "loss": 0.6975, + "step": 14100 }, { "epoch": 1.2922374429223744, - "grad_norm": 0.7176687121391296, + "grad_norm": 5.306293964385986, "learning_rate": 1.0616438356164384e-05, - "loss": 0.657, - "step": 28300 - }, - { - "epoch": 1.2945205479452055, - "grad_norm": 11.345915794372559, - "learning_rate": 1.0582191780821919e-05, - "loss": 0.5598, - "step": 28350 + "loss": 0.6181, + "step": 14150 }, { "epoch": 1.2968036529680365, - "grad_norm": 35.81932067871094, + "grad_norm": 15.217981338500977, "learning_rate": 1.0547945205479452e-05, - "loss": 0.8267, - "step": 28400 - }, - { - "epoch": 1.2990867579908676, - "grad_norm": 30.678194046020508, - "learning_rate": 1.0513698630136987e-05, - "loss": 0.6508, - "step": 28450 + "loss": 0.6363, + "step": 14200 }, { "epoch": 1.3013698630136985, - "grad_norm": 3.55430006980896, + "grad_norm": 2.86833119392395, "learning_rate": 1.0479452054794521e-05, - "loss": 0.6641, - "step": 28500 - }, - { - "epoch": 1.3036529680365296, - "grad_norm": 18.566883087158203, - "learning_rate": 1.0445205479452056e-05, - "loss": 0.7889, - "step": 28550 + "loss": 0.632, + "step": 14250 }, { "epoch": 1.3059360730593608, - "grad_norm": 19.019390106201172, + "grad_norm": 12.824779510498047, "learning_rate": 1.0410958904109589e-05, - "loss": 0.6177, - "step": 28600 - }, - { - "epoch": 1.308219178082192, - "grad_norm": 17.26028823852539, - "learning_rate": 1.0376712328767124e-05, - "loss": 0.7045, - "step": 28650 + "loss": 0.7499, + "step": 14300 }, { "epoch": 1.3105022831050228, - "grad_norm": 33.43575668334961, + "grad_norm": 17.47176170349121, "learning_rate": 1.0342465753424659e-05, - "loss": 0.6046, - "step": 28700 - }, - { - "epoch": 1.312785388127854, - "grad_norm": 51.09929275512695, - "learning_rate": 1.0308219178082193e-05, - "loss": 0.5056, - "step": 28750 + "loss": 0.581, + "step": 14350 }, { "epoch": 1.3150684931506849, - "grad_norm": 17.530790328979492, + "grad_norm": 7.620934009552002, "learning_rate": 1.0273972602739726e-05, - "loss": 0.6507, - "step": 28800 - }, - { - "epoch": 1.317351598173516, - "grad_norm": 26.142900466918945, - "learning_rate": 1.0239726027397261e-05, - "loss": 0.5859, - "step": 28850 + "loss": 0.5913, + "step": 14400 }, { "epoch": 1.3196347031963471, - "grad_norm": 16.933874130249023, + "grad_norm": 11.1875, "learning_rate": 1.0205479452054796e-05, - "loss": 0.7274, - "step": 28900 - }, - { - "epoch": 1.321917808219178, - "grad_norm": 5.472013473510742, - "learning_rate": 1.017123287671233e-05, - "loss": 0.6445, - "step": 28950 + "loss": 0.6465, + "step": 14450 }, { "epoch": 1.3242009132420092, - "grad_norm": 20.826509475708008, + "grad_norm": 24.72510528564453, "learning_rate": 1.0136986301369862e-05, - "loss": 0.7194, - "step": 29000 - }, - { - "epoch": 1.32648401826484, - "grad_norm": 36.42399597167969, - "learning_rate": 1.0102739726027397e-05, - "loss": 0.6405, - "step": 29050 + "loss": 0.6796, + "step": 14500 }, { "epoch": 1.3287671232876712, - "grad_norm": 17.444469451904297, + "grad_norm": 6.102226734161377, "learning_rate": 1.0068493150684931e-05, - "loss": 0.7348, - "step": 29100 - }, - { - "epoch": 1.3310502283105023, - "grad_norm": 12.610512733459473, - "learning_rate": 1.0034246575342466e-05, - "loss": 0.6521, - "step": 29150 + "loss": 0.7074, + "step": 14550 }, { "epoch": 1.3333333333333333, - "grad_norm": 14.737506866455078, + "grad_norm": 6.332953929901123, "learning_rate": 9.999999999999999e-06, - "loss": 0.6979, - "step": 29200 - }, - { - "epoch": 1.3356164383561644, - "grad_norm": 27.808490753173828, - "learning_rate": 9.965753424657534e-06, - "loss": 0.7839, - "step": 29250 + "loss": 0.6338, + "step": 14600 }, { "epoch": 1.3378995433789953, - "grad_norm": 7.368427753448486, + "grad_norm": 28.30670738220215, "learning_rate": 9.931506849315069e-06, - "loss": 0.6117, - "step": 29300 - }, - { - "epoch": 1.3401826484018264, - "grad_norm": 27.834564208984375, - "learning_rate": 9.897260273972603e-06, - "loss": 0.6565, - "step": 29350 + "loss": 0.7109, + "step": 14650 }, { "epoch": 1.3424657534246576, - "grad_norm": 33.823081970214844, + "grad_norm": 40.727230072021484, "learning_rate": 9.863013698630136e-06, - "loss": 0.6681, - "step": 29400 - }, - { - "epoch": 1.3447488584474887, - "grad_norm": 26.18691635131836, - "learning_rate": 9.828767123287671e-06, - "loss": 0.9009, - "step": 29450 + "loss": 0.7342, + "step": 14700 }, { "epoch": 1.3470319634703196, - "grad_norm": 20.92985725402832, + "grad_norm": 11.026389122009277, "learning_rate": 9.794520547945206e-06, - "loss": 0.5297, - "step": 29500 - }, - { - "epoch": 1.3493150684931507, - "grad_norm": 4.244528770446777, - "learning_rate": 9.76027397260274e-06, - "loss": 0.6733, - "step": 29550 + "loss": 0.6972, + "step": 14750 }, { "epoch": 1.3515981735159817, - "grad_norm": 67.45909118652344, + "grad_norm": 16.95206642150879, "learning_rate": 9.726027397260274e-06, - "loss": 0.575, - "step": 29600 - }, - { - "epoch": 1.3538812785388128, - "grad_norm": 26.713943481445312, - "learning_rate": 9.691780821917808e-06, - "loss": 0.7428, - "step": 29650 + "loss": 0.6509, + "step": 14800 }, { "epoch": 1.356164383561644, - "grad_norm": 8.588297843933105, + "grad_norm": 24.887845993041992, "learning_rate": 9.657534246575343e-06, - "loss": 0.5567, - "step": 29700 - }, - { - "epoch": 1.3584474885844748, - "grad_norm": 22.027597427368164, - "learning_rate": 9.623287671232878e-06, - "loss": 0.7798, - "step": 29750 + "loss": 0.6608, + "step": 14850 }, { "epoch": 1.360730593607306, - "grad_norm": 10.512967109680176, + "grad_norm": 5.1824421882629395, "learning_rate": 9.589041095890411e-06, - "loss": 0.5646, - "step": 29800 - }, - { - "epoch": 1.3630136986301369, - "grad_norm": 9.836832046508789, - "learning_rate": 9.554794520547946e-06, - "loss": 0.6904, - "step": 29850 + "loss": 0.664, + "step": 14900 }, { "epoch": 1.365296803652968, - "grad_norm": 74.73491668701172, + "grad_norm": 35.08380889892578, "learning_rate": 9.52054794520548e-06, - "loss": 0.7029, - "step": 29900 - }, - { - "epoch": 1.3675799086757991, - "grad_norm": 36.467491149902344, - "learning_rate": 9.486301369863015e-06, - "loss": 0.7266, - "step": 29950 + "loss": 0.7375, + "step": 14950 }, { "epoch": 1.36986301369863, - "grad_norm": 21.993133544921875, + "grad_norm": 13.273919105529785, "learning_rate": 9.452054794520548e-06, - "loss": 0.6316, - "step": 30000 - }, - { - "epoch": 1.3721461187214612, - "grad_norm": 1.2455904483795166, - "learning_rate": 9.417808219178083e-06, - "loss": 0.5555, - "step": 30050 + "loss": 0.6581, + "step": 15000 }, { "epoch": 1.374429223744292, - "grad_norm": 13.85091495513916, + "grad_norm": 20.243751525878906, "learning_rate": 9.383561643835618e-06, - "loss": 0.7729, - "step": 30100 - }, - { - "epoch": 1.3767123287671232, - "grad_norm": 23.61090850830078, - "learning_rate": 9.349315068493152e-06, - "loss": 0.6255, - "step": 30150 + "loss": 0.7028, + "step": 15050 }, { "epoch": 1.3789954337899544, - "grad_norm": 40.24787139892578, + "grad_norm": 6.9884934425354, "learning_rate": 9.315068493150685e-06, - "loss": 0.7646, - "step": 30200 - }, - { - "epoch": 1.3812785388127855, - "grad_norm": 23.249425888061523, - "learning_rate": 9.28082191780822e-06, - "loss": 0.5965, - "step": 30250 + "loss": 0.6699, + "step": 15100 }, { "epoch": 1.3835616438356164, - "grad_norm": 8.090579986572266, + "grad_norm": 10.110499382019043, "learning_rate": 9.246575342465753e-06, - "loss": 0.4478, - "step": 30300 - }, - { - "epoch": 1.3858447488584476, - "grad_norm": 20.011905670166016, - "learning_rate": 9.212328767123288e-06, - "loss": 0.7777, - "step": 30350 + "loss": 0.5804, + "step": 15150 }, { "epoch": 1.3881278538812785, - "grad_norm": 8.697684288024902, + "grad_norm": 5.272585868835449, "learning_rate": 9.178082191780821e-06, - "loss": 0.6304, - "step": 30400 - }, - { - "epoch": 1.3904109589041096, - "grad_norm": 1.545689582824707, - "learning_rate": 9.143835616438356e-06, - "loss": 0.7816, - "step": 30450 + "loss": 0.7046, + "step": 15200 }, { "epoch": 1.3926940639269407, - "grad_norm": 0.6150842308998108, + "grad_norm": 3.3239293098449707, "learning_rate": 9.10958904109589e-06, - "loss": 0.5802, - "step": 30500 - }, - { - "epoch": 1.3949771689497716, - "grad_norm": 21.5743350982666, - "learning_rate": 9.075342465753425e-06, - "loss": 0.6369, - "step": 30550 + "loss": 0.6621, + "step": 15250 }, { "epoch": 1.3972602739726028, - "grad_norm": 10.7164306640625, + "grad_norm": 9.12402057647705, "learning_rate": 9.041095890410958e-06, - "loss": 0.4902, - "step": 30600 - }, - { - "epoch": 1.3995433789954337, - "grad_norm": 28.598312377929688, - "learning_rate": 9.006849315068493e-06, - "loss": 0.613, - "step": 30650 + "loss": 0.5613, + "step": 15300 }, { "epoch": 1.4018264840182648, - "grad_norm": 24.138431549072266, + "grad_norm": 9.260445594787598, "learning_rate": 8.972602739726028e-06, - "loss": 0.6153, - "step": 30700 - }, - { - "epoch": 1.404109589041096, - "grad_norm": 22.55198860168457, - "learning_rate": 8.938356164383562e-06, - "loss": 0.5419, - "step": 30750 + "loss": 0.6179, + "step": 15350 }, { "epoch": 1.4063926940639269, - "grad_norm": 26.992374420166016, + "grad_norm": 40.01394271850586, "learning_rate": 8.904109589041095e-06, - "loss": 0.5726, - "step": 30800 - }, - { - "epoch": 1.408675799086758, - "grad_norm": 1.8640153408050537, - "learning_rate": 8.86986301369863e-06, - "loss": 0.6794, - "step": 30850 + "loss": 0.5897, + "step": 15400 }, { "epoch": 1.410958904109589, - "grad_norm": 14.333259582519531, + "grad_norm": 43.27082443237305, "learning_rate": 8.835616438356165e-06, - "loss": 0.4972, - "step": 30900 - }, - { - "epoch": 1.41324200913242, - "grad_norm": 64.34575653076172, - "learning_rate": 8.8013698630137e-06, - "loss": 0.9213, - "step": 30950 + "loss": 0.5804, + "step": 15450 }, { "epoch": 1.4155251141552512, - "grad_norm": 9.60162353515625, + "grad_norm": 8.149184226989746, "learning_rate": 8.767123287671233e-06, - "loss": 0.6868, - "step": 31000 - }, - { - "epoch": 1.4178082191780823, - "grad_norm": 8.155502319335938, - "learning_rate": 8.732876712328767e-06, - "loss": 0.6873, - "step": 31050 + "loss": 0.7865, + "step": 15500 }, { "epoch": 1.4200913242009132, - "grad_norm": 5.085892677307129, + "grad_norm": 16.708332061767578, "learning_rate": 8.698630136986302e-06, - "loss": 0.7768, - "step": 31100 - }, - { - "epoch": 1.4223744292237444, - "grad_norm": 52.48747634887695, - "learning_rate": 8.664383561643837e-06, - "loss": 0.687, - "step": 31150 + "loss": 0.6983, + "step": 15550 }, { "epoch": 1.4246575342465753, - "grad_norm": 1.8491209745407104, + "grad_norm": 2.826059103012085, "learning_rate": 8.63013698630137e-06, - "loss": 0.5669, - "step": 31200 - }, - { - "epoch": 1.4269406392694064, - "grad_norm": 12.143204689025879, - "learning_rate": 8.595890410958905e-06, - "loss": 0.5568, - "step": 31250 + "loss": 0.6238, + "step": 15600 }, { "epoch": 1.4292237442922375, - "grad_norm": 2.939903974533081, + "grad_norm": 13.411745071411133, "learning_rate": 8.56164383561644e-06, - "loss": 0.7732, - "step": 31300 - }, - { - "epoch": 1.4315068493150684, - "grad_norm": 17.773130416870117, - "learning_rate": 8.527397260273974e-06, - "loss": 0.7444, - "step": 31350 + "loss": 0.6448, + "step": 15650 }, { "epoch": 1.4337899543378996, - "grad_norm": 47.27958679199219, + "grad_norm": 12.094820976257324, "learning_rate": 8.493150684931507e-06, - "loss": 0.6621, - "step": 31400 - }, - { - "epoch": 1.4360730593607305, - "grad_norm": 50.40327453613281, - "learning_rate": 8.458904109589042e-06, - "loss": 0.84, - "step": 31450 + "loss": 0.6868, + "step": 15700 }, { "epoch": 1.4383561643835616, - "grad_norm": 8.335402488708496, + "grad_norm": 13.272956848144531, "learning_rate": 8.424657534246577e-06, - "loss": 0.6762, - "step": 31500 - }, - { - "epoch": 1.4406392694063928, - "grad_norm": 12.027316093444824, - "learning_rate": 8.390410958904111e-06, - "loss": 0.6736, - "step": 31550 + "loss": 0.6758, + "step": 15750 }, { "epoch": 1.4429223744292237, - "grad_norm": 17.410192489624023, + "grad_norm": 7.710869312286377, "learning_rate": 8.356164383561643e-06, - "loss": 0.5072, - "step": 31600 - }, - { - "epoch": 1.4452054794520548, - "grad_norm": 48.263450622558594, - "learning_rate": 8.321917808219178e-06, - "loss": 0.6268, - "step": 31650 + "loss": 0.6154, + "step": 15800 }, { "epoch": 1.4474885844748857, - "grad_norm": 3.8568694591522217, + "grad_norm": 24.845901489257812, "learning_rate": 8.287671232876712e-06, - "loss": 0.5454, - "step": 31700 - }, - { - "epoch": 1.4497716894977168, - "grad_norm": 13.764704704284668, - "learning_rate": 8.253424657534247e-06, - "loss": 0.6823, - "step": 31750 + "loss": 0.6228, + "step": 15850 }, { "epoch": 1.452054794520548, - "grad_norm": 13.48620319366455, + "grad_norm": 10.101279258728027, "learning_rate": 8.21917808219178e-06, - "loss": 0.7103, - "step": 31800 - }, - { - "epoch": 1.454337899543379, - "grad_norm": 17.291501998901367, - "learning_rate": 8.184931506849315e-06, - "loss": 0.7011, - "step": 31850 + "loss": 0.7418, + "step": 15900 }, { "epoch": 1.45662100456621, - "grad_norm": 1.461418867111206, + "grad_norm": 71.19440460205078, "learning_rate": 8.15068493150685e-06, - "loss": 0.6667, - "step": 31900 - }, - { - "epoch": 1.4589041095890412, - "grad_norm": 16.34942626953125, - "learning_rate": 8.116438356164384e-06, - "loss": 0.7885, - "step": 31950 + "loss": 0.7128, + "step": 15950 }, { "epoch": 1.461187214611872, - "grad_norm": 14.74634075164795, + "grad_norm": 6.2137041091918945, "learning_rate": 8.082191780821917e-06, - "loss": 0.644, - "step": 32000 - }, - { - "epoch": 1.4634703196347032, - "grad_norm": 6.794888973236084, - "learning_rate": 8.047945205479452e-06, - "loss": 0.6738, - "step": 32050 + "loss": 0.7121, + "step": 16000 }, { "epoch": 1.4657534246575343, - "grad_norm": 31.303226470947266, + "grad_norm": 20.703536987304688, "learning_rate": 8.013698630136987e-06, - "loss": 0.8235, - "step": 32100 - }, - { - "epoch": 1.4680365296803652, - "grad_norm": 42.993648529052734, - "learning_rate": 7.979452054794521e-06, - "loss": 0.4712, - "step": 32150 + "loss": 0.7115, + "step": 16050 }, { "epoch": 1.4703196347031964, - "grad_norm": 7.875132083892822, + "grad_norm": 26.53441619873047, "learning_rate": 7.945205479452055e-06, - "loss": 0.5951, - "step": 32200 - }, - { - "epoch": 1.4726027397260273, - "grad_norm": 9.124963760375977, - "learning_rate": 7.91095890410959e-06, - "loss": 0.5968, - "step": 32250 + "loss": 0.5534, + "step": 16100 }, { "epoch": 1.4748858447488584, - "grad_norm": 13.793811798095703, + "grad_norm": 20.233125686645508, "learning_rate": 7.876712328767124e-06, - "loss": 0.6484, - "step": 32300 - }, - { - "epoch": 1.4771689497716896, - "grad_norm": 2.1718921661376953, - "learning_rate": 7.842465753424659e-06, - "loss": 0.6279, - "step": 32350 + "loss": 0.6204, + "step": 16150 }, { "epoch": 1.4794520547945205, - "grad_norm": 60.621543884277344, + "grad_norm": 23.36627769470215, "learning_rate": 7.808219178082192e-06, - "loss": 0.6718, - "step": 32400 - }, - { - "epoch": 1.4817351598173516, - "grad_norm": 6.748918533325195, - "learning_rate": 7.773972602739727e-06, - "loss": 0.6333, - "step": 32450 + "loss": 0.6914, + "step": 16200 }, { "epoch": 1.4840182648401825, - "grad_norm": 10.061300277709961, + "grad_norm": 11.94163703918457, "learning_rate": 7.739726027397261e-06, - "loss": 0.5952, - "step": 32500 - }, - { - "epoch": 1.4863013698630136, - "grad_norm": 55.56220245361328, - "learning_rate": 7.705479452054796e-06, - "loss": 0.6297, - "step": 32550 + "loss": 0.6062, + "step": 16250 }, { "epoch": 1.4885844748858448, - "grad_norm": 31.07186508178711, + "grad_norm": 15.757901191711426, "learning_rate": 7.671232876712329e-06, - "loss": 0.5302, - "step": 32600 - }, - { - "epoch": 1.490867579908676, - "grad_norm": 30.925626754760742, - "learning_rate": 7.636986301369864e-06, - "loss": 0.6608, - "step": 32650 + "loss": 0.6142, + "step": 16300 }, { "epoch": 1.4931506849315068, - "grad_norm": 21.15188217163086, + "grad_norm": 12.007556915283203, "learning_rate": 7.602739726027398e-06, - "loss": 0.6838, - "step": 32700 - }, - { - "epoch": 1.495433789954338, - "grad_norm": 15.808161735534668, - "learning_rate": 7.568493150684932e-06, - "loss": 0.5405, - "step": 32750 + "loss": 0.6708, + "step": 16350 }, { "epoch": 1.4977168949771689, - "grad_norm": 11.866249084472656, + "grad_norm": 9.127739906311035, "learning_rate": 7.5342465753424655e-06, - "loss": 0.6924, - "step": 32800 - }, - { - "epoch": 1.5, - "grad_norm": 29.02684783935547, - "learning_rate": 7.5e-06, - "loss": 0.6354, - "step": 32850 + "loss": 0.6319, + "step": 16400 }, { "epoch": 1.5022831050228311, - "grad_norm": 20.26506996154785, + "grad_norm": 14.81264877319336, "learning_rate": 7.465753424657534e-06, - "loss": 0.6467, - "step": 32900 - }, - { - "epoch": 1.5045662100456623, - "grad_norm": 23.63490867614746, - "learning_rate": 7.431506849315069e-06, - "loss": 0.6682, - "step": 32950 + "loss": 0.6143, + "step": 16450 }, { "epoch": 1.5068493150684932, - "grad_norm": 16.075380325317383, + "grad_norm": 8.986160278320312, "learning_rate": 7.397260273972603e-06, - "loss": 0.6922, - "step": 33000 - }, - { - "epoch": 1.509132420091324, - "grad_norm": 14.159255027770996, - "learning_rate": 7.3630136986301374e-06, - "loss": 0.6063, - "step": 33050 + "loss": 0.7508, + "step": 16500 }, { "epoch": 1.5114155251141552, - "grad_norm": 22.143796920776367, + "grad_norm": 23.52107810974121, "learning_rate": 7.328767123287671e-06, - "loss": 0.7155, - "step": 33100 - }, - { - "epoch": 1.5136986301369864, - "grad_norm": 98.22097778320312, - "learning_rate": 7.294520547945206e-06, - "loss": 0.7567, - "step": 33150 + "loss": 0.6377, + "step": 16550 }, { "epoch": 1.5159817351598175, - "grad_norm": 0.7336256504058838, + "grad_norm": 4.14312219619751, "learning_rate": 7.260273972602739e-06, - "loss": 0.5414, - "step": 33200 - }, - { - "epoch": 1.5182648401826484, - "grad_norm": 0.3773713707923889, - "learning_rate": 7.226027397260274e-06, - "loss": 0.5702, - "step": 33250 + "loss": 0.595, + "step": 16600 }, { "epoch": 1.5205479452054793, - "grad_norm": 8.909625053405762, + "grad_norm": 32.40318298339844, "learning_rate": 7.191780821917808e-06, - "loss": 0.7228, - "step": 33300 - }, - { - "epoch": 1.5228310502283104, - "grad_norm": 21.098960876464844, - "learning_rate": 7.1575342465753425e-06, - "loss": 0.6001, - "step": 33350 + "loss": 0.6177, + "step": 16650 }, { "epoch": 1.5251141552511416, - "grad_norm": 15.906450271606445, + "grad_norm": 13.583507537841797, "learning_rate": 7.123287671232876e-06, - "loss": 0.6303, - "step": 33400 - }, - { - "epoch": 1.5273972602739727, - "grad_norm": 24.9348201751709, - "learning_rate": 7.089041095890411e-06, - "loss": 0.673, - "step": 33450 + "loss": 0.6513, + "step": 16700 }, { "epoch": 1.5296803652968036, - "grad_norm": 8.255683898925781, + "grad_norm": 27.02487564086914, "learning_rate": 7.054794520547945e-06, - "loss": 0.5831, - "step": 33500 - }, - { - "epoch": 1.5319634703196348, - "grad_norm": 73.46847534179688, - "learning_rate": 7.02054794520548e-06, - "loss": 0.7362, - "step": 33550 + "loss": 0.6136, + "step": 16750 }, { "epoch": 1.5342465753424657, - "grad_norm": 88.85016632080078, + "grad_norm": 46.82355880737305, "learning_rate": 6.986301369863014e-06, - "loss": 0.6682, - "step": 33600 - }, - { - "epoch": 1.5365296803652968, - "grad_norm": 52.53008270263672, - "learning_rate": 6.952054794520548e-06, - "loss": 0.5667, - "step": 33650 + "loss": 0.6545, + "step": 16800 }, { "epoch": 1.538812785388128, - "grad_norm": 18.00398826599121, + "grad_norm": 11.266030311584473, "learning_rate": 6.917808219178082e-06, - "loss": 0.6214, - "step": 33700 - }, - { - "epoch": 1.541095890410959, - "grad_norm": 27.124656677246094, - "learning_rate": 6.883561643835617e-06, - "loss": 0.6737, - "step": 33750 + "loss": 0.6438, + "step": 16850 }, { "epoch": 1.54337899543379, - "grad_norm": 39.45083999633789, + "grad_norm": 21.652450561523438, "learning_rate": 6.849315068493151e-06, - "loss": 0.703, - "step": 33800 - }, - { - "epoch": 1.545662100456621, - "grad_norm": 0.20495979487895966, - "learning_rate": 6.815068493150685e-06, - "loss": 0.6271, - "step": 33850 + "loss": 0.6617, + "step": 16900 }, { "epoch": 1.547945205479452, - "grad_norm": 0.8208453059196472, + "grad_norm": 18.254072189331055, "learning_rate": 6.780821917808219e-06, - "loss": 0.6804, - "step": 33900 - }, - { - "epoch": 1.5502283105022832, - "grad_norm": 12.416110038757324, - "learning_rate": 6.746575342465753e-06, - "loss": 0.7786, - "step": 33950 + "loss": 0.645, + "step": 16950 }, { "epoch": 1.5525114155251143, - "grad_norm": 21.69839096069336, + "grad_norm": 7.959854602813721, "learning_rate": 6.712328767123287e-06, - "loss": 0.7048, - "step": 34000 - }, - { - "epoch": 1.5547945205479452, - "grad_norm": 57.30881881713867, - "learning_rate": 6.678082191780822e-06, - "loss": 0.7398, - "step": 34050 + "loss": 0.7444, + "step": 17000 }, { "epoch": 1.5570776255707761, - "grad_norm": 22.58492088317871, + "grad_norm": 15.169598579406738, "learning_rate": 6.643835616438356e-06, - "loss": 0.7027, - "step": 34100 - }, - { - "epoch": 1.5593607305936072, - "grad_norm": 8.803092002868652, - "learning_rate": 6.609589041095891e-06, - "loss": 0.7296, - "step": 34150 + "loss": 0.6788, + "step": 17050 }, { "epoch": 1.5616438356164384, - "grad_norm": 18.931156158447266, + "grad_norm": 14.872618675231934, "learning_rate": 6.5753424657534245e-06, - "loss": 0.7622, - "step": 34200 - }, - { - "epoch": 1.5639269406392695, - "grad_norm": 8.858073234558105, - "learning_rate": 6.541095890410959e-06, - "loss": 0.615, - "step": 34250 + "loss": 0.7433, + "step": 17100 }, { "epoch": 1.5662100456621004, - "grad_norm": 6.284381866455078, + "grad_norm": 12.479876518249512, "learning_rate": 6.506849315068493e-06, - "loss": 0.7147, - "step": 34300 - }, - { - "epoch": 1.5684931506849316, - "grad_norm": 21.08570098876953, - "learning_rate": 6.472602739726028e-06, - "loss": 0.6153, - "step": 34350 + "loss": 0.676, + "step": 17150 }, { "epoch": 1.5707762557077625, - "grad_norm": 2.0850419998168945, + "grad_norm": 4.30610990524292, "learning_rate": 6.438356164383562e-06, - "loss": 0.588, - "step": 34400 - }, - { - "epoch": 1.5730593607305936, - "grad_norm": 0.25530076026916504, - "learning_rate": 6.4041095890410965e-06, - "loss": 0.7249, - "step": 34450 + "loss": 0.597, + "step": 17200 }, { "epoch": 1.5753424657534247, - "grad_norm": 31.284807205200195, + "grad_norm": 9.52687931060791, "learning_rate": 6.3698630136986296e-06, - "loss": 0.6106, - "step": 34500 - }, - { - "epoch": 1.5776255707762559, - "grad_norm": 19.524412155151367, - "learning_rate": 6.335616438356164e-06, - "loss": 0.7259, - "step": 34550 + "loss": 0.6084, + "step": 17250 }, { "epoch": 1.5799086757990868, - "grad_norm": 6.005446910858154, + "grad_norm": 6.067666053771973, "learning_rate": 6.301369863013698e-06, - "loss": 0.5231, - "step": 34600 - }, - { - "epoch": 1.5821917808219177, - "grad_norm": 17.577402114868164, - "learning_rate": 6.267123287671233e-06, - "loss": 0.847, - "step": 34650 + "loss": 0.61, + "step": 17300 }, { "epoch": 1.5844748858447488, - "grad_norm": 31.817855834960938, + "grad_norm": 15.737329483032227, "learning_rate": 6.232876712328767e-06, - "loss": 0.5509, - "step": 34700 - }, - { - "epoch": 1.58675799086758, - "grad_norm": 4.6908063888549805, - "learning_rate": 6.1986301369863016e-06, - "loss": 0.752, - "step": 34750 + "loss": 0.7011, + "step": 17350 }, { "epoch": 1.589041095890411, - "grad_norm": 2.8228561878204346, + "grad_norm": 4.7880730628967285, "learning_rate": 6.1643835616438354e-06, - "loss": 0.7156, - "step": 34800 - }, - { - "epoch": 1.591324200913242, - "grad_norm": 7.878891468048096, - "learning_rate": 6.13013698630137e-06, - "loss": 0.6926, - "step": 34850 + "loss": 0.694, + "step": 17400 }, { "epoch": 1.593607305936073, - "grad_norm": 30.530006408691406, + "grad_norm": 19.6992130279541, "learning_rate": 6.095890410958904e-06, - "loss": 0.7552, - "step": 34900 - }, - { - "epoch": 1.595890410958904, - "grad_norm": 29.396806716918945, - "learning_rate": 6.061643835616439e-06, - "loss": 0.7008, - "step": 34950 + "loss": 0.706, + "step": 17450 }, { "epoch": 1.5981735159817352, - "grad_norm": 10.500929832458496, + "grad_norm": 10.835814476013184, "learning_rate": 6.027397260273973e-06, - "loss": 0.4952, - "step": 35000 - }, - { - "epoch": 1.6004566210045663, - "grad_norm": 2.337519407272339, - "learning_rate": 5.9931506849315074e-06, - "loss": 0.5864, - "step": 35050 + "loss": 0.6298, + "step": 17500 }, { "epoch": 1.6027397260273972, - "grad_norm": 8.646376609802246, + "grad_norm": 13.219555854797363, "learning_rate": 5.958904109589041e-06, - "loss": 0.7454, - "step": 35100 - }, - { - "epoch": 1.6050228310502284, - "grad_norm": 17.153099060058594, - "learning_rate": 5.924657534246575e-06, - "loss": 0.6943, - "step": 35150 + "loss": 0.6496, + "step": 17550 }, { "epoch": 1.6073059360730593, - "grad_norm": 25.350088119506836, + "grad_norm": 22.237091064453125, "learning_rate": 5.890410958904109e-06, - "loss": 0.6803, - "step": 35200 - }, - { - "epoch": 1.6095890410958904, - "grad_norm": 4.12929105758667, - "learning_rate": 5.856164383561644e-06, - "loss": 0.6976, - "step": 35250 + "loss": 0.6585, + "step": 17600 }, { "epoch": 1.6118721461187215, - "grad_norm": 14.61955451965332, + "grad_norm": 12.173138618469238, "learning_rate": 5.821917808219178e-06, - "loss": 0.7487, - "step": 35300 - }, - { - "epoch": 1.6141552511415527, - "grad_norm": 6.208589553833008, - "learning_rate": 5.7876712328767125e-06, - "loss": 0.6569, - "step": 35350 + "loss": 0.712, + "step": 17650 }, { "epoch": 1.6164383561643836, - "grad_norm": 9.5521240234375, + "grad_norm": 6.5948896408081055, "learning_rate": 5.753424657534246e-06, - "loss": 0.6505, - "step": 35400 - }, - { - "epoch": 1.6187214611872145, - "grad_norm": 14.391396522521973, - "learning_rate": 5.719178082191781e-06, - "loss": 0.5558, - "step": 35450 + "loss": 0.6253, + "step": 17700 }, { "epoch": 1.6210045662100456, - "grad_norm": 1.2627131938934326, + "grad_norm": 5.447400093078613, "learning_rate": 5.684931506849315e-06, - "loss": 0.6405, - "step": 35500 - }, - { - "epoch": 1.6232876712328768, - "grad_norm": 83.7956314086914, - "learning_rate": 5.65068493150685e-06, - "loss": 0.5483, - "step": 35550 + "loss": 0.5762, + "step": 17750 }, { "epoch": 1.625570776255708, - "grad_norm": 15.18497085571289, + "grad_norm": 12.413744926452637, "learning_rate": 5.616438356164384e-06, - "loss": 0.6692, - "step": 35600 - }, - { - "epoch": 1.6278538812785388, - "grad_norm": 33.16044998168945, - "learning_rate": 5.582191780821918e-06, - "loss": 0.6939, - "step": 35650 + "loss": 0.6272, + "step": 17800 }, { "epoch": 1.6301369863013697, - "grad_norm": 12.063103675842285, + "grad_norm": 48.877052307128906, "learning_rate": 5.547945205479452e-06, - "loss": 0.5414, - "step": 35700 - }, - { - "epoch": 1.6324200913242009, - "grad_norm": 26.803749084472656, - "learning_rate": 5.513698630136986e-06, - "loss": 0.5353, - "step": 35750 + "loss": 0.589, + "step": 17850 }, { "epoch": 1.634703196347032, - "grad_norm": 6.5856523513793945, + "grad_norm": 21.571834564208984, "learning_rate": 5.47945205479452e-06, - "loss": 0.7678, - "step": 35800 - }, - { - "epoch": 1.6369863013698631, - "grad_norm": 14.661989212036133, - "learning_rate": 5.445205479452055e-06, - "loss": 0.67, - "step": 35850 + "loss": 0.6592, + "step": 17900 }, { "epoch": 1.639269406392694, - "grad_norm": 2.4577255249023438, + "grad_norm": 7.417336940765381, "learning_rate": 5.410958904109589e-06, - "loss": 0.4579, - "step": 35900 - }, - { - "epoch": 1.6415525114155252, - "grad_norm": 17.849546432495117, - "learning_rate": 5.376712328767123e-06, - "loss": 0.7221, - "step": 35950 + "loss": 0.5248, + "step": 17950 }, { "epoch": 1.643835616438356, - "grad_norm": 6.453017234802246, + "grad_norm": 48.4299430847168, "learning_rate": 5.342465753424657e-06, - "loss": 0.6889, - "step": 36000 - }, - { - "epoch": 1.6461187214611872, - "grad_norm": 15.138044357299805, - "learning_rate": 5.308219178082192e-06, - "loss": 0.6234, - "step": 36050 + "loss": 0.6622, + "step": 18000 }, { "epoch": 1.6484018264840183, - "grad_norm": 11.393730163574219, + "grad_norm": 17.760732650756836, "learning_rate": 5.273972602739726e-06, - "loss": 0.6353, - "step": 36100 - }, - { - "epoch": 1.6506849315068495, - "grad_norm": 2.726991891860962, - "learning_rate": 5.239726027397261e-06, - "loss": 0.5305, - "step": 36150 + "loss": 0.626, + "step": 18050 }, { "epoch": 1.6529680365296804, - "grad_norm": 14.54566478729248, + "grad_norm": 14.000800132751465, "learning_rate": 5.2054794520547945e-06, - "loss": 0.8252, - "step": 36200 - }, - { - "epoch": 1.6552511415525113, - "grad_norm": 0.952422022819519, - "learning_rate": 5.171232876712329e-06, - "loss": 0.6773, - "step": 36250 + "loss": 0.6084, + "step": 18100 }, { "epoch": 1.6575342465753424, - "grad_norm": 21.44168472290039, + "grad_norm": 8.742088317871094, "learning_rate": 5.136986301369863e-06, - "loss": 0.6106, - "step": 36300 - }, - { - "epoch": 1.6598173515981736, - "grad_norm": 11.663095474243164, - "learning_rate": 5.102739726027398e-06, - "loss": 0.6514, - "step": 36350 + "loss": 0.6029, + "step": 18150 }, { "epoch": 1.6621004566210047, - "grad_norm": 23.589557647705078, + "grad_norm": 26.16733169555664, "learning_rate": 5.068493150684931e-06, - "loss": 0.708, - "step": 36400 - }, - { - "epoch": 1.6643835616438356, - "grad_norm": 10.283199310302734, - "learning_rate": 5.034246575342466e-06, - "loss": 0.5563, - "step": 36450 + "loss": 0.6828, + "step": 18200 }, { "epoch": 1.6666666666666665, - "grad_norm": 44.115047454833984, + "grad_norm": 29.90041160583496, "learning_rate": 4.9999999999999996e-06, - "loss": 0.5912, - "step": 36500 - }, - { - "epoch": 1.6689497716894977, - "grad_norm": 71.8247299194336, - "learning_rate": 4.965753424657534e-06, - "loss": 0.7993, - "step": 36550 + "loss": 0.5702, + "step": 18250 }, { "epoch": 1.6712328767123288, - "grad_norm": 11.808229446411133, + "grad_norm": 15.568696022033691, "learning_rate": 4.931506849315068e-06, - "loss": 0.5851, - "step": 36600 - }, - { - "epoch": 1.67351598173516, - "grad_norm": 74.73955535888672, - "learning_rate": 4.897260273972603e-06, - "loss": 0.7004, - "step": 36650 + "loss": 0.6376, + "step": 18300 }, { "epoch": 1.6757990867579908, - "grad_norm": 36.06229019165039, + "grad_norm": 15.59715747833252, "learning_rate": 4.863013698630137e-06, - "loss": 0.7382, - "step": 36700 - }, - { - "epoch": 1.678082191780822, - "grad_norm": 53.29566955566406, - "learning_rate": 4.8287671232876716e-06, - "loss": 0.5035, - "step": 36750 + "loss": 0.6776, + "step": 18350 }, { "epoch": 1.6803652968036529, - "grad_norm": 21.9272403717041, + "grad_norm": 11.07044506072998, "learning_rate": 4.7945205479452054e-06, - "loss": 0.5309, - "step": 36800 - }, - { - "epoch": 1.682648401826484, - "grad_norm": 39.56712341308594, - "learning_rate": 4.76027397260274e-06, - "loss": 0.4754, - "step": 36850 + "loss": 0.529, + "step": 18400 }, { "epoch": 1.6849315068493151, - "grad_norm": 2.814680576324463, + "grad_norm": 5.349613666534424, "learning_rate": 4.726027397260274e-06, - "loss": 0.7033, - "step": 36900 - }, - { - "epoch": 1.6872146118721463, - "grad_norm": 95.83110809326172, - "learning_rate": 4.691780821917809e-06, - "loss": 0.6999, - "step": 36950 + "loss": 0.692, + "step": 18450 }, { "epoch": 1.6894977168949772, - "grad_norm": 27.638185501098633, + "grad_norm": 18.147794723510742, "learning_rate": 4.657534246575343e-06, - "loss": 0.6522, - "step": 37000 - }, - { - "epoch": 1.691780821917808, - "grad_norm": 10.899153709411621, - "learning_rate": 4.623287671232877e-06, - "loss": 0.6386, - "step": 37050 + "loss": 0.7057, + "step": 18500 }, { "epoch": 1.6940639269406392, - "grad_norm": 21.410276412963867, + "grad_norm": 13.486499786376953, "learning_rate": 4.5890410958904105e-06, - "loss": 0.695, - "step": 37100 - }, - { - "epoch": 1.6963470319634704, - "grad_norm": 15.208582878112793, - "learning_rate": 4.554794520547945e-06, - "loss": 0.7636, - "step": 37150 + "loss": 0.6415, + "step": 18550 }, { "epoch": 1.6986301369863015, - "grad_norm": 19.083850860595703, + "grad_norm": 10.16304874420166, "learning_rate": 4.520547945205479e-06, - "loss": 0.6331, - "step": 37200 - }, - { - "epoch": 1.7009132420091324, - "grad_norm": 4.408557415008545, - "learning_rate": 4.486301369863014e-06, - "loss": 0.6886, - "step": 37250 + "loss": 0.6604, + "step": 18600 }, { "epoch": 1.7031963470319633, - "grad_norm": 10.206310272216797, + "grad_norm": 29.04235076904297, "learning_rate": 4.452054794520548e-06, - "loss": 0.639, - "step": 37300 - }, - { - "epoch": 1.7054794520547945, - "grad_norm": 18.985891342163086, - "learning_rate": 4.4178082191780825e-06, - "loss": 0.6884, - "step": 37350 + "loss": 0.6773, + "step": 18650 }, { "epoch": 1.7077625570776256, - "grad_norm": 16.533288955688477, + "grad_norm": 9.932119369506836, "learning_rate": 4.383561643835616e-06, - "loss": 0.6446, - "step": 37400 - }, - { - "epoch": 1.7100456621004567, - "grad_norm": 25.728469848632812, - "learning_rate": 4.349315068493151e-06, - "loss": 0.4863, - "step": 37450 + "loss": 0.617, + "step": 18700 }, { "epoch": 1.7123287671232876, - "grad_norm": 88.2020492553711, + "grad_norm": 70.96830749511719, "learning_rate": 4.315068493150685e-06, - "loss": 0.6831, - "step": 37500 - }, - { - "epoch": 1.7146118721461188, - "grad_norm": 44.737815856933594, - "learning_rate": 4.28082191780822e-06, - "loss": 0.4687, - "step": 37550 + "loss": 0.6157, + "step": 18750 }, { "epoch": 1.7168949771689497, - "grad_norm": 1.9043503999710083, + "grad_norm": 15.635278701782227, "learning_rate": 4.246575342465754e-06, - "loss": 0.6608, - "step": 37600 - }, - { - "epoch": 1.7191780821917808, - "grad_norm": 11.180625915527344, - "learning_rate": 4.212328767123288e-06, - "loss": 0.7672, - "step": 37650 + "loss": 0.6127, + "step": 18800 }, { "epoch": 1.721461187214612, - "grad_norm": 3.448392391204834, + "grad_norm": 2.3553667068481445, "learning_rate": 4.178082191780821e-06, - "loss": 0.6176, - "step": 37700 - }, - { - "epoch": 1.723744292237443, - "grad_norm": 10.672887802124023, - "learning_rate": 4.143835616438356e-06, - "loss": 0.5263, - "step": 37750 + "loss": 0.6847, + "step": 18850 }, { "epoch": 1.726027397260274, - "grad_norm": 15.69261360168457, + "grad_norm": 8.880425453186035, "learning_rate": 4.10958904109589e-06, - "loss": 0.4637, - "step": 37800 - }, - { - "epoch": 1.728310502283105, - "grad_norm": 19.786346435546875, - "learning_rate": 4.075342465753425e-06, - "loss": 0.4554, - "step": 37850 + "loss": 0.5043, + "step": 18900 }, { "epoch": 1.730593607305936, - "grad_norm": 31.991483688354492, + "grad_norm": 22.30252456665039, "learning_rate": 4.041095890410959e-06, - "loss": 0.561, - "step": 37900 - }, - { - "epoch": 1.7328767123287672, - "grad_norm": 21.96062469482422, - "learning_rate": 4.006849315068493e-06, - "loss": 0.5969, - "step": 37950 + "loss": 0.5546, + "step": 18950 }, { "epoch": 1.7351598173515983, - "grad_norm": 10.800865173339844, + "grad_norm": 7.0828142166137695, "learning_rate": 3.972602739726027e-06, - "loss": 0.6058, - "step": 38000 - }, - { - "epoch": 1.7374429223744292, - "grad_norm": 10.979826927185059, - "learning_rate": 3.938356164383562e-06, - "loss": 0.6034, - "step": 38050 + "loss": 0.5997, + "step": 19000 }, { "epoch": 1.7397260273972601, - "grad_norm": 41.3328742980957, + "grad_norm": 25.635570526123047, "learning_rate": 3.904109589041096e-06, - "loss": 0.5687, - "step": 38100 - }, - { - "epoch": 1.7420091324200913, - "grad_norm": 38.379608154296875, - "learning_rate": 3.869863013698631e-06, - "loss": 0.6931, - "step": 38150 + "loss": 0.6091, + "step": 19050 }, { "epoch": 1.7442922374429224, - "grad_norm": 3.292733907699585, + "grad_norm": 9.004444122314453, "learning_rate": 3.8356164383561645e-06, - "loss": 0.4162, - "step": 38200 - }, - { - "epoch": 1.7465753424657535, - "grad_norm": 18.47883415222168, - "learning_rate": 3.801369863013699e-06, - "loss": 0.6574, - "step": 38250 + "loss": 0.6257, + "step": 19100 }, { "epoch": 1.7488584474885844, - "grad_norm": 10.917158126831055, + "grad_norm": 5.485713958740234, "learning_rate": 3.7671232876712327e-06, - "loss": 0.6617, - "step": 38300 - }, - { - "epoch": 1.7511415525114156, - "grad_norm": 13.783547401428223, - "learning_rate": 3.732876712328767e-06, - "loss": 0.7701, - "step": 38350 + "loss": 0.6728, + "step": 19150 }, { "epoch": 1.7534246575342465, - "grad_norm": 21.937267303466797, + "grad_norm": 35.63444519042969, "learning_rate": 3.6986301369863014e-06, - "loss": 0.7627, - "step": 38400 - }, - { - "epoch": 1.7557077625570776, - "grad_norm": 15.421838760375977, - "learning_rate": 3.6643835616438357e-06, - "loss": 0.6636, - "step": 38450 + "loss": 0.7696, + "step": 19200 }, { "epoch": 1.7579908675799087, - "grad_norm": 14.788371086120605, + "grad_norm": 29.199115753173828, "learning_rate": 3.6301369863013696e-06, - "loss": 0.5593, - "step": 38500 - }, - { - "epoch": 1.7602739726027399, - "grad_norm": 5.76630163192749, - "learning_rate": 3.595890410958904e-06, - "loss": 0.5112, - "step": 38550 + "loss": 0.59, + "step": 19250 }, { "epoch": 1.7625570776255708, - "grad_norm": 23.72429656982422, + "grad_norm": 18.041336059570312, "learning_rate": 3.561643835616438e-06, - "loss": 0.5729, - "step": 38600 - }, - { - "epoch": 1.7648401826484017, - "grad_norm": 18.512802124023438, - "learning_rate": 3.5273972602739725e-06, - "loss": 0.6827, - "step": 38650 + "loss": 0.5744, + "step": 19300 }, { "epoch": 1.7671232876712328, - "grad_norm": 4.7244720458984375, + "grad_norm": 16.391035079956055, "learning_rate": 3.493150684931507e-06, - "loss": 0.6428, - "step": 38700 - }, - { - "epoch": 1.769406392694064, - "grad_norm": 6.208735466003418, - "learning_rate": 3.458904109589041e-06, - "loss": 0.5986, - "step": 38750 + "loss": 0.6831, + "step": 19350 }, { "epoch": 1.771689497716895, - "grad_norm": 2.6915433406829834, + "grad_norm": 9.0728759765625, "learning_rate": 3.4246575342465754e-06, - "loss": 0.5905, - "step": 38800 - }, - { - "epoch": 1.773972602739726, - "grad_norm": 4.2726969718933105, - "learning_rate": 3.3904109589041093e-06, - "loss": 0.564, - "step": 38850 + "loss": 0.5779, + "step": 19400 }, { "epoch": 1.776255707762557, - "grad_norm": 26.4520263671875, + "grad_norm": 18.102890014648438, "learning_rate": 3.3561643835616436e-06, - "loss": 0.5738, - "step": 38900 - }, - { - "epoch": 1.778538812785388, - "grad_norm": 43.63593673706055, - "learning_rate": 3.321917808219178e-06, - "loss": 0.621, - "step": 38950 + "loss": 0.5507, + "step": 19450 }, { "epoch": 1.7808219178082192, - "grad_norm": 4.847127437591553, + "grad_norm": 17.248735427856445, "learning_rate": 3.2876712328767123e-06, - "loss": 0.8252, - "step": 39000 - }, - { - "epoch": 1.7831050228310503, - "grad_norm": 7.686138153076172, - "learning_rate": 3.2534246575342466e-06, - "loss": 0.7211, - "step": 39050 + "loss": 0.7226, + "step": 19500 }, { "epoch": 1.7853881278538812, - "grad_norm": 42.92139434814453, + "grad_norm": 27.942777633666992, "learning_rate": 3.219178082191781e-06, - "loss": 0.7652, - "step": 39100 - }, - { - "epoch": 1.7876712328767124, - "grad_norm": 57.05276107788086, - "learning_rate": 3.1849315068493148e-06, - "loss": 0.5234, - "step": 39150 + "loss": 0.7178, + "step": 19550 }, { "epoch": 1.7899543378995433, - "grad_norm": 18.587209701538086, + "grad_norm": 5.3349809646606445, "learning_rate": 3.150684931506849e-06, - "loss": 0.457, - "step": 39200 - }, - { - "epoch": 1.7922374429223744, - "grad_norm": 7.19858455657959, - "learning_rate": 3.1164383561643834e-06, - "loss": 0.8795, - "step": 39250 + "loss": 0.5566, + "step": 19600 }, { "epoch": 1.7945205479452055, - "grad_norm": 10.892264366149902, + "grad_norm": 30.73387908935547, "learning_rate": 3.0821917808219177e-06, - "loss": 0.7042, - "step": 39300 - }, - { - "epoch": 1.7968036529680367, - "grad_norm": 28.82424545288086, - "learning_rate": 3.047945205479452e-06, - "loss": 0.6396, - "step": 39350 + "loss": 0.7677, + "step": 19650 }, { "epoch": 1.7990867579908676, - "grad_norm": 7.087406158447266, + "grad_norm": 21.230749130249023, "learning_rate": 3.0136986301369864e-06, - "loss": 0.6665, - "step": 39400 - }, - { - "epoch": 1.8013698630136985, - "grad_norm": 22.56847381591797, - "learning_rate": 2.9794520547945207e-06, - "loss": 0.7265, - "step": 39450 + "loss": 0.61, + "step": 19700 }, { "epoch": 1.8036529680365296, - "grad_norm": 18.845949172973633, + "grad_norm": 10.363127708435059, "learning_rate": 2.9452054794520546e-06, - "loss": 0.6475, - "step": 39500 - }, - { - "epoch": 1.8059360730593608, - "grad_norm": 26.794076919555664, - "learning_rate": 2.910958904109589e-06, - "loss": 0.7632, - "step": 39550 + "loss": 0.6385, + "step": 19750 }, { "epoch": 1.808219178082192, - "grad_norm": 0.44524723291397095, + "grad_norm": 1.5289000272750854, "learning_rate": 2.876712328767123e-06, - "loss": 0.4843, - "step": 39600 - }, - { - "epoch": 1.8105022831050228, - "grad_norm": 45.64598083496094, - "learning_rate": 2.8424657534246575e-06, - "loss": 0.5918, - "step": 39650 + "loss": 0.6747, + "step": 19800 }, { "epoch": 1.8127853881278537, - "grad_norm": 75.41986846923828, + "grad_norm": 18.69597625732422, "learning_rate": 2.808219178082192e-06, - "loss": 0.6352, - "step": 39700 - }, - { - "epoch": 1.8150684931506849, - "grad_norm": 10.345170974731445, - "learning_rate": 2.773972602739726e-06, - "loss": 0.6204, - "step": 39750 + "loss": 0.5934, + "step": 19850 }, { "epoch": 1.817351598173516, - "grad_norm": 11.58834171295166, + "grad_norm": 10.11032772064209, "learning_rate": 2.73972602739726e-06, - "loss": 0.5325, - "step": 39800 - }, - { - "epoch": 1.8196347031963471, - "grad_norm": 14.26885986328125, - "learning_rate": 2.7054794520547943e-06, - "loss": 0.5008, - "step": 39850 + "loss": 0.5918, + "step": 19900 }, { "epoch": 1.821917808219178, - "grad_norm": 4.937170505523682, + "grad_norm": 30.333818435668945, "learning_rate": 2.6712328767123286e-06, - "loss": 0.7317, - "step": 39900 - }, - { - "epoch": 1.8242009132420092, - "grad_norm": 27.849742889404297, - "learning_rate": 2.636986301369863e-06, - "loss": 0.6825, - "step": 39950 + "loss": 0.6381, + "step": 19950 }, { "epoch": 1.82648401826484, - "grad_norm": 38.649810791015625, + "grad_norm": 15.229024887084961, "learning_rate": 2.6027397260273973e-06, - "loss": 0.4422, - "step": 40000 - }, - { - "epoch": 1.8287671232876712, - "grad_norm": 52.95954895019531, - "learning_rate": 2.5684931506849316e-06, - "loss": 0.747, - "step": 40050 + "loss": 0.6151, + "step": 20000 }, { "epoch": 1.8310502283105023, - "grad_norm": 10.486088752746582, + "grad_norm": 28.18828773498535, "learning_rate": 2.5342465753424655e-06, - "loss": 0.6757, - "step": 40100 - }, - { - "epoch": 1.8333333333333335, - "grad_norm": 0.9142507314682007, - "learning_rate": 2.4999999999999998e-06, - "loss": 0.712, - "step": 40150 + "loss": 0.6968, + "step": 20050 }, { "epoch": 1.8356164383561644, - "grad_norm": 51.62909698486328, + "grad_norm": 48.18195724487305, "learning_rate": 2.465753424657534e-06, - "loss": 0.6107, - "step": 40200 - }, - { - "epoch": 1.8378995433789953, - "grad_norm": 31.60240936279297, - "learning_rate": 2.4315068493150684e-06, - "loss": 0.601, - "step": 40250 + "loss": 0.6256, + "step": 20100 }, { "epoch": 1.8401826484018264, - "grad_norm": 55.16636657714844, + "grad_norm": 16.33424949645996, "learning_rate": 2.3972602739726027e-06, - "loss": 0.6121, - "step": 40300 - }, - { - "epoch": 1.8424657534246576, - "grad_norm": 34.450416564941406, - "learning_rate": 2.363013698630137e-06, - "loss": 0.6331, - "step": 40350 + "loss": 0.6477, + "step": 20150 }, { "epoch": 1.8447488584474887, - "grad_norm": 27.42693328857422, + "grad_norm": 25.994909286499023, "learning_rate": 2.3287671232876713e-06, - "loss": 0.733, - "step": 40400 - }, - { - "epoch": 1.8470319634703196, - "grad_norm": 44.26624298095703, - "learning_rate": 2.2945205479452052e-06, - "loss": 0.7453, - "step": 40450 + "loss": 0.692, + "step": 20200 }, { "epoch": 1.8493150684931505, - "grad_norm": 16.07997703552246, + "grad_norm": 8.117030143737793, "learning_rate": 2.2602739726027396e-06, - "loss": 0.6628, - "step": 40500 - }, - { - "epoch": 1.8515981735159817, - "grad_norm": 2.816776752471924, - "learning_rate": 2.226027397260274e-06, - "loss": 0.4991, - "step": 40550 + "loss": 0.6933, + "step": 20250 }, { "epoch": 1.8538812785388128, - "grad_norm": 4.0401434898376465, + "grad_norm": 8.02834415435791, "learning_rate": 2.191780821917808e-06, - "loss": 0.6943, - "step": 40600 - }, - { - "epoch": 1.856164383561644, - "grad_norm": 5.525669097900391, - "learning_rate": 2.1575342465753425e-06, - "loss": 0.6887, - "step": 40650 + "loss": 0.6413, + "step": 20300 }, { "epoch": 1.8584474885844748, - "grad_norm": 85.989990234375, + "grad_norm": 22.62827491760254, "learning_rate": 2.123287671232877e-06, - "loss": 0.7425, - "step": 40700 - }, - { - "epoch": 1.860730593607306, - "grad_norm": 4.5465779304504395, - "learning_rate": 2.0890410958904107e-06, - "loss": 0.7462, - "step": 40750 + "loss": 0.679, + "step": 20350 }, { "epoch": 1.8630136986301369, - "grad_norm": 5.920977592468262, + "grad_norm": 9.562274932861328, "learning_rate": 2.054794520547945e-06, - "loss": 0.5546, - "step": 40800 - }, - { - "epoch": 1.865296803652968, - "grad_norm": 12.154388427734375, - "learning_rate": 2.0205479452054793e-06, - "loss": 0.6693, - "step": 40850 + "loss": 0.6615, + "step": 20400 }, { "epoch": 1.8675799086757991, - "grad_norm": 16.800073623657227, + "grad_norm": 12.407808303833008, "learning_rate": 1.9863013698630136e-06, - "loss": 0.7908, - "step": 40900 - }, - { - "epoch": 1.8698630136986303, - "grad_norm": 43.42325973510742, - "learning_rate": 1.952054794520548e-06, - "loss": 0.5155, - "step": 40950 + "loss": 0.7476, + "step": 20450 }, { "epoch": 1.8721461187214612, - "grad_norm": 29.5067138671875, + "grad_norm": 41.344093322753906, "learning_rate": 1.9178082191780823e-06, - "loss": 0.7588, - "step": 41000 - }, - { - "epoch": 1.874429223744292, - "grad_norm": 28.01750946044922, - "learning_rate": 1.8835616438356164e-06, - "loss": 0.6622, - "step": 41050 + "loss": 0.5827, + "step": 20500 }, { "epoch": 1.8767123287671232, - "grad_norm": 16.869781494140625, + "grad_norm": 10.044130325317383, "learning_rate": 1.8493150684931507e-06, - "loss": 0.6737, - "step": 41100 - }, - { - "epoch": 1.8789954337899544, - "grad_norm": 8.377634048461914, - "learning_rate": 1.8150684931506848e-06, - "loss": 0.6281, - "step": 41150 + "loss": 0.6566, + "step": 20550 }, { "epoch": 1.8812785388127855, - "grad_norm": 16.61414337158203, + "grad_norm": 9.382560729980469, "learning_rate": 1.780821917808219e-06, - "loss": 0.6251, - "step": 41200 - }, - { - "epoch": 1.8835616438356164, - "grad_norm": 16.144508361816406, - "learning_rate": 1.7465753424657534e-06, - "loss": 0.6607, - "step": 41250 + "loss": 0.6259, + "step": 20600 }, { "epoch": 1.8858447488584473, - "grad_norm": 20.15201759338379, + "grad_norm": 9.731813430786133, "learning_rate": 1.7123287671232877e-06, - "loss": 0.5207, - "step": 41300 - }, - { - "epoch": 1.8881278538812785, - "grad_norm": 7.15456485748291, - "learning_rate": 1.6780821917808218e-06, - "loss": 0.5882, - "step": 41350 + "loss": 0.5924, + "step": 20650 }, { "epoch": 1.8904109589041096, - "grad_norm": 17.336624145507812, + "grad_norm": 13.417922973632812, "learning_rate": 1.6438356164383561e-06, - "loss": 0.556, - "step": 41400 - }, - { - "epoch": 1.8926940639269407, - "grad_norm": 20.451026916503906, - "learning_rate": 1.6095890410958904e-06, - "loss": 0.6011, - "step": 41450 + "loss": 0.6229, + "step": 20700 }, { "epoch": 1.8949771689497716, - "grad_norm": 33.44941329956055, + "grad_norm": 32.03701400756836, "learning_rate": 1.5753424657534245e-06, - "loss": 0.5166, - "step": 41500 - }, - { - "epoch": 1.8972602739726028, - "grad_norm": 24.176786422729492, - "learning_rate": 1.5410958904109589e-06, - "loss": 0.6837, - "step": 41550 + "loss": 0.5761, + "step": 20750 }, { "epoch": 1.8995433789954337, - "grad_norm": 17.142606735229492, + "grad_norm": 6.067290782928467, "learning_rate": 1.5068493150684932e-06, - "loss": 0.5813, - "step": 41600 - }, - { - "epoch": 1.9018264840182648, - "grad_norm": 34.20349884033203, - "learning_rate": 1.4726027397260273e-06, - "loss": 0.786, - "step": 41650 + "loss": 0.6463, + "step": 20800 }, { "epoch": 1.904109589041096, - "grad_norm": 1.5305472612380981, + "grad_norm": 19.67026710510254, "learning_rate": 1.4383561643835616e-06, - "loss": 0.5928, - "step": 41700 - }, - { - "epoch": 1.906392694063927, - "grad_norm": 1.1295257806777954, - "learning_rate": 1.404109589041096e-06, - "loss": 0.7055, - "step": 41750 + "loss": 0.6671, + "step": 20850 }, { "epoch": 1.908675799086758, - "grad_norm": 23.80326271057129, + "grad_norm": 50.498802185058594, "learning_rate": 1.36986301369863e-06, - "loss": 0.5861, - "step": 41800 - }, - { - "epoch": 1.910958904109589, - "grad_norm": 3.346529960632324, - "learning_rate": 1.3356164383561643e-06, - "loss": 0.6864, - "step": 41850 + "loss": 0.5811, + "step": 20900 }, { "epoch": 1.91324200913242, - "grad_norm": 34.07392883300781, + "grad_norm": 15.981374740600586, "learning_rate": 1.3013698630136986e-06, - "loss": 0.6754, - "step": 41900 - }, - { - "epoch": 1.9155251141552512, - "grad_norm": 42.87485122680664, - "learning_rate": 1.2671232876712327e-06, - "loss": 0.613, - "step": 41950 + "loss": 0.6823, + "step": 20950 }, { "epoch": 1.9178082191780823, - "grad_norm": 9.337113380432129, + "grad_norm": 19.175485610961914, "learning_rate": 1.232876712328767e-06, - "loss": 0.5302, - "step": 42000 - }, - { - "epoch": 1.9200913242009132, - "grad_norm": 19.920682907104492, - "learning_rate": 1.1986301369863014e-06, - "loss": 0.4405, - "step": 42050 + "loss": 0.6112, + "step": 21000 }, { "epoch": 1.9223744292237441, - "grad_norm": 24.49388313293457, + "grad_norm": 8.795181274414062, "learning_rate": 1.1643835616438357e-06, - "loss": 0.696, - "step": 42100 - }, - { - "epoch": 1.9246575342465753, - "grad_norm": 7.732158184051514, - "learning_rate": 1.1301369863013698e-06, - "loss": 0.758, - "step": 42150 + "loss": 0.5474, + "step": 21050 }, { "epoch": 1.9269406392694064, - "grad_norm": 6.940062046051025, + "grad_norm": 3.590404748916626, "learning_rate": 1.095890410958904e-06, - "loss": 0.6804, - "step": 42200 - }, - { - "epoch": 1.9292237442922375, - "grad_norm": 1.062066674232483, - "learning_rate": 1.0616438356164384e-06, - "loss": 0.6059, - "step": 42250 + "loss": 0.7026, + "step": 21100 }, { "epoch": 1.9315068493150684, - "grad_norm": 20.240144729614258, + "grad_norm": 21.619140625, "learning_rate": 1.0273972602739725e-06, - "loss": 0.7992, - "step": 42300 - }, - { - "epoch": 1.9337899543378996, - "grad_norm": 19.4890193939209, - "learning_rate": 9.931506849315068e-07, - "loss": 0.6943, - "step": 42350 + "loss": 0.701, + "step": 21150 }, { "epoch": 1.9360730593607305, - "grad_norm": 25.273487091064453, + "grad_norm": 22.84990692138672, "learning_rate": 9.589041095890411e-07, - "loss": 0.5925, - "step": 42400 - }, - { - "epoch": 1.9383561643835616, - "grad_norm": 21.916284561157227, - "learning_rate": 9.246575342465753e-07, - "loss": 0.6607, - "step": 42450 + "loss": 0.5942, + "step": 21200 }, { "epoch": 1.9406392694063928, - "grad_norm": 38.107566833496094, + "grad_norm": 28.548683166503906, "learning_rate": 8.904109589041095e-07, - "loss": 0.5807, - "step": 42500 - }, - { - "epoch": 1.9429223744292239, - "grad_norm": 8.979408264160156, - "learning_rate": 8.561643835616439e-07, - "loss": 0.6391, - "step": 42550 + "loss": 0.6343, + "step": 21250 }, { "epoch": 1.9452054794520548, - "grad_norm": 20.871389389038086, + "grad_norm": 20.216583251953125, "learning_rate": 8.219178082191781e-07, - "loss": 0.764, - "step": 42600 - }, - { - "epoch": 1.9474885844748857, - "grad_norm": 19.994056701660156, - "learning_rate": 7.876712328767123e-07, - "loss": 0.7959, - "step": 42650 + "loss": 0.6836, + "step": 21300 }, { "epoch": 1.9497716894977168, - "grad_norm": 25.47404670715332, + "grad_norm": 22.950048446655273, "learning_rate": 7.534246575342466e-07, - "loss": 0.5579, - "step": 42700 - }, - { - "epoch": 1.952054794520548, - "grad_norm": 10.598165512084961, - "learning_rate": 7.191780821917808e-07, - "loss": 0.8362, - "step": 42750 + "loss": 0.6815, + "step": 21350 }, { "epoch": 1.954337899543379, - "grad_norm": 1.3319069147109985, + "grad_norm": 20.81789207458496, "learning_rate": 6.84931506849315e-07, - "loss": 0.8132, - "step": 42800 - }, - { - "epoch": 1.95662100456621, - "grad_norm": 4.061497211456299, - "learning_rate": 6.506849315068493e-07, - "loss": 0.5364, - "step": 42850 + "loss": 0.7679, + "step": 21400 }, { "epoch": 1.958904109589041, - "grad_norm": 3.6196768283843994, + "grad_norm": 14.494473457336426, "learning_rate": 6.164383561643835e-07, - "loss": 0.57, - "step": 42900 - }, - { - "epoch": 1.961187214611872, - "grad_norm": 26.95933723449707, - "learning_rate": 5.821917808219178e-07, - "loss": 0.7364, - "step": 42950 + "loss": 0.5561, + "step": 21450 }, { "epoch": 1.9634703196347032, - "grad_norm": 25.18138313293457, + "grad_norm": 17.119592666625977, "learning_rate": 5.47945205479452e-07, - "loss": 0.5801, - "step": 43000 - }, - { - "epoch": 1.9657534246575343, - "grad_norm": 77.7193374633789, - "learning_rate": 5.136986301369863e-07, - "loss": 0.6965, - "step": 43050 + "loss": 0.6881, + "step": 21500 }, { "epoch": 1.9680365296803652, - "grad_norm": 10.553460121154785, + "grad_norm": 8.369721412658691, "learning_rate": 4.794520547945206e-07, - "loss": 0.6019, - "step": 43100 - }, - { - "epoch": 1.9703196347031964, - "grad_norm": 14.633034706115723, - "learning_rate": 4.4520547945205477e-07, - "loss": 0.5948, - "step": 43150 + "loss": 0.6692, + "step": 21550 }, { "epoch": 1.9726027397260273, - "grad_norm": 10.625903129577637, + "grad_norm": 11.264369010925293, "learning_rate": 4.1095890410958903e-07, - "loss": 0.5868, - "step": 43200 - }, - { - "epoch": 1.9748858447488584, - "grad_norm": 11.625406265258789, - "learning_rate": 3.767123287671233e-07, - "loss": 0.7335, - "step": 43250 + "loss": 0.6405, + "step": 21600 }, { "epoch": 1.9771689497716896, - "grad_norm": 11.875858306884766, + "grad_norm": 3.320608615875244, "learning_rate": 3.424657534246575e-07, - "loss": 0.6305, - "step": 43300 - }, - { - "epoch": 1.9794520547945207, - "grad_norm": 65.91475677490234, - "learning_rate": 3.0821917808219176e-07, - "loss": 0.5038, - "step": 43350 + "loss": 0.671, + "step": 21650 }, { "epoch": 1.9817351598173516, - "grad_norm": 7.164722442626953, + "grad_norm": 19.23822784423828, "learning_rate": 2.73972602739726e-07, - "loss": 0.6426, - "step": 43400 - }, - { - "epoch": 1.9840182648401825, - "grad_norm": 19.229677200317383, - "learning_rate": 2.397260273972603e-07, - "loss": 0.7671, - "step": 43450 + "loss": 0.5475, + "step": 21700 }, { "epoch": 1.9863013698630136, - "grad_norm": 9.592227935791016, + "grad_norm": 13.919721603393555, "learning_rate": 2.0547945205479452e-07, - "loss": 0.7229, - "step": 43500 - }, - { - "epoch": 1.9885844748858448, - "grad_norm": 23.81863021850586, - "learning_rate": 1.7123287671232875e-07, - "loss": 0.5238, - "step": 43550 + "loss": 0.7422, + "step": 21750 }, { "epoch": 1.990867579908676, - "grad_norm": 30.481460571289062, + "grad_norm": 14.141480445861816, "learning_rate": 1.36986301369863e-07, - "loss": 0.5882, - "step": 43600 - }, - { - "epoch": 1.9931506849315068, - "grad_norm": 3.194218397140503, - "learning_rate": 1.0273972602739726e-07, - "loss": 0.5594, - "step": 43650 + "loss": 0.5855, + "step": 21800 }, { "epoch": 1.9954337899543377, - "grad_norm": 32.75349807739258, + "grad_norm": 19.506189346313477, "learning_rate": 6.84931506849315e-08, - "loss": 0.7056, - "step": 43700 - }, - { - "epoch": 1.9977168949771689, - "grad_norm": 37.13774490356445, - "learning_rate": 3.424657534246575e-08, - "loss": 0.7017, - "step": 43750 + "loss": 0.7103, + "step": 21850 }, { "epoch": 2.0, - "grad_norm": 38.73429489135742, + "grad_norm": 16.25570297241211, "learning_rate": 0.0, - "loss": 0.6852, - "step": 43800 + "loss": 0.7072, + "step": 21900 }, { "epoch": 2.0, - "step": 43800, - "total_flos": 3.864501390676132e+17, - "train_loss": 0.05644623499482734, - "train_runtime": 4615.1991, - "train_samples_per_second": 37.961, - "train_steps_per_second": 9.49 + "step": 21900, + "total_flos": 3.8619551920019866e+17, + "train_loss": 0.9270246051109, + "train_runtime": 51747.0424, + "train_samples_per_second": 3.386, + "train_steps_per_second": 0.423 } ], "logging_steps": 50, - "max_steps": 43800, + "max_steps": 21900, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, @@ -6167,8 +3101,8 @@ "attributes": {} } }, - "total_flos": 3.864501390676132e+17, - "train_batch_size": 4, + "total_flos": 3.8619551920019866e+17, + "train_batch_size": 8, "trial_name": null, "trial_params": null }