{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 21900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0045662100456621, "grad_norm": 22.38092803955078, "learning_rate": 2.993150684931507e-05, "loss": 5.0082, "step": 50 }, { "epoch": 0.0091324200913242, "grad_norm": 19.525426864624023, "learning_rate": 2.9863013698630136e-05, "loss": 4.0853, "step": 100 }, { "epoch": 0.0136986301369863, "grad_norm": 29.21298599243164, "learning_rate": 2.9794520547945206e-05, "loss": 3.8879, "step": 150 }, { "epoch": 0.0182648401826484, "grad_norm": 29.60974884033203, "learning_rate": 2.9726027397260275e-05, "loss": 3.5946, "step": 200 }, { "epoch": 0.0228310502283105, "grad_norm": 20.75732421875, "learning_rate": 2.9657534246575345e-05, "loss": 3.1165, "step": 250 }, { "epoch": 0.0273972602739726, "grad_norm": 31.49699592590332, "learning_rate": 2.958904109589041e-05, "loss": 2.7346, "step": 300 }, { "epoch": 0.0319634703196347, "grad_norm": 43.215335845947266, "learning_rate": 2.952054794520548e-05, "loss": 2.3509, "step": 350 }, { "epoch": 0.0365296803652968, "grad_norm": 29.553388595581055, "learning_rate": 2.945205479452055e-05, "loss": 2.3374, "step": 400 }, { "epoch": 0.0410958904109589, "grad_norm": 30.272083282470703, "learning_rate": 2.938356164383562e-05, "loss": 2.0551, "step": 450 }, { "epoch": 0.045662100456621, "grad_norm": 23.157560348510742, "learning_rate": 2.9315068493150685e-05, "loss": 1.9659, "step": 500 }, { "epoch": 0.0502283105022831, "grad_norm": 39.72032165527344, "learning_rate": 2.9246575342465755e-05, "loss": 1.7552, "step": 550 }, { "epoch": 0.0547945205479452, "grad_norm": 24.441036224365234, "learning_rate": 2.9178082191780824e-05, "loss": 1.7992, "step": 600 }, { "epoch": 0.0593607305936073, "grad_norm": 32.07455825805664, "learning_rate": 2.910958904109589e-05, "loss": 1.6968, "step": 650 }, { "epoch": 0.0639269406392694, "grad_norm": 25.034423828125, "learning_rate": 2.9041095890410956e-05, "loss": 1.788, "step": 700 }, { "epoch": 0.0684931506849315, "grad_norm": 33.06129455566406, "learning_rate": 2.8972602739726026e-05, "loss": 1.8157, "step": 750 }, { "epoch": 0.0730593607305936, "grad_norm": 32.716251373291016, "learning_rate": 2.8904109589041095e-05, "loss": 1.7502, "step": 800 }, { "epoch": 0.0776255707762557, "grad_norm": 27.18515968322754, "learning_rate": 2.8835616438356165e-05, "loss": 1.5629, "step": 850 }, { "epoch": 0.0821917808219178, "grad_norm": 19.844303131103516, "learning_rate": 2.876712328767123e-05, "loss": 1.6421, "step": 900 }, { "epoch": 0.0867579908675799, "grad_norm": 160.48770141601562, "learning_rate": 2.86986301369863e-05, "loss": 1.6035, "step": 950 }, { "epoch": 0.091324200913242, "grad_norm": 35.13261032104492, "learning_rate": 2.863013698630137e-05, "loss": 1.4939, "step": 1000 }, { "epoch": 0.0958904109589041, "grad_norm": 24.28845977783203, "learning_rate": 2.856164383561644e-05, "loss": 1.5611, "step": 1050 }, { "epoch": 0.1004566210045662, "grad_norm": 30.07063865661621, "learning_rate": 2.8493150684931505e-05, "loss": 1.4715, "step": 1100 }, { "epoch": 0.1050228310502283, "grad_norm": 23.672300338745117, "learning_rate": 2.8424657534246575e-05, "loss": 1.534, "step": 1150 }, { "epoch": 0.1095890410958904, "grad_norm": 27.45283317565918, "learning_rate": 2.8356164383561644e-05, "loss": 1.5268, "step": 1200 }, { "epoch": 0.1141552511415525, "grad_norm": 26.83701515197754, "learning_rate": 2.8287671232876714e-05, "loss": 1.4089, "step": 1250 }, { "epoch": 0.1187214611872146, "grad_norm": 19.307844161987305, "learning_rate": 2.821917808219178e-05, "loss": 1.4929, "step": 1300 }, { "epoch": 0.1232876712328767, "grad_norm": 14.790838241577148, "learning_rate": 2.815068493150685e-05, "loss": 1.4687, "step": 1350 }, { "epoch": 0.1278538812785388, "grad_norm": 16.656993865966797, "learning_rate": 2.808219178082192e-05, "loss": 1.3456, "step": 1400 }, { "epoch": 0.1324200913242009, "grad_norm": 14.602429389953613, "learning_rate": 2.801369863013699e-05, "loss": 1.2725, "step": 1450 }, { "epoch": 0.136986301369863, "grad_norm": 23.853199005126953, "learning_rate": 2.7945205479452054e-05, "loss": 1.2684, "step": 1500 }, { "epoch": 0.1415525114155251, "grad_norm": 40.28663635253906, "learning_rate": 2.7876712328767124e-05, "loss": 1.2728, "step": 1550 }, { "epoch": 0.1461187214611872, "grad_norm": 11.038789749145508, "learning_rate": 2.7808219178082193e-05, "loss": 1.3578, "step": 1600 }, { "epoch": 0.1506849315068493, "grad_norm": 22.868593215942383, "learning_rate": 2.7739726027397263e-05, "loss": 1.3941, "step": 1650 }, { "epoch": 0.1552511415525114, "grad_norm": 23.89753532409668, "learning_rate": 2.767123287671233e-05, "loss": 1.3071, "step": 1700 }, { "epoch": 0.1598173515981735, "grad_norm": 40.02177429199219, "learning_rate": 2.76027397260274e-05, "loss": 1.3711, "step": 1750 }, { "epoch": 0.1643835616438356, "grad_norm": 40.53044128417969, "learning_rate": 2.7534246575342468e-05, "loss": 1.1694, "step": 1800 }, { "epoch": 0.1689497716894977, "grad_norm": 38.70663070678711, "learning_rate": 2.7465753424657537e-05, "loss": 1.2284, "step": 1850 }, { "epoch": 0.1735159817351598, "grad_norm": 114.71417236328125, "learning_rate": 2.7397260273972603e-05, "loss": 1.1634, "step": 1900 }, { "epoch": 0.1780821917808219, "grad_norm": 34.596923828125, "learning_rate": 2.732876712328767e-05, "loss": 1.2858, "step": 1950 }, { "epoch": 0.182648401826484, "grad_norm": 23.52661895751953, "learning_rate": 2.726027397260274e-05, "loss": 1.2273, "step": 2000 }, { "epoch": 0.1872146118721461, "grad_norm": 22.8360652923584, "learning_rate": 2.719178082191781e-05, "loss": 1.2622, "step": 2050 }, { "epoch": 0.1917808219178082, "grad_norm": 17.08322525024414, "learning_rate": 2.7123287671232875e-05, "loss": 1.3012, "step": 2100 }, { "epoch": 0.1963470319634703, "grad_norm": 12.923150062561035, "learning_rate": 2.7054794520547944e-05, "loss": 1.3847, "step": 2150 }, { "epoch": 0.2009132420091324, "grad_norm": 73.03246307373047, "learning_rate": 2.6986301369863014e-05, "loss": 1.3173, "step": 2200 }, { "epoch": 0.2054794520547945, "grad_norm": 16.656768798828125, "learning_rate": 2.6917808219178083e-05, "loss": 1.1779, "step": 2250 }, { "epoch": 0.2100456621004566, "grad_norm": 22.919973373413086, "learning_rate": 2.684931506849315e-05, "loss": 1.2497, "step": 2300 }, { "epoch": 0.2146118721461187, "grad_norm": 20.0987491607666, "learning_rate": 2.678082191780822e-05, "loss": 1.1787, "step": 2350 }, { "epoch": 0.2191780821917808, "grad_norm": 84.04356384277344, "learning_rate": 2.6712328767123288e-05, "loss": 1.1989, "step": 2400 }, { "epoch": 0.2237442922374429, "grad_norm": 12.585476875305176, "learning_rate": 2.6643835616438358e-05, "loss": 1.2324, "step": 2450 }, { "epoch": 0.228310502283105, "grad_norm": 18.206226348876953, "learning_rate": 2.6575342465753424e-05, "loss": 1.2541, "step": 2500 }, { "epoch": 0.2328767123287671, "grad_norm": 24.236295700073242, "learning_rate": 2.6506849315068493e-05, "loss": 1.1755, "step": 2550 }, { "epoch": 0.2374429223744292, "grad_norm": 21.91484260559082, "learning_rate": 2.6438356164383563e-05, "loss": 1.1436, "step": 2600 }, { "epoch": 0.2420091324200913, "grad_norm": 25.741945266723633, "learning_rate": 2.6369863013698632e-05, "loss": 1.146, "step": 2650 }, { "epoch": 0.2465753424657534, "grad_norm": 21.776227951049805, "learning_rate": 2.6301369863013698e-05, "loss": 1.1521, "step": 2700 }, { "epoch": 0.2511415525114155, "grad_norm": 30.281368255615234, "learning_rate": 2.6232876712328768e-05, "loss": 1.0747, "step": 2750 }, { "epoch": 0.2557077625570776, "grad_norm": 16.21628189086914, "learning_rate": 2.6164383561643837e-05, "loss": 1.2503, "step": 2800 }, { "epoch": 0.2602739726027397, "grad_norm": 17.022933959960938, "learning_rate": 2.6095890410958907e-05, "loss": 1.1719, "step": 2850 }, { "epoch": 0.2648401826484018, "grad_norm": 17.665477752685547, "learning_rate": 2.6027397260273973e-05, "loss": 1.091, "step": 2900 }, { "epoch": 0.2694063926940639, "grad_norm": 12.710230827331543, "learning_rate": 2.5958904109589042e-05, "loss": 1.1828, "step": 2950 }, { "epoch": 0.273972602739726, "grad_norm": 23.11914825439453, "learning_rate": 2.589041095890411e-05, "loss": 1.0025, "step": 3000 }, { "epoch": 0.2785388127853881, "grad_norm": 17.97935676574707, "learning_rate": 2.582191780821918e-05, "loss": 1.2145, "step": 3050 }, { "epoch": 0.2831050228310502, "grad_norm": 28.123554229736328, "learning_rate": 2.5753424657534247e-05, "loss": 1.2629, "step": 3100 }, { "epoch": 0.2876712328767123, "grad_norm": 17.924665451049805, "learning_rate": 2.5684931506849317e-05, "loss": 1.0439, "step": 3150 }, { "epoch": 0.2922374429223744, "grad_norm": 24.03557777404785, "learning_rate": 2.5616438356164386e-05, "loss": 1.127, "step": 3200 }, { "epoch": 0.2968036529680365, "grad_norm": 30.70620346069336, "learning_rate": 2.5547945205479452e-05, "loss": 1.1464, "step": 3250 }, { "epoch": 0.3013698630136986, "grad_norm": 21.80393409729004, "learning_rate": 2.5479452054794518e-05, "loss": 0.9603, "step": 3300 }, { "epoch": 0.3059360730593607, "grad_norm": 23.686079025268555, "learning_rate": 2.5410958904109588e-05, "loss": 1.0745, "step": 3350 }, { "epoch": 0.3105022831050228, "grad_norm": 10.258773803710938, "learning_rate": 2.5342465753424657e-05, "loss": 1.0776, "step": 3400 }, { "epoch": 0.3150684931506849, "grad_norm": 28.77837562561035, "learning_rate": 2.5273972602739727e-05, "loss": 1.1781, "step": 3450 }, { "epoch": 0.319634703196347, "grad_norm": 39.38608932495117, "learning_rate": 2.5205479452054793e-05, "loss": 0.9749, "step": 3500 }, { "epoch": 0.3242009132420091, "grad_norm": 29.596742630004883, "learning_rate": 2.5136986301369862e-05, "loss": 1.1099, "step": 3550 }, { "epoch": 0.3287671232876712, "grad_norm": 15.371848106384277, "learning_rate": 2.5068493150684932e-05, "loss": 1.1363, "step": 3600 }, { "epoch": 0.3333333333333333, "grad_norm": 27.008398056030273, "learning_rate": 2.5e-05, "loss": 1.2275, "step": 3650 }, { "epoch": 0.3378995433789954, "grad_norm": 54.85331726074219, "learning_rate": 2.4931506849315067e-05, "loss": 1.2297, "step": 3700 }, { "epoch": 0.3424657534246575, "grad_norm": 10.650010108947754, "learning_rate": 2.4863013698630137e-05, "loss": 1.1716, "step": 3750 }, { "epoch": 0.3470319634703196, "grad_norm": 21.537841796875, "learning_rate": 2.4794520547945206e-05, "loss": 1.2375, "step": 3800 }, { "epoch": 0.3515981735159817, "grad_norm": 31.040218353271484, "learning_rate": 2.4726027397260276e-05, "loss": 1.0511, "step": 3850 }, { "epoch": 0.3561643835616438, "grad_norm": 13.31843090057373, "learning_rate": 2.4657534246575342e-05, "loss": 1.1581, "step": 3900 }, { "epoch": 0.3607305936073059, "grad_norm": 22.53802490234375, "learning_rate": 2.458904109589041e-05, "loss": 1.0627, "step": 3950 }, { "epoch": 0.365296803652968, "grad_norm": 20.266468048095703, "learning_rate": 2.452054794520548e-05, "loss": 1.1103, "step": 4000 }, { "epoch": 0.3698630136986301, "grad_norm": 9.198668479919434, "learning_rate": 2.445205479452055e-05, "loss": 1.136, "step": 4050 }, { "epoch": 0.3744292237442922, "grad_norm": 13.770018577575684, "learning_rate": 2.4383561643835616e-05, "loss": 1.0984, "step": 4100 }, { "epoch": 0.3789954337899543, "grad_norm": 25.558441162109375, "learning_rate": 2.4315068493150686e-05, "loss": 1.062, "step": 4150 }, { "epoch": 0.3835616438356164, "grad_norm": 25.940107345581055, "learning_rate": 2.4246575342465755e-05, "loss": 1.2237, "step": 4200 }, { "epoch": 0.3881278538812785, "grad_norm": 11.530945777893066, "learning_rate": 2.4178082191780825e-05, "loss": 1.0717, "step": 4250 }, { "epoch": 0.3926940639269406, "grad_norm": 13.686588287353516, "learning_rate": 2.410958904109589e-05, "loss": 1.0011, "step": 4300 }, { "epoch": 0.3972602739726027, "grad_norm": 24.325557708740234, "learning_rate": 2.404109589041096e-05, "loss": 1.2033, "step": 4350 }, { "epoch": 0.4018264840182648, "grad_norm": 24.21858787536621, "learning_rate": 2.397260273972603e-05, "loss": 1.01, "step": 4400 }, { "epoch": 0.4063926940639269, "grad_norm": 14.875985145568848, "learning_rate": 2.39041095890411e-05, "loss": 1.0378, "step": 4450 }, { "epoch": 0.410958904109589, "grad_norm": 38.33986282348633, "learning_rate": 2.3835616438356165e-05, "loss": 1.0722, "step": 4500 }, { "epoch": 0.4155251141552511, "grad_norm": 10.655511856079102, "learning_rate": 2.376712328767123e-05, "loss": 1.1182, "step": 4550 }, { "epoch": 0.4200913242009132, "grad_norm": 26.336122512817383, "learning_rate": 2.36986301369863e-05, "loss": 1.0772, "step": 4600 }, { "epoch": 0.4246575342465753, "grad_norm": 26.254093170166016, "learning_rate": 2.363013698630137e-05, "loss": 0.9908, "step": 4650 }, { "epoch": 0.4292237442922374, "grad_norm": 7.081357002258301, "learning_rate": 2.3561643835616436e-05, "loss": 1.0079, "step": 4700 }, { "epoch": 0.4337899543378995, "grad_norm": 20.019088745117188, "learning_rate": 2.3493150684931506e-05, "loss": 1.1188, "step": 4750 }, { "epoch": 0.4383561643835616, "grad_norm": 30.188098907470703, "learning_rate": 2.3424657534246575e-05, "loss": 1.0616, "step": 4800 }, { "epoch": 0.4429223744292237, "grad_norm": 6.621674537658691, "learning_rate": 2.3356164383561645e-05, "loss": 1.0935, "step": 4850 }, { "epoch": 0.4474885844748858, "grad_norm": 21.145673751831055, "learning_rate": 2.328767123287671e-05, "loss": 1.0178, "step": 4900 }, { "epoch": 0.4520547945205479, "grad_norm": 26.22977066040039, "learning_rate": 2.321917808219178e-05, "loss": 1.1739, "step": 4950 }, { "epoch": 0.45662100456621, "grad_norm": 22.71933364868164, "learning_rate": 2.315068493150685e-05, "loss": 1.122, "step": 5000 }, { "epoch": 0.4611872146118721, "grad_norm": 16.020483016967773, "learning_rate": 2.308219178082192e-05, "loss": 1.0494, "step": 5050 }, { "epoch": 0.4657534246575342, "grad_norm": 10.217668533325195, "learning_rate": 2.3013698630136985e-05, "loss": 1.1577, "step": 5100 }, { "epoch": 0.4703196347031963, "grad_norm": 13.561128616333008, "learning_rate": 2.2945205479452055e-05, "loss": 1.1065, "step": 5150 }, { "epoch": 0.4748858447488584, "grad_norm": 24.07544708251953, "learning_rate": 2.2876712328767124e-05, "loss": 1.1721, "step": 5200 }, { "epoch": 0.4794520547945205, "grad_norm": 27.471532821655273, "learning_rate": 2.2808219178082194e-05, "loss": 1.0425, "step": 5250 }, { "epoch": 0.4840182648401826, "grad_norm": 39.17247772216797, "learning_rate": 2.273972602739726e-05, "loss": 1.0182, "step": 5300 }, { "epoch": 0.4885844748858447, "grad_norm": 11.253133773803711, "learning_rate": 2.267123287671233e-05, "loss": 1.0516, "step": 5350 }, { "epoch": 0.4931506849315068, "grad_norm": 19.816268920898438, "learning_rate": 2.26027397260274e-05, "loss": 0.9359, "step": 5400 }, { "epoch": 0.4977168949771689, "grad_norm": 21.89589500427246, "learning_rate": 2.253424657534247e-05, "loss": 1.0047, "step": 5450 }, { "epoch": 0.502283105022831, "grad_norm": 18.662717819213867, "learning_rate": 2.2465753424657534e-05, "loss": 1.1092, "step": 5500 }, { "epoch": 0.5068493150684932, "grad_norm": 13.65588092803955, "learning_rate": 2.2397260273972604e-05, "loss": 1.2777, "step": 5550 }, { "epoch": 0.5114155251141552, "grad_norm": 22.897354125976562, "learning_rate": 2.2328767123287673e-05, "loss": 1.0557, "step": 5600 }, { "epoch": 0.5159817351598174, "grad_norm": 22.30970573425293, "learning_rate": 2.2260273972602743e-05, "loss": 1.0701, "step": 5650 }, { "epoch": 0.5205479452054794, "grad_norm": 55.639892578125, "learning_rate": 2.219178082191781e-05, "loss": 1.1001, "step": 5700 }, { "epoch": 0.5251141552511416, "grad_norm": 11.327408790588379, "learning_rate": 2.212328767123288e-05, "loss": 1.1506, "step": 5750 }, { "epoch": 0.5296803652968036, "grad_norm": 31.288894653320312, "learning_rate": 2.2054794520547945e-05, "loss": 1.0774, "step": 5800 }, { "epoch": 0.5342465753424658, "grad_norm": 11.587857246398926, "learning_rate": 2.1986301369863014e-05, "loss": 1.0175, "step": 5850 }, { "epoch": 0.5388127853881278, "grad_norm": 27.174907684326172, "learning_rate": 2.191780821917808e-05, "loss": 1.1566, "step": 5900 }, { "epoch": 0.54337899543379, "grad_norm": 28.325786590576172, "learning_rate": 2.184931506849315e-05, "loss": 1.1089, "step": 5950 }, { "epoch": 0.547945205479452, "grad_norm": 21.70798683166504, "learning_rate": 2.178082191780822e-05, "loss": 1.0575, "step": 6000 }, { "epoch": 0.5525114155251142, "grad_norm": 18.79369354248047, "learning_rate": 2.171232876712329e-05, "loss": 1.1327, "step": 6050 }, { "epoch": 0.5570776255707762, "grad_norm": 31.291170120239258, "learning_rate": 2.1643835616438355e-05, "loss": 1.1412, "step": 6100 }, { "epoch": 0.5616438356164384, "grad_norm": 15.10459041595459, "learning_rate": 2.1575342465753424e-05, "loss": 1.1289, "step": 6150 }, { "epoch": 0.5662100456621004, "grad_norm": 23.595136642456055, "learning_rate": 2.1506849315068494e-05, "loss": 0.9419, "step": 6200 }, { "epoch": 0.5707762557077626, "grad_norm": 9.336952209472656, "learning_rate": 2.1438356164383563e-05, "loss": 0.9357, "step": 6250 }, { "epoch": 0.5753424657534246, "grad_norm": 34.959354400634766, "learning_rate": 2.136986301369863e-05, "loss": 1.1525, "step": 6300 }, { "epoch": 0.5799086757990868, "grad_norm": 25.974088668823242, "learning_rate": 2.13013698630137e-05, "loss": 1.0915, "step": 6350 }, { "epoch": 0.5844748858447488, "grad_norm": 26.807876586914062, "learning_rate": 2.1232876712328768e-05, "loss": 1.0783, "step": 6400 }, { "epoch": 0.589041095890411, "grad_norm": 11.836869239807129, "learning_rate": 2.1164383561643838e-05, "loss": 1.1106, "step": 6450 }, { "epoch": 0.593607305936073, "grad_norm": 20.5205078125, "learning_rate": 2.1095890410958904e-05, "loss": 1.0021, "step": 6500 }, { "epoch": 0.5981735159817352, "grad_norm": 63.918914794921875, "learning_rate": 2.1027397260273973e-05, "loss": 0.9942, "step": 6550 }, { "epoch": 0.6027397260273972, "grad_norm": 15.824874877929688, "learning_rate": 2.0958904109589043e-05, "loss": 1.0382, "step": 6600 }, { "epoch": 0.6073059360730594, "grad_norm": 23.539623260498047, "learning_rate": 2.0890410958904112e-05, "loss": 1.1323, "step": 6650 }, { "epoch": 0.6118721461187214, "grad_norm": 22.72905921936035, "learning_rate": 2.0821917808219178e-05, "loss": 1.096, "step": 6700 }, { "epoch": 0.6164383561643836, "grad_norm": 21.393850326538086, "learning_rate": 2.0753424657534248e-05, "loss": 1.0491, "step": 6750 }, { "epoch": 0.6210045662100456, "grad_norm": 10.425342559814453, "learning_rate": 2.0684931506849317e-05, "loss": 0.9894, "step": 6800 }, { "epoch": 0.6255707762557078, "grad_norm": 31.147842407226562, "learning_rate": 2.0616438356164387e-05, "loss": 1.0229, "step": 6850 }, { "epoch": 0.6301369863013698, "grad_norm": 13.265896797180176, "learning_rate": 2.0547945205479453e-05, "loss": 1.0372, "step": 6900 }, { "epoch": 0.634703196347032, "grad_norm": 19.299884796142578, "learning_rate": 2.0479452054794522e-05, "loss": 0.9861, "step": 6950 }, { "epoch": 0.639269406392694, "grad_norm": 15.215560913085938, "learning_rate": 2.041095890410959e-05, "loss": 1.0685, "step": 7000 }, { "epoch": 0.6438356164383562, "grad_norm": 12.113781929016113, "learning_rate": 2.034246575342466e-05, "loss": 1.0452, "step": 7050 }, { "epoch": 0.6484018264840182, "grad_norm": 53.0004768371582, "learning_rate": 2.0273972602739724e-05, "loss": 1.0177, "step": 7100 }, { "epoch": 0.6529680365296804, "grad_norm": 10.609742164611816, "learning_rate": 2.0205479452054793e-05, "loss": 0.9896, "step": 7150 }, { "epoch": 0.6575342465753424, "grad_norm": 13.45383071899414, "learning_rate": 2.0136986301369863e-05, "loss": 0.7549, "step": 7200 }, { "epoch": 0.6621004566210046, "grad_norm": 21.812686920166016, "learning_rate": 2.0068493150684932e-05, "loss": 1.0402, "step": 7250 }, { "epoch": 0.6666666666666666, "grad_norm": 7.622537612915039, "learning_rate": 1.9999999999999998e-05, "loss": 0.9468, "step": 7300 }, { "epoch": 0.6712328767123288, "grad_norm": 14.563289642333984, "learning_rate": 1.9931506849315068e-05, "loss": 1.0154, "step": 7350 }, { "epoch": 0.6757990867579908, "grad_norm": 29.9089298248291, "learning_rate": 1.9863013698630137e-05, "loss": 0.9872, "step": 7400 }, { "epoch": 0.680365296803653, "grad_norm": 7.758576393127441, "learning_rate": 1.9794520547945207e-05, "loss": 0.8618, "step": 7450 }, { "epoch": 0.684931506849315, "grad_norm": 15.849843978881836, "learning_rate": 1.9726027397260273e-05, "loss": 0.9981, "step": 7500 }, { "epoch": 0.6894977168949772, "grad_norm": 13.798691749572754, "learning_rate": 1.9657534246575342e-05, "loss": 1.0578, "step": 7550 }, { "epoch": 0.6940639269406392, "grad_norm": 12.977076530456543, "learning_rate": 1.9589041095890412e-05, "loss": 1.0081, "step": 7600 }, { "epoch": 0.6986301369863014, "grad_norm": 25.823368072509766, "learning_rate": 1.952054794520548e-05, "loss": 0.9483, "step": 7650 }, { "epoch": 0.7031963470319634, "grad_norm": 1.7260483503341675, "learning_rate": 1.9452054794520547e-05, "loss": 0.9396, "step": 7700 }, { "epoch": 0.7077625570776256, "grad_norm": 28.217559814453125, "learning_rate": 1.9383561643835617e-05, "loss": 1.0159, "step": 7750 }, { "epoch": 0.7123287671232876, "grad_norm": 15.566514015197754, "learning_rate": 1.9315068493150686e-05, "loss": 1.0673, "step": 7800 }, { "epoch": 0.7168949771689498, "grad_norm": 49.06792449951172, "learning_rate": 1.9246575342465756e-05, "loss": 0.9817, "step": 7850 }, { "epoch": 0.7214611872146118, "grad_norm": 35.709224700927734, "learning_rate": 1.9178082191780822e-05, "loss": 0.9818, "step": 7900 }, { "epoch": 0.726027397260274, "grad_norm": 20.94626808166504, "learning_rate": 1.910958904109589e-05, "loss": 0.9113, "step": 7950 }, { "epoch": 0.730593607305936, "grad_norm": 25.5698184967041, "learning_rate": 1.904109589041096e-05, "loss": 1.0525, "step": 8000 }, { "epoch": 0.7351598173515982, "grad_norm": 36.41669845581055, "learning_rate": 1.897260273972603e-05, "loss": 1.007, "step": 8050 }, { "epoch": 0.7397260273972602, "grad_norm": 17.15513038635254, "learning_rate": 1.8904109589041096e-05, "loss": 0.9568, "step": 8100 }, { "epoch": 0.7442922374429224, "grad_norm": 24.435543060302734, "learning_rate": 1.8835616438356166e-05, "loss": 1.0073, "step": 8150 }, { "epoch": 0.7488584474885844, "grad_norm": 26.5279483795166, "learning_rate": 1.8767123287671235e-05, "loss": 0.9768, "step": 8200 }, { "epoch": 0.7534246575342466, "grad_norm": 18.786693572998047, "learning_rate": 1.8698630136986305e-05, "loss": 1.0579, "step": 8250 }, { "epoch": 0.7579908675799086, "grad_norm": 22.201034545898438, "learning_rate": 1.863013698630137e-05, "loss": 1.003, "step": 8300 }, { "epoch": 0.7625570776255708, "grad_norm": 28.934587478637695, "learning_rate": 1.856164383561644e-05, "loss": 0.9753, "step": 8350 }, { "epoch": 0.7671232876712328, "grad_norm": 17.935937881469727, "learning_rate": 1.8493150684931506e-05, "loss": 0.9038, "step": 8400 }, { "epoch": 0.771689497716895, "grad_norm": 29.648881912231445, "learning_rate": 1.8424657534246576e-05, "loss": 1.0109, "step": 8450 }, { "epoch": 0.776255707762557, "grad_norm": 32.65835189819336, "learning_rate": 1.8356164383561642e-05, "loss": 1.0207, "step": 8500 }, { "epoch": 0.7808219178082192, "grad_norm": 21.674053192138672, "learning_rate": 1.828767123287671e-05, "loss": 1.0573, "step": 8550 }, { "epoch": 0.7853881278538812, "grad_norm": 19.038619995117188, "learning_rate": 1.821917808219178e-05, "loss": 1.042, "step": 8600 }, { "epoch": 0.7899543378995434, "grad_norm": 23.994884490966797, "learning_rate": 1.815068493150685e-05, "loss": 1.0316, "step": 8650 }, { "epoch": 0.7945205479452054, "grad_norm": 26.702028274536133, "learning_rate": 1.8082191780821916e-05, "loss": 0.9592, "step": 8700 }, { "epoch": 0.7990867579908676, "grad_norm": 7.485787391662598, "learning_rate": 1.8013698630136986e-05, "loss": 1.0175, "step": 8750 }, { "epoch": 0.8036529680365296, "grad_norm": 24.249893188476562, "learning_rate": 1.7945205479452055e-05, "loss": 1.0643, "step": 8800 }, { "epoch": 0.8082191780821918, "grad_norm": 31.136962890625, "learning_rate": 1.7876712328767125e-05, "loss": 0.8719, "step": 8850 }, { "epoch": 0.8127853881278538, "grad_norm": 18.02696418762207, "learning_rate": 1.780821917808219e-05, "loss": 0.9169, "step": 8900 }, { "epoch": 0.817351598173516, "grad_norm": 7.9297261238098145, "learning_rate": 1.773972602739726e-05, "loss": 0.9241, "step": 8950 }, { "epoch": 0.821917808219178, "grad_norm": 15.147187232971191, "learning_rate": 1.767123287671233e-05, "loss": 0.9451, "step": 9000 }, { "epoch": 0.8264840182648402, "grad_norm": 26.369773864746094, "learning_rate": 1.76027397260274e-05, "loss": 0.9198, "step": 9050 }, { "epoch": 0.8310502283105022, "grad_norm": 10.24289321899414, "learning_rate": 1.7534246575342465e-05, "loss": 0.9423, "step": 9100 }, { "epoch": 0.8356164383561644, "grad_norm": 17.285388946533203, "learning_rate": 1.7465753424657535e-05, "loss": 0.8636, "step": 9150 }, { "epoch": 0.8401826484018264, "grad_norm": 26.444520950317383, "learning_rate": 1.7397260273972604e-05, "loss": 1.0647, "step": 9200 }, { "epoch": 0.8447488584474886, "grad_norm": 37.353084564208984, "learning_rate": 1.7328767123287674e-05, "loss": 0.9484, "step": 9250 }, { "epoch": 0.8493150684931506, "grad_norm": 9.881476402282715, "learning_rate": 1.726027397260274e-05, "loss": 1.014, "step": 9300 }, { "epoch": 0.8538812785388128, "grad_norm": 27.257709503173828, "learning_rate": 1.719178082191781e-05, "loss": 0.9169, "step": 9350 }, { "epoch": 0.8584474885844748, "grad_norm": 20.704347610473633, "learning_rate": 1.712328767123288e-05, "loss": 0.9597, "step": 9400 }, { "epoch": 0.863013698630137, "grad_norm": 20.994293212890625, "learning_rate": 1.705479452054795e-05, "loss": 0.9328, "step": 9450 }, { "epoch": 0.867579908675799, "grad_norm": 31.617778778076172, "learning_rate": 1.6986301369863014e-05, "loss": 0.9988, "step": 9500 }, { "epoch": 0.8721461187214612, "grad_norm": 24.76189422607422, "learning_rate": 1.6917808219178084e-05, "loss": 0.9338, "step": 9550 }, { "epoch": 0.8767123287671232, "grad_norm": 27.70781135559082, "learning_rate": 1.6849315068493153e-05, "loss": 0.8994, "step": 9600 }, { "epoch": 0.8812785388127854, "grad_norm": 6.332718372344971, "learning_rate": 1.6780821917808223e-05, "loss": 0.8815, "step": 9650 }, { "epoch": 0.8858447488584474, "grad_norm": 25.22089195251465, "learning_rate": 1.6712328767123286e-05, "loss": 1.0243, "step": 9700 }, { "epoch": 0.8904109589041096, "grad_norm": 6.610196590423584, "learning_rate": 1.6643835616438355e-05, "loss": 0.856, "step": 9750 }, { "epoch": 0.8949771689497716, "grad_norm": 9.7604341506958, "learning_rate": 1.6575342465753425e-05, "loss": 0.9275, "step": 9800 }, { "epoch": 0.8995433789954338, "grad_norm": 13.222633361816406, "learning_rate": 1.6506849315068494e-05, "loss": 0.7912, "step": 9850 }, { "epoch": 0.9041095890410958, "grad_norm": 8.07390022277832, "learning_rate": 1.643835616438356e-05, "loss": 0.9296, "step": 9900 }, { "epoch": 0.908675799086758, "grad_norm": 8.25124740600586, "learning_rate": 1.636986301369863e-05, "loss": 0.9269, "step": 9950 }, { "epoch": 0.91324200913242, "grad_norm": 22.882007598876953, "learning_rate": 1.63013698630137e-05, "loss": 0.9694, "step": 10000 }, { "epoch": 0.9178082191780822, "grad_norm": 8.665299415588379, "learning_rate": 1.623287671232877e-05, "loss": 0.8643, "step": 10050 }, { "epoch": 0.9223744292237442, "grad_norm": 13.816045761108398, "learning_rate": 1.6164383561643835e-05, "loss": 0.9476, "step": 10100 }, { "epoch": 0.9269406392694064, "grad_norm": 11.227286338806152, "learning_rate": 1.6095890410958904e-05, "loss": 0.8805, "step": 10150 }, { "epoch": 0.9315068493150684, "grad_norm": 7.756823539733887, "learning_rate": 1.6027397260273974e-05, "loss": 0.9185, "step": 10200 }, { "epoch": 0.9360730593607306, "grad_norm": 3.170640230178833, "learning_rate": 1.5958904109589043e-05, "loss": 0.8761, "step": 10250 }, { "epoch": 0.9406392694063926, "grad_norm": 11.567336082458496, "learning_rate": 1.589041095890411e-05, "loss": 0.9298, "step": 10300 }, { "epoch": 0.9452054794520548, "grad_norm": 23.299612045288086, "learning_rate": 1.582191780821918e-05, "loss": 0.8361, "step": 10350 }, { "epoch": 0.9497716894977168, "grad_norm": 14.57529067993164, "learning_rate": 1.5753424657534248e-05, "loss": 0.9669, "step": 10400 }, { "epoch": 0.954337899543379, "grad_norm": 10.777979850769043, "learning_rate": 1.5684931506849318e-05, "loss": 0.9632, "step": 10450 }, { "epoch": 0.958904109589041, "grad_norm": 25.46247100830078, "learning_rate": 1.5616438356164384e-05, "loss": 0.8625, "step": 10500 }, { "epoch": 0.9634703196347032, "grad_norm": 8.925729751586914, "learning_rate": 1.5547945205479453e-05, "loss": 0.903, "step": 10550 }, { "epoch": 0.9680365296803652, "grad_norm": 11.709217071533203, "learning_rate": 1.5479452054794523e-05, "loss": 0.9446, "step": 10600 }, { "epoch": 0.9726027397260274, "grad_norm": 36.75910568237305, "learning_rate": 1.5410958904109592e-05, "loss": 0.9568, "step": 10650 }, { "epoch": 0.9771689497716894, "grad_norm": 30.059825897216797, "learning_rate": 1.5342465753424658e-05, "loss": 0.9603, "step": 10700 }, { "epoch": 0.9817351598173516, "grad_norm": 23.478477478027344, "learning_rate": 1.5273972602739728e-05, "loss": 0.9828, "step": 10750 }, { "epoch": 0.9863013698630136, "grad_norm": 34.233699798583984, "learning_rate": 1.5205479452054795e-05, "loss": 0.9867, "step": 10800 }, { "epoch": 0.9908675799086758, "grad_norm": 19.399288177490234, "learning_rate": 1.5136986301369865e-05, "loss": 0.8618, "step": 10850 }, { "epoch": 0.9954337899543378, "grad_norm": 9.319437026977539, "learning_rate": 1.5068493150684931e-05, "loss": 0.9352, "step": 10900 }, { "epoch": 1.0, "grad_norm": 8.767410278320312, "learning_rate": 1.5e-05, "loss": 0.8511, "step": 10950 }, { "epoch": 1.004566210045662, "grad_norm": 15.091998100280762, "learning_rate": 1.4931506849315068e-05, "loss": 0.6653, "step": 11000 }, { "epoch": 1.009132420091324, "grad_norm": 23.510337829589844, "learning_rate": 1.4863013698630138e-05, "loss": 0.6163, "step": 11050 }, { "epoch": 1.0136986301369864, "grad_norm": 8.980317115783691, "learning_rate": 1.4794520547945205e-05, "loss": 0.7783, "step": 11100 }, { "epoch": 1.0182648401826484, "grad_norm": 36.67283248901367, "learning_rate": 1.4726027397260275e-05, "loss": 0.7681, "step": 11150 }, { "epoch": 1.0228310502283104, "grad_norm": 4.734091758728027, "learning_rate": 1.4657534246575343e-05, "loss": 0.6181, "step": 11200 }, { "epoch": 1.0273972602739727, "grad_norm": 7.513982772827148, "learning_rate": 1.4589041095890412e-05, "loss": 0.7297, "step": 11250 }, { "epoch": 1.0319634703196348, "grad_norm": 16.1832275390625, "learning_rate": 1.4520547945205478e-05, "loss": 0.6487, "step": 11300 }, { "epoch": 1.0365296803652968, "grad_norm": 25.557220458984375, "learning_rate": 1.4452054794520548e-05, "loss": 0.7089, "step": 11350 }, { "epoch": 1.0410958904109588, "grad_norm": 11.07717227935791, "learning_rate": 1.4383561643835615e-05, "loss": 0.6266, "step": 11400 }, { "epoch": 1.045662100456621, "grad_norm": 11.381722450256348, "learning_rate": 1.4315068493150685e-05, "loss": 0.6758, "step": 11450 }, { "epoch": 1.0502283105022832, "grad_norm": 9.080512046813965, "learning_rate": 1.4246575342465753e-05, "loss": 0.6865, "step": 11500 }, { "epoch": 1.0547945205479452, "grad_norm": 10.727092742919922, "learning_rate": 1.4178082191780822e-05, "loss": 0.6494, "step": 11550 }, { "epoch": 1.0593607305936072, "grad_norm": 4.969435691833496, "learning_rate": 1.410958904109589e-05, "loss": 0.5874, "step": 11600 }, { "epoch": 1.0639269406392695, "grad_norm": 12.03378963470459, "learning_rate": 1.404109589041096e-05, "loss": 0.6301, "step": 11650 }, { "epoch": 1.0684931506849316, "grad_norm": 8.797894477844238, "learning_rate": 1.3972602739726027e-05, "loss": 0.6839, "step": 11700 }, { "epoch": 1.0730593607305936, "grad_norm": 9.395676612854004, "learning_rate": 1.3904109589041097e-05, "loss": 0.6672, "step": 11750 }, { "epoch": 1.0776255707762556, "grad_norm": 26.268247604370117, "learning_rate": 1.3835616438356164e-05, "loss": 0.6398, "step": 11800 }, { "epoch": 1.0821917808219177, "grad_norm": 19.63582992553711, "learning_rate": 1.3767123287671234e-05, "loss": 0.7825, "step": 11850 }, { "epoch": 1.08675799086758, "grad_norm": 23.42495346069336, "learning_rate": 1.3698630136986302e-05, "loss": 0.7078, "step": 11900 }, { "epoch": 1.091324200913242, "grad_norm": 29.21826934814453, "learning_rate": 1.363013698630137e-05, "loss": 0.6973, "step": 11950 }, { "epoch": 1.095890410958904, "grad_norm": 13.69019603729248, "learning_rate": 1.3561643835616437e-05, "loss": 0.6325, "step": 12000 }, { "epoch": 1.1004566210045663, "grad_norm": 22.293821334838867, "learning_rate": 1.3493150684931507e-05, "loss": 0.6833, "step": 12050 }, { "epoch": 1.1050228310502284, "grad_norm": 9.701814651489258, "learning_rate": 1.3424657534246575e-05, "loss": 0.6518, "step": 12100 }, { "epoch": 1.1095890410958904, "grad_norm": 25.30548667907715, "learning_rate": 1.3356164383561644e-05, "loss": 0.7458, "step": 12150 }, { "epoch": 1.1141552511415524, "grad_norm": 11.800670623779297, "learning_rate": 1.3287671232876712e-05, "loss": 0.755, "step": 12200 }, { "epoch": 1.1187214611872145, "grad_norm": 15.019549369812012, "learning_rate": 1.3219178082191781e-05, "loss": 0.6871, "step": 12250 }, { "epoch": 1.1232876712328768, "grad_norm": 8.90066909790039, "learning_rate": 1.3150684931506849e-05, "loss": 0.738, "step": 12300 }, { "epoch": 1.1278538812785388, "grad_norm": 6.738426685333252, "learning_rate": 1.3082191780821919e-05, "loss": 0.6828, "step": 12350 }, { "epoch": 1.1324200913242009, "grad_norm": 7.866949558258057, "learning_rate": 1.3013698630136986e-05, "loss": 0.708, "step": 12400 }, { "epoch": 1.1369863013698631, "grad_norm": 26.066892623901367, "learning_rate": 1.2945205479452056e-05, "loss": 0.7078, "step": 12450 }, { "epoch": 1.1415525114155252, "grad_norm": 7.540081024169922, "learning_rate": 1.2876712328767124e-05, "loss": 0.6958, "step": 12500 }, { "epoch": 1.1461187214611872, "grad_norm": 13.667712211608887, "learning_rate": 1.2808219178082193e-05, "loss": 0.5233, "step": 12550 }, { "epoch": 1.1506849315068493, "grad_norm": 17.8378849029541, "learning_rate": 1.2739726027397259e-05, "loss": 0.669, "step": 12600 }, { "epoch": 1.1552511415525113, "grad_norm": 25.038570404052734, "learning_rate": 1.2671232876712329e-05, "loss": 0.5989, "step": 12650 }, { "epoch": 1.1598173515981736, "grad_norm": 2.4400246143341064, "learning_rate": 1.2602739726027396e-05, "loss": 0.745, "step": 12700 }, { "epoch": 1.1643835616438356, "grad_norm": 3.209836006164551, "learning_rate": 1.2534246575342466e-05, "loss": 0.6943, "step": 12750 }, { "epoch": 1.1689497716894977, "grad_norm": 12.491949081420898, "learning_rate": 1.2465753424657534e-05, "loss": 0.6115, "step": 12800 }, { "epoch": 1.17351598173516, "grad_norm": 11.220638275146484, "learning_rate": 1.2397260273972603e-05, "loss": 0.5898, "step": 12850 }, { "epoch": 1.178082191780822, "grad_norm": 16.73185920715332, "learning_rate": 1.2328767123287671e-05, "loss": 0.6734, "step": 12900 }, { "epoch": 1.182648401826484, "grad_norm": 14.307918548583984, "learning_rate": 1.226027397260274e-05, "loss": 0.7266, "step": 12950 }, { "epoch": 1.187214611872146, "grad_norm": 14.836384773254395, "learning_rate": 1.2191780821917808e-05, "loss": 0.7586, "step": 13000 }, { "epoch": 1.191780821917808, "grad_norm": 29.282228469848633, "learning_rate": 1.2123287671232878e-05, "loss": 0.7917, "step": 13050 }, { "epoch": 1.1963470319634704, "grad_norm": 26.005468368530273, "learning_rate": 1.2054794520547945e-05, "loss": 0.6413, "step": 13100 }, { "epoch": 1.2009132420091324, "grad_norm": 8.961703300476074, "learning_rate": 1.1986301369863015e-05, "loss": 0.792, "step": 13150 }, { "epoch": 1.2054794520547945, "grad_norm": 10.702567100524902, "learning_rate": 1.1917808219178083e-05, "loss": 0.6594, "step": 13200 }, { "epoch": 1.2100456621004567, "grad_norm": 13.418671607971191, "learning_rate": 1.184931506849315e-05, "loss": 0.7547, "step": 13250 }, { "epoch": 1.2146118721461188, "grad_norm": 17.413429260253906, "learning_rate": 1.1780821917808218e-05, "loss": 0.7908, "step": 13300 }, { "epoch": 1.2191780821917808, "grad_norm": 18.375572204589844, "learning_rate": 1.1712328767123288e-05, "loss": 0.7298, "step": 13350 }, { "epoch": 1.2237442922374429, "grad_norm": 12.126824378967285, "learning_rate": 1.1643835616438355e-05, "loss": 0.7482, "step": 13400 }, { "epoch": 1.228310502283105, "grad_norm": 6.292917728424072, "learning_rate": 1.1575342465753425e-05, "loss": 0.7049, "step": 13450 }, { "epoch": 1.2328767123287672, "grad_norm": 48.449954986572266, "learning_rate": 1.1506849315068493e-05, "loss": 0.5808, "step": 13500 }, { "epoch": 1.2374429223744292, "grad_norm": 11.724007606506348, "learning_rate": 1.1438356164383562e-05, "loss": 0.6694, "step": 13550 }, { "epoch": 1.2420091324200913, "grad_norm": 11.694669723510742, "learning_rate": 1.136986301369863e-05, "loss": 0.7223, "step": 13600 }, { "epoch": 1.2465753424657535, "grad_norm": 18.402746200561523, "learning_rate": 1.13013698630137e-05, "loss": 0.6794, "step": 13650 }, { "epoch": 1.2511415525114156, "grad_norm": 10.988481521606445, "learning_rate": 1.1232876712328767e-05, "loss": 0.618, "step": 13700 }, { "epoch": 1.2557077625570776, "grad_norm": 8.175172805786133, "learning_rate": 1.1164383561643837e-05, "loss": 0.6552, "step": 13750 }, { "epoch": 1.2602739726027397, "grad_norm": 17.85883903503418, "learning_rate": 1.1095890410958904e-05, "loss": 0.6645, "step": 13800 }, { "epoch": 1.2648401826484017, "grad_norm": 6.0199198722839355, "learning_rate": 1.1027397260273972e-05, "loss": 0.7725, "step": 13850 }, { "epoch": 1.269406392694064, "grad_norm": 12.200461387634277, "learning_rate": 1.095890410958904e-05, "loss": 0.6685, "step": 13900 }, { "epoch": 1.273972602739726, "grad_norm": 11.29808235168457, "learning_rate": 1.089041095890411e-05, "loss": 0.5974, "step": 13950 }, { "epoch": 1.278538812785388, "grad_norm": 20.522750854492188, "learning_rate": 1.0821917808219177e-05, "loss": 0.6497, "step": 14000 }, { "epoch": 1.2831050228310503, "grad_norm": 4.903714656829834, "learning_rate": 1.0753424657534247e-05, "loss": 0.805, "step": 14050 }, { "epoch": 1.2876712328767124, "grad_norm": 14.09408950805664, "learning_rate": 1.0684931506849315e-05, "loss": 0.6975, "step": 14100 }, { "epoch": 1.2922374429223744, "grad_norm": 5.306293964385986, "learning_rate": 1.0616438356164384e-05, "loss": 0.6181, "step": 14150 }, { "epoch": 1.2968036529680365, "grad_norm": 15.217981338500977, "learning_rate": 1.0547945205479452e-05, "loss": 0.6363, "step": 14200 }, { "epoch": 1.3013698630136985, "grad_norm": 2.86833119392395, "learning_rate": 1.0479452054794521e-05, "loss": 0.632, "step": 14250 }, { "epoch": 1.3059360730593608, "grad_norm": 12.824779510498047, "learning_rate": 1.0410958904109589e-05, "loss": 0.7499, "step": 14300 }, { "epoch": 1.3105022831050228, "grad_norm": 17.47176170349121, "learning_rate": 1.0342465753424659e-05, "loss": 0.581, "step": 14350 }, { "epoch": 1.3150684931506849, "grad_norm": 7.620934009552002, "learning_rate": 1.0273972602739726e-05, "loss": 0.5913, "step": 14400 }, { "epoch": 1.3196347031963471, "grad_norm": 11.1875, "learning_rate": 1.0205479452054796e-05, "loss": 0.6465, "step": 14450 }, { "epoch": 1.3242009132420092, "grad_norm": 24.72510528564453, "learning_rate": 1.0136986301369862e-05, "loss": 0.6796, "step": 14500 }, { "epoch": 1.3287671232876712, "grad_norm": 6.102226734161377, "learning_rate": 1.0068493150684931e-05, "loss": 0.7074, "step": 14550 }, { "epoch": 1.3333333333333333, "grad_norm": 6.332953929901123, "learning_rate": 9.999999999999999e-06, "loss": 0.6338, "step": 14600 }, { "epoch": 1.3378995433789953, "grad_norm": 28.30670738220215, "learning_rate": 9.931506849315069e-06, "loss": 0.7109, "step": 14650 }, { "epoch": 1.3424657534246576, "grad_norm": 40.727230072021484, "learning_rate": 9.863013698630136e-06, "loss": 0.7342, "step": 14700 }, { "epoch": 1.3470319634703196, "grad_norm": 11.026389122009277, "learning_rate": 9.794520547945206e-06, "loss": 0.6972, "step": 14750 }, { "epoch": 1.3515981735159817, "grad_norm": 16.95206642150879, "learning_rate": 9.726027397260274e-06, "loss": 0.6509, "step": 14800 }, { "epoch": 1.356164383561644, "grad_norm": 24.887845993041992, "learning_rate": 9.657534246575343e-06, "loss": 0.6608, "step": 14850 }, { "epoch": 1.360730593607306, "grad_norm": 5.1824421882629395, "learning_rate": 9.589041095890411e-06, "loss": 0.664, "step": 14900 }, { "epoch": 1.365296803652968, "grad_norm": 35.08380889892578, "learning_rate": 9.52054794520548e-06, "loss": 0.7375, "step": 14950 }, { "epoch": 1.36986301369863, "grad_norm": 13.273919105529785, "learning_rate": 9.452054794520548e-06, "loss": 0.6581, "step": 15000 }, { "epoch": 1.374429223744292, "grad_norm": 20.243751525878906, "learning_rate": 9.383561643835618e-06, "loss": 0.7028, "step": 15050 }, { "epoch": 1.3789954337899544, "grad_norm": 6.9884934425354, "learning_rate": 9.315068493150685e-06, "loss": 0.6699, "step": 15100 }, { "epoch": 1.3835616438356164, "grad_norm": 10.110499382019043, "learning_rate": 9.246575342465753e-06, "loss": 0.5804, "step": 15150 }, { "epoch": 1.3881278538812785, "grad_norm": 5.272585868835449, "learning_rate": 9.178082191780821e-06, "loss": 0.7046, "step": 15200 }, { "epoch": 1.3926940639269407, "grad_norm": 3.3239293098449707, "learning_rate": 9.10958904109589e-06, "loss": 0.6621, "step": 15250 }, { "epoch": 1.3972602739726028, "grad_norm": 9.12402057647705, "learning_rate": 9.041095890410958e-06, "loss": 0.5613, "step": 15300 }, { "epoch": 1.4018264840182648, "grad_norm": 9.260445594787598, "learning_rate": 8.972602739726028e-06, "loss": 0.6179, "step": 15350 }, { "epoch": 1.4063926940639269, "grad_norm": 40.01394271850586, "learning_rate": 8.904109589041095e-06, "loss": 0.5897, "step": 15400 }, { "epoch": 1.410958904109589, "grad_norm": 43.27082443237305, "learning_rate": 8.835616438356165e-06, "loss": 0.5804, "step": 15450 }, { "epoch": 1.4155251141552512, "grad_norm": 8.149184226989746, "learning_rate": 8.767123287671233e-06, "loss": 0.7865, "step": 15500 }, { "epoch": 1.4200913242009132, "grad_norm": 16.708332061767578, "learning_rate": 8.698630136986302e-06, "loss": 0.6983, "step": 15550 }, { "epoch": 1.4246575342465753, "grad_norm": 2.826059103012085, "learning_rate": 8.63013698630137e-06, "loss": 0.6238, "step": 15600 }, { "epoch": 1.4292237442922375, "grad_norm": 13.411745071411133, "learning_rate": 8.56164383561644e-06, "loss": 0.6448, "step": 15650 }, { "epoch": 1.4337899543378996, "grad_norm": 12.094820976257324, "learning_rate": 8.493150684931507e-06, "loss": 0.6868, "step": 15700 }, { "epoch": 1.4383561643835616, "grad_norm": 13.272956848144531, "learning_rate": 8.424657534246577e-06, "loss": 0.6758, "step": 15750 }, { "epoch": 1.4429223744292237, "grad_norm": 7.710869312286377, "learning_rate": 8.356164383561643e-06, "loss": 0.6154, "step": 15800 }, { "epoch": 1.4474885844748857, "grad_norm": 24.845901489257812, "learning_rate": 8.287671232876712e-06, "loss": 0.6228, "step": 15850 }, { "epoch": 1.452054794520548, "grad_norm": 10.101279258728027, "learning_rate": 8.21917808219178e-06, "loss": 0.7418, "step": 15900 }, { "epoch": 1.45662100456621, "grad_norm": 71.19440460205078, "learning_rate": 8.15068493150685e-06, "loss": 0.7128, "step": 15950 }, { "epoch": 1.461187214611872, "grad_norm": 6.2137041091918945, "learning_rate": 8.082191780821917e-06, "loss": 0.7121, "step": 16000 }, { "epoch": 1.4657534246575343, "grad_norm": 20.703536987304688, "learning_rate": 8.013698630136987e-06, "loss": 0.7115, "step": 16050 }, { "epoch": 1.4703196347031964, "grad_norm": 26.53441619873047, "learning_rate": 7.945205479452055e-06, "loss": 0.5534, "step": 16100 }, { "epoch": 1.4748858447488584, "grad_norm": 20.233125686645508, "learning_rate": 7.876712328767124e-06, "loss": 0.6204, "step": 16150 }, { "epoch": 1.4794520547945205, "grad_norm": 23.36627769470215, "learning_rate": 7.808219178082192e-06, "loss": 0.6914, "step": 16200 }, { "epoch": 1.4840182648401825, "grad_norm": 11.94163703918457, "learning_rate": 7.739726027397261e-06, "loss": 0.6062, "step": 16250 }, { "epoch": 1.4885844748858448, "grad_norm": 15.757901191711426, "learning_rate": 7.671232876712329e-06, "loss": 0.6142, "step": 16300 }, { "epoch": 1.4931506849315068, "grad_norm": 12.007556915283203, "learning_rate": 7.602739726027398e-06, "loss": 0.6708, "step": 16350 }, { "epoch": 1.4977168949771689, "grad_norm": 9.127739906311035, "learning_rate": 7.5342465753424655e-06, "loss": 0.6319, "step": 16400 }, { "epoch": 1.5022831050228311, "grad_norm": 14.81264877319336, "learning_rate": 7.465753424657534e-06, "loss": 0.6143, "step": 16450 }, { "epoch": 1.5068493150684932, "grad_norm": 8.986160278320312, "learning_rate": 7.397260273972603e-06, "loss": 0.7508, "step": 16500 }, { "epoch": 1.5114155251141552, "grad_norm": 23.52107810974121, "learning_rate": 7.328767123287671e-06, "loss": 0.6377, "step": 16550 }, { "epoch": 1.5159817351598175, "grad_norm": 4.14312219619751, "learning_rate": 7.260273972602739e-06, "loss": 0.595, "step": 16600 }, { "epoch": 1.5205479452054793, "grad_norm": 32.40318298339844, "learning_rate": 7.191780821917808e-06, "loss": 0.6177, "step": 16650 }, { "epoch": 1.5251141552511416, "grad_norm": 13.583507537841797, "learning_rate": 7.123287671232876e-06, "loss": 0.6513, "step": 16700 }, { "epoch": 1.5296803652968036, "grad_norm": 27.02487564086914, "learning_rate": 7.054794520547945e-06, "loss": 0.6136, "step": 16750 }, { "epoch": 1.5342465753424657, "grad_norm": 46.82355880737305, "learning_rate": 6.986301369863014e-06, "loss": 0.6545, "step": 16800 }, { "epoch": 1.538812785388128, "grad_norm": 11.266030311584473, "learning_rate": 6.917808219178082e-06, "loss": 0.6438, "step": 16850 }, { "epoch": 1.54337899543379, "grad_norm": 21.652450561523438, "learning_rate": 6.849315068493151e-06, "loss": 0.6617, "step": 16900 }, { "epoch": 1.547945205479452, "grad_norm": 18.254072189331055, "learning_rate": 6.780821917808219e-06, "loss": 0.645, "step": 16950 }, { "epoch": 1.5525114155251143, "grad_norm": 7.959854602813721, "learning_rate": 6.712328767123287e-06, "loss": 0.7444, "step": 17000 }, { "epoch": 1.5570776255707761, "grad_norm": 15.169598579406738, "learning_rate": 6.643835616438356e-06, "loss": 0.6788, "step": 17050 }, { "epoch": 1.5616438356164384, "grad_norm": 14.872618675231934, "learning_rate": 6.5753424657534245e-06, "loss": 0.7433, "step": 17100 }, { "epoch": 1.5662100456621004, "grad_norm": 12.479876518249512, "learning_rate": 6.506849315068493e-06, "loss": 0.676, "step": 17150 }, { "epoch": 1.5707762557077625, "grad_norm": 4.30610990524292, "learning_rate": 6.438356164383562e-06, "loss": 0.597, "step": 17200 }, { "epoch": 1.5753424657534247, "grad_norm": 9.52687931060791, "learning_rate": 6.3698630136986296e-06, "loss": 0.6084, "step": 17250 }, { "epoch": 1.5799086757990868, "grad_norm": 6.067666053771973, "learning_rate": 6.301369863013698e-06, "loss": 0.61, "step": 17300 }, { "epoch": 1.5844748858447488, "grad_norm": 15.737329483032227, "learning_rate": 6.232876712328767e-06, "loss": 0.7011, "step": 17350 }, { "epoch": 1.589041095890411, "grad_norm": 4.7880730628967285, "learning_rate": 6.1643835616438354e-06, "loss": 0.694, "step": 17400 }, { "epoch": 1.593607305936073, "grad_norm": 19.6992130279541, "learning_rate": 6.095890410958904e-06, "loss": 0.706, "step": 17450 }, { "epoch": 1.5981735159817352, "grad_norm": 10.835814476013184, "learning_rate": 6.027397260273973e-06, "loss": 0.6298, "step": 17500 }, { "epoch": 1.6027397260273972, "grad_norm": 13.219555854797363, "learning_rate": 5.958904109589041e-06, "loss": 0.6496, "step": 17550 }, { "epoch": 1.6073059360730593, "grad_norm": 22.237091064453125, "learning_rate": 5.890410958904109e-06, "loss": 0.6585, "step": 17600 }, { "epoch": 1.6118721461187215, "grad_norm": 12.173138618469238, "learning_rate": 5.821917808219178e-06, "loss": 0.712, "step": 17650 }, { "epoch": 1.6164383561643836, "grad_norm": 6.5948896408081055, "learning_rate": 5.753424657534246e-06, "loss": 0.6253, "step": 17700 }, { "epoch": 1.6210045662100456, "grad_norm": 5.447400093078613, "learning_rate": 5.684931506849315e-06, "loss": 0.5762, "step": 17750 }, { "epoch": 1.625570776255708, "grad_norm": 12.413744926452637, "learning_rate": 5.616438356164384e-06, "loss": 0.6272, "step": 17800 }, { "epoch": 1.6301369863013697, "grad_norm": 48.877052307128906, "learning_rate": 5.547945205479452e-06, "loss": 0.589, "step": 17850 }, { "epoch": 1.634703196347032, "grad_norm": 21.571834564208984, "learning_rate": 5.47945205479452e-06, "loss": 0.6592, "step": 17900 }, { "epoch": 1.639269406392694, "grad_norm": 7.417336940765381, "learning_rate": 5.410958904109589e-06, "loss": 0.5248, "step": 17950 }, { "epoch": 1.643835616438356, "grad_norm": 48.4299430847168, "learning_rate": 5.342465753424657e-06, "loss": 0.6622, "step": 18000 }, { "epoch": 1.6484018264840183, "grad_norm": 17.760732650756836, "learning_rate": 5.273972602739726e-06, "loss": 0.626, "step": 18050 }, { "epoch": 1.6529680365296804, "grad_norm": 14.000800132751465, "learning_rate": 5.2054794520547945e-06, "loss": 0.6084, "step": 18100 }, { "epoch": 1.6575342465753424, "grad_norm": 8.742088317871094, "learning_rate": 5.136986301369863e-06, "loss": 0.6029, "step": 18150 }, { "epoch": 1.6621004566210047, "grad_norm": 26.16733169555664, "learning_rate": 5.068493150684931e-06, "loss": 0.6828, "step": 18200 }, { "epoch": 1.6666666666666665, "grad_norm": 29.90041160583496, "learning_rate": 4.9999999999999996e-06, "loss": 0.5702, "step": 18250 }, { "epoch": 1.6712328767123288, "grad_norm": 15.568696022033691, "learning_rate": 4.931506849315068e-06, "loss": 0.6376, "step": 18300 }, { "epoch": 1.6757990867579908, "grad_norm": 15.59715747833252, "learning_rate": 4.863013698630137e-06, "loss": 0.6776, "step": 18350 }, { "epoch": 1.6803652968036529, "grad_norm": 11.07044506072998, "learning_rate": 4.7945205479452054e-06, "loss": 0.529, "step": 18400 }, { "epoch": 1.6849315068493151, "grad_norm": 5.349613666534424, "learning_rate": 4.726027397260274e-06, "loss": 0.692, "step": 18450 }, { "epoch": 1.6894977168949772, "grad_norm": 18.147794723510742, "learning_rate": 4.657534246575343e-06, "loss": 0.7057, "step": 18500 }, { "epoch": 1.6940639269406392, "grad_norm": 13.486499786376953, "learning_rate": 4.5890410958904105e-06, "loss": 0.6415, "step": 18550 }, { "epoch": 1.6986301369863015, "grad_norm": 10.16304874420166, "learning_rate": 4.520547945205479e-06, "loss": 0.6604, "step": 18600 }, { "epoch": 1.7031963470319633, "grad_norm": 29.04235076904297, "learning_rate": 4.452054794520548e-06, "loss": 0.6773, "step": 18650 }, { "epoch": 1.7077625570776256, "grad_norm": 9.932119369506836, "learning_rate": 4.383561643835616e-06, "loss": 0.617, "step": 18700 }, { "epoch": 1.7123287671232876, "grad_norm": 70.96830749511719, "learning_rate": 4.315068493150685e-06, "loss": 0.6157, "step": 18750 }, { "epoch": 1.7168949771689497, "grad_norm": 15.635278701782227, "learning_rate": 4.246575342465754e-06, "loss": 0.6127, "step": 18800 }, { "epoch": 1.721461187214612, "grad_norm": 2.3553667068481445, "learning_rate": 4.178082191780821e-06, "loss": 0.6847, "step": 18850 }, { "epoch": 1.726027397260274, "grad_norm": 8.880425453186035, "learning_rate": 4.10958904109589e-06, "loss": 0.5043, "step": 18900 }, { "epoch": 1.730593607305936, "grad_norm": 22.30252456665039, "learning_rate": 4.041095890410959e-06, "loss": 0.5546, "step": 18950 }, { "epoch": 1.7351598173515983, "grad_norm": 7.0828142166137695, "learning_rate": 3.972602739726027e-06, "loss": 0.5997, "step": 19000 }, { "epoch": 1.7397260273972601, "grad_norm": 25.635570526123047, "learning_rate": 3.904109589041096e-06, "loss": 0.6091, "step": 19050 }, { "epoch": 1.7442922374429224, "grad_norm": 9.004444122314453, "learning_rate": 3.8356164383561645e-06, "loss": 0.6257, "step": 19100 }, { "epoch": 1.7488584474885844, "grad_norm": 5.485713958740234, "learning_rate": 3.7671232876712327e-06, "loss": 0.6728, "step": 19150 }, { "epoch": 1.7534246575342465, "grad_norm": 35.63444519042969, "learning_rate": 3.6986301369863014e-06, "loss": 0.7696, "step": 19200 }, { "epoch": 1.7579908675799087, "grad_norm": 29.199115753173828, "learning_rate": 3.6301369863013696e-06, "loss": 0.59, "step": 19250 }, { "epoch": 1.7625570776255708, "grad_norm": 18.041336059570312, "learning_rate": 3.561643835616438e-06, "loss": 0.5744, "step": 19300 }, { "epoch": 1.7671232876712328, "grad_norm": 16.391035079956055, "learning_rate": 3.493150684931507e-06, "loss": 0.6831, "step": 19350 }, { "epoch": 1.771689497716895, "grad_norm": 9.0728759765625, "learning_rate": 3.4246575342465754e-06, "loss": 0.5779, "step": 19400 }, { "epoch": 1.776255707762557, "grad_norm": 18.102890014648438, "learning_rate": 3.3561643835616436e-06, "loss": 0.5507, "step": 19450 }, { "epoch": 1.7808219178082192, "grad_norm": 17.248735427856445, "learning_rate": 3.2876712328767123e-06, "loss": 0.7226, "step": 19500 }, { "epoch": 1.7853881278538812, "grad_norm": 27.942777633666992, "learning_rate": 3.219178082191781e-06, "loss": 0.7178, "step": 19550 }, { "epoch": 1.7899543378995433, "grad_norm": 5.3349809646606445, "learning_rate": 3.150684931506849e-06, "loss": 0.5566, "step": 19600 }, { "epoch": 1.7945205479452055, "grad_norm": 30.73387908935547, "learning_rate": 3.0821917808219177e-06, "loss": 0.7677, "step": 19650 }, { "epoch": 1.7990867579908676, "grad_norm": 21.230749130249023, "learning_rate": 3.0136986301369864e-06, "loss": 0.61, "step": 19700 }, { "epoch": 1.8036529680365296, "grad_norm": 10.363127708435059, "learning_rate": 2.9452054794520546e-06, "loss": 0.6385, "step": 19750 }, { "epoch": 1.808219178082192, "grad_norm": 1.5289000272750854, "learning_rate": 2.876712328767123e-06, "loss": 0.6747, "step": 19800 }, { "epoch": 1.8127853881278537, "grad_norm": 18.69597625732422, "learning_rate": 2.808219178082192e-06, "loss": 0.5934, "step": 19850 }, { "epoch": 1.817351598173516, "grad_norm": 10.11032772064209, "learning_rate": 2.73972602739726e-06, "loss": 0.5918, "step": 19900 }, { "epoch": 1.821917808219178, "grad_norm": 30.333818435668945, "learning_rate": 2.6712328767123286e-06, "loss": 0.6381, "step": 19950 }, { "epoch": 1.82648401826484, "grad_norm": 15.229024887084961, "learning_rate": 2.6027397260273973e-06, "loss": 0.6151, "step": 20000 }, { "epoch": 1.8310502283105023, "grad_norm": 28.18828773498535, "learning_rate": 2.5342465753424655e-06, "loss": 0.6968, "step": 20050 }, { "epoch": 1.8356164383561644, "grad_norm": 48.18195724487305, "learning_rate": 2.465753424657534e-06, "loss": 0.6256, "step": 20100 }, { "epoch": 1.8401826484018264, "grad_norm": 16.33424949645996, "learning_rate": 2.3972602739726027e-06, "loss": 0.6477, "step": 20150 }, { "epoch": 1.8447488584474887, "grad_norm": 25.994909286499023, "learning_rate": 2.3287671232876713e-06, "loss": 0.692, "step": 20200 }, { "epoch": 1.8493150684931505, "grad_norm": 8.117030143737793, "learning_rate": 2.2602739726027396e-06, "loss": 0.6933, "step": 20250 }, { "epoch": 1.8538812785388128, "grad_norm": 8.02834415435791, "learning_rate": 2.191780821917808e-06, "loss": 0.6413, "step": 20300 }, { "epoch": 1.8584474885844748, "grad_norm": 22.62827491760254, "learning_rate": 2.123287671232877e-06, "loss": 0.679, "step": 20350 }, { "epoch": 1.8630136986301369, "grad_norm": 9.562274932861328, "learning_rate": 2.054794520547945e-06, "loss": 0.6615, "step": 20400 }, { "epoch": 1.8675799086757991, "grad_norm": 12.407808303833008, "learning_rate": 1.9863013698630136e-06, "loss": 0.7476, "step": 20450 }, { "epoch": 1.8721461187214612, "grad_norm": 41.344093322753906, "learning_rate": 1.9178082191780823e-06, "loss": 0.5827, "step": 20500 }, { "epoch": 1.8767123287671232, "grad_norm": 10.044130325317383, "learning_rate": 1.8493150684931507e-06, "loss": 0.6566, "step": 20550 }, { "epoch": 1.8812785388127855, "grad_norm": 9.382560729980469, "learning_rate": 1.780821917808219e-06, "loss": 0.6259, "step": 20600 }, { "epoch": 1.8858447488584473, "grad_norm": 9.731813430786133, "learning_rate": 1.7123287671232877e-06, "loss": 0.5924, "step": 20650 }, { "epoch": 1.8904109589041096, "grad_norm": 13.417922973632812, "learning_rate": 1.6438356164383561e-06, "loss": 0.6229, "step": 20700 }, { "epoch": 1.8949771689497716, "grad_norm": 32.03701400756836, "learning_rate": 1.5753424657534245e-06, "loss": 0.5761, "step": 20750 }, { "epoch": 1.8995433789954337, "grad_norm": 6.067290782928467, "learning_rate": 1.5068493150684932e-06, "loss": 0.6463, "step": 20800 }, { "epoch": 1.904109589041096, "grad_norm": 19.67026710510254, "learning_rate": 1.4383561643835616e-06, "loss": 0.6671, "step": 20850 }, { "epoch": 1.908675799086758, "grad_norm": 50.498802185058594, "learning_rate": 1.36986301369863e-06, "loss": 0.5811, "step": 20900 }, { "epoch": 1.91324200913242, "grad_norm": 15.981374740600586, "learning_rate": 1.3013698630136986e-06, "loss": 0.6823, "step": 20950 }, { "epoch": 1.9178082191780823, "grad_norm": 19.175485610961914, "learning_rate": 1.232876712328767e-06, "loss": 0.6112, "step": 21000 }, { "epoch": 1.9223744292237441, "grad_norm": 8.795181274414062, "learning_rate": 1.1643835616438357e-06, "loss": 0.5474, "step": 21050 }, { "epoch": 1.9269406392694064, "grad_norm": 3.590404748916626, "learning_rate": 1.095890410958904e-06, "loss": 0.7026, "step": 21100 }, { "epoch": 1.9315068493150684, "grad_norm": 21.619140625, "learning_rate": 1.0273972602739725e-06, "loss": 0.701, "step": 21150 }, { "epoch": 1.9360730593607305, "grad_norm": 22.84990692138672, "learning_rate": 9.589041095890411e-07, "loss": 0.5942, "step": 21200 }, { "epoch": 1.9406392694063928, "grad_norm": 28.548683166503906, "learning_rate": 8.904109589041095e-07, "loss": 0.6343, "step": 21250 }, { "epoch": 1.9452054794520548, "grad_norm": 20.216583251953125, "learning_rate": 8.219178082191781e-07, "loss": 0.6836, "step": 21300 }, { "epoch": 1.9497716894977168, "grad_norm": 22.950048446655273, "learning_rate": 7.534246575342466e-07, "loss": 0.6815, "step": 21350 }, { "epoch": 1.954337899543379, "grad_norm": 20.81789207458496, "learning_rate": 6.84931506849315e-07, "loss": 0.7679, "step": 21400 }, { "epoch": 1.958904109589041, "grad_norm": 14.494473457336426, "learning_rate": 6.164383561643835e-07, "loss": 0.5561, "step": 21450 }, { "epoch": 1.9634703196347032, "grad_norm": 17.119592666625977, "learning_rate": 5.47945205479452e-07, "loss": 0.6881, "step": 21500 }, { "epoch": 1.9680365296803652, "grad_norm": 8.369721412658691, "learning_rate": 4.794520547945206e-07, "loss": 0.6692, "step": 21550 }, { "epoch": 1.9726027397260273, "grad_norm": 11.264369010925293, "learning_rate": 4.1095890410958903e-07, "loss": 0.6405, "step": 21600 }, { "epoch": 1.9771689497716896, "grad_norm": 3.320608615875244, "learning_rate": 3.424657534246575e-07, "loss": 0.671, "step": 21650 }, { "epoch": 1.9817351598173516, "grad_norm": 19.23822784423828, "learning_rate": 2.73972602739726e-07, "loss": 0.5475, "step": 21700 }, { "epoch": 1.9863013698630136, "grad_norm": 13.919721603393555, "learning_rate": 2.0547945205479452e-07, "loss": 0.7422, "step": 21750 }, { "epoch": 1.990867579908676, "grad_norm": 14.141480445861816, "learning_rate": 1.36986301369863e-07, "loss": 0.5855, "step": 21800 }, { "epoch": 1.9954337899543377, "grad_norm": 19.506189346313477, "learning_rate": 6.84931506849315e-08, "loss": 0.7103, "step": 21850 }, { "epoch": 2.0, "grad_norm": 16.25570297241211, "learning_rate": 0.0, "loss": 0.7072, "step": 21900 }, { "epoch": 2.0, "step": 21900, "total_flos": 3.8619551920019866e+17, "train_loss": 0.9270246051109, "train_runtime": 51747.0424, "train_samples_per_second": 3.386, "train_steps_per_second": 0.423 } ], "logging_steps": 50, "max_steps": 21900, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.8619551920019866e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }