{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9851665267282396, "eval_steps": 10, "global_step": 2200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00044780296669465436, "grad_norm": 2.2432782649993896, "learning_rate": 3.7313432835820895e-07, "loss": 1.8132, "step": 1 }, { "epoch": 0.0008956059333893087, "grad_norm": 2.291522264480591, "learning_rate": 7.462686567164179e-07, "loss": 1.8956, "step": 2 }, { "epoch": 0.001343408900083963, "grad_norm": 1.6618109941482544, "learning_rate": 1.119402985074627e-06, "loss": 1.8312, "step": 3 }, { "epoch": 0.0017912118667786174, "grad_norm": 2.319044589996338, "learning_rate": 1.4925373134328358e-06, "loss": 1.8718, "step": 4 }, { "epoch": 0.002239014833473272, "grad_norm": 1.671183705329895, "learning_rate": 1.8656716417910446e-06, "loss": 1.8156, "step": 5 }, { "epoch": 0.002686817800167926, "grad_norm": 1.857389211654663, "learning_rate": 2.238805970149254e-06, "loss": 1.8177, "step": 6 }, { "epoch": 0.0031346207668625807, "grad_norm": 1.590662956237793, "learning_rate": 2.6119402985074627e-06, "loss": 1.8255, "step": 7 }, { "epoch": 0.003582423733557235, "grad_norm": 1.9237611293792725, "learning_rate": 2.9850746268656716e-06, "loss": 1.8032, "step": 8 }, { "epoch": 0.004030226700251889, "grad_norm": 1.6888021230697632, "learning_rate": 3.358208955223881e-06, "loss": 1.7965, "step": 9 }, { "epoch": 0.004478029666946544, "grad_norm": 2.0715696811676025, "learning_rate": 3.7313432835820893e-06, "loss": 1.9659, "step": 10 }, { "epoch": 0.004478029666946544, "eval_loss": 1.819678783416748, "eval_runtime": 1739.3924, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 10 }, { "epoch": 0.0049258326336411975, "grad_norm": 1.545844316482544, "learning_rate": 4.1044776119402985e-06, "loss": 1.8914, "step": 11 }, { "epoch": 0.005373635600335852, "grad_norm": 1.37221360206604, "learning_rate": 4.477611940298508e-06, "loss": 1.7632, "step": 12 }, { "epoch": 0.005821438567030507, "grad_norm": 1.4589499235153198, "learning_rate": 4.850746268656717e-06, "loss": 1.7984, "step": 13 }, { "epoch": 0.006269241533725161, "grad_norm": 1.5319428443908691, "learning_rate": 5.2238805970149255e-06, "loss": 1.9122, "step": 14 }, { "epoch": 0.006717044500419815, "grad_norm": 1.2317570447921753, "learning_rate": 5.597014925373135e-06, "loss": 1.7678, "step": 15 }, { "epoch": 0.00716484746711447, "grad_norm": 1.4415943622589111, "learning_rate": 5.970149253731343e-06, "loss": 1.9129, "step": 16 }, { "epoch": 0.007612650433809124, "grad_norm": 1.0773462057113647, "learning_rate": 6.343283582089552e-06, "loss": 1.4962, "step": 17 }, { "epoch": 0.008060453400503778, "grad_norm": 1.1921244859695435, "learning_rate": 6.716417910447762e-06, "loss": 1.7538, "step": 18 }, { "epoch": 0.008508256367198432, "grad_norm": 1.0547364950180054, "learning_rate": 7.08955223880597e-06, "loss": 1.6643, "step": 19 }, { "epoch": 0.008956059333893087, "grad_norm": 1.065486192703247, "learning_rate": 7.4626865671641785e-06, "loss": 1.5438, "step": 20 }, { "epoch": 0.008956059333893087, "eval_loss": 1.666861891746521, "eval_runtime": 1742.6586, "eval_samples_per_second": 2.563, "eval_steps_per_second": 2.563, "step": 20 }, { "epoch": 0.009403862300587741, "grad_norm": 0.9827540516853333, "learning_rate": 7.835820895522389e-06, "loss": 1.5295, "step": 21 }, { "epoch": 0.009851665267282395, "grad_norm": 0.9350996613502502, "learning_rate": 8.208955223880597e-06, "loss": 1.6664, "step": 22 }, { "epoch": 0.01029946823397705, "grad_norm": 1.0762792825698853, "learning_rate": 8.582089552238805e-06, "loss": 1.6782, "step": 23 }, { "epoch": 0.010747271200671704, "grad_norm": 0.7595136165618896, "learning_rate": 8.955223880597016e-06, "loss": 1.6303, "step": 24 }, { "epoch": 0.01119507416736636, "grad_norm": 0.6962496638298035, "learning_rate": 9.328358208955226e-06, "loss": 1.5204, "step": 25 }, { "epoch": 0.011642877134061013, "grad_norm": 0.6340360641479492, "learning_rate": 9.701492537313434e-06, "loss": 1.4292, "step": 26 }, { "epoch": 0.012090680100755667, "grad_norm": 0.6120555996894836, "learning_rate": 1.0074626865671643e-05, "loss": 1.4588, "step": 27 }, { "epoch": 0.012538483067450323, "grad_norm": 0.6970665454864502, "learning_rate": 1.0447761194029851e-05, "loss": 1.6072, "step": 28 }, { "epoch": 0.012986286034144976, "grad_norm": 0.7495852112770081, "learning_rate": 1.082089552238806e-05, "loss": 1.4976, "step": 29 }, { "epoch": 0.01343408900083963, "grad_norm": 0.5586577653884888, "learning_rate": 1.119402985074627e-05, "loss": 1.5005, "step": 30 }, { "epoch": 0.01343408900083963, "eval_loss": 1.4981863498687744, "eval_runtime": 1737.6969, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 30 }, { "epoch": 0.013881891967534286, "grad_norm": 0.5340927839279175, "learning_rate": 1.1567164179104478e-05, "loss": 1.5658, "step": 31 }, { "epoch": 0.01432969493422894, "grad_norm": 0.5143851637840271, "learning_rate": 1.1940298507462686e-05, "loss": 1.5624, "step": 32 }, { "epoch": 0.014777497900923593, "grad_norm": 0.5012256503105164, "learning_rate": 1.2313432835820896e-05, "loss": 1.2532, "step": 33 }, { "epoch": 0.015225300867618249, "grad_norm": 0.5775434970855713, "learning_rate": 1.2686567164179105e-05, "loss": 1.372, "step": 34 }, { "epoch": 0.0156731038343129, "grad_norm": 0.611817479133606, "learning_rate": 1.3059701492537313e-05, "loss": 1.4631, "step": 35 }, { "epoch": 0.016120906801007556, "grad_norm": 0.629241406917572, "learning_rate": 1.3432835820895523e-05, "loss": 1.4753, "step": 36 }, { "epoch": 0.016568709767702212, "grad_norm": 0.5267626643180847, "learning_rate": 1.3805970149253733e-05, "loss": 1.371, "step": 37 }, { "epoch": 0.017016512734396864, "grad_norm": 0.6829407811164856, "learning_rate": 1.417910447761194e-05, "loss": 1.4017, "step": 38 }, { "epoch": 0.01746431570109152, "grad_norm": 0.6046289205551147, "learning_rate": 1.455223880597015e-05, "loss": 1.3686, "step": 39 }, { "epoch": 0.017912118667786175, "grad_norm": 0.610405683517456, "learning_rate": 1.4925373134328357e-05, "loss": 1.269, "step": 40 }, { "epoch": 0.017912118667786175, "eval_loss": 1.3559142351150513, "eval_runtime": 1741.7432, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 40 }, { "epoch": 0.018359921634480827, "grad_norm": 0.5271261930465698, "learning_rate": 1.529850746268657e-05, "loss": 1.3167, "step": 41 }, { "epoch": 0.018807724601175482, "grad_norm": 0.549258291721344, "learning_rate": 1.5671641791044777e-05, "loss": 1.4023, "step": 42 }, { "epoch": 0.019255527567870138, "grad_norm": 0.5591861605644226, "learning_rate": 1.6044776119402986e-05, "loss": 1.2516, "step": 43 }, { "epoch": 0.01970333053456479, "grad_norm": 0.4712078273296356, "learning_rate": 1.6417910447761194e-05, "loss": 1.4436, "step": 44 }, { "epoch": 0.020151133501259445, "grad_norm": 0.4768025279045105, "learning_rate": 1.6791044776119406e-05, "loss": 1.3387, "step": 45 }, { "epoch": 0.0205989364679541, "grad_norm": 0.5674676299095154, "learning_rate": 1.716417910447761e-05, "loss": 1.1163, "step": 46 }, { "epoch": 0.021046739434648753, "grad_norm": 0.5266850590705872, "learning_rate": 1.7537313432835823e-05, "loss": 1.4061, "step": 47 }, { "epoch": 0.02149454240134341, "grad_norm": 0.5684650540351868, "learning_rate": 1.791044776119403e-05, "loss": 1.3917, "step": 48 }, { "epoch": 0.021942345368038064, "grad_norm": 0.4995711147785187, "learning_rate": 1.828358208955224e-05, "loss": 1.2096, "step": 49 }, { "epoch": 0.02239014833473272, "grad_norm": 0.4433654546737671, "learning_rate": 1.865671641791045e-05, "loss": 1.1679, "step": 50 }, { "epoch": 0.02239014833473272, "eval_loss": 1.2627769708633423, "eval_runtime": 1741.8103, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 50 }, { "epoch": 0.02283795130142737, "grad_norm": 0.49915778636932373, "learning_rate": 1.9029850746268656e-05, "loss": 1.1626, "step": 51 }, { "epoch": 0.023285754268122027, "grad_norm": 0.5690376162528992, "learning_rate": 1.9402985074626868e-05, "loss": 1.2634, "step": 52 }, { "epoch": 0.023733557234816682, "grad_norm": 0.46489912271499634, "learning_rate": 1.9776119402985073e-05, "loss": 1.3055, "step": 53 }, { "epoch": 0.024181360201511334, "grad_norm": 0.46374455094337463, "learning_rate": 2.0149253731343285e-05, "loss": 1.1938, "step": 54 }, { "epoch": 0.02462916316820599, "grad_norm": 0.43488311767578125, "learning_rate": 2.0522388059701493e-05, "loss": 1.2376, "step": 55 }, { "epoch": 0.025076966134900645, "grad_norm": 0.5964159965515137, "learning_rate": 2.0895522388059702e-05, "loss": 1.2211, "step": 56 }, { "epoch": 0.025524769101595297, "grad_norm": 0.3947959244251251, "learning_rate": 2.126865671641791e-05, "loss": 1.1828, "step": 57 }, { "epoch": 0.025972572068289953, "grad_norm": 0.42681577801704407, "learning_rate": 2.164179104477612e-05, "loss": 1.3245, "step": 58 }, { "epoch": 0.02642037503498461, "grad_norm": 0.4505462944507599, "learning_rate": 2.201492537313433e-05, "loss": 1.2595, "step": 59 }, { "epoch": 0.02686817800167926, "grad_norm": 0.44249483942985535, "learning_rate": 2.238805970149254e-05, "loss": 1.3007, "step": 60 }, { "epoch": 0.02686817800167926, "eval_loss": 1.2291107177734375, "eval_runtime": 1742.3595, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 60 }, { "epoch": 0.027315980968373916, "grad_norm": 0.4635663330554962, "learning_rate": 2.2761194029850747e-05, "loss": 1.2106, "step": 61 }, { "epoch": 0.02776378393506857, "grad_norm": 0.4279429614543915, "learning_rate": 2.3134328358208956e-05, "loss": 1.3232, "step": 62 }, { "epoch": 0.028211586901763223, "grad_norm": 0.47193002700805664, "learning_rate": 2.3507462686567168e-05, "loss": 1.1924, "step": 63 }, { "epoch": 0.02865938986845788, "grad_norm": 0.46152573823928833, "learning_rate": 2.3880597014925373e-05, "loss": 1.1613, "step": 64 }, { "epoch": 0.029107192835152534, "grad_norm": 0.41957393288612366, "learning_rate": 2.4253731343283584e-05, "loss": 1.2249, "step": 65 }, { "epoch": 0.029554995801847186, "grad_norm": 0.5162342190742493, "learning_rate": 2.4626865671641793e-05, "loss": 1.2557, "step": 66 }, { "epoch": 0.030002798768541842, "grad_norm": 0.4564678370952606, "learning_rate": 2.5e-05, "loss": 1.0555, "step": 67 }, { "epoch": 0.030450601735236497, "grad_norm": 0.441526859998703, "learning_rate": 2.537313432835821e-05, "loss": 1.1295, "step": 68 }, { "epoch": 0.03089840470193115, "grad_norm": 0.38912713527679443, "learning_rate": 2.574626865671642e-05, "loss": 1.0991, "step": 69 }, { "epoch": 0.0313462076686258, "grad_norm": 0.411045104265213, "learning_rate": 2.6119402985074626e-05, "loss": 1.2926, "step": 70 }, { "epoch": 0.0313462076686258, "eval_loss": 1.207950234413147, "eval_runtime": 1742.1349, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 70 }, { "epoch": 0.03179401063532046, "grad_norm": 0.6341650485992432, "learning_rate": 2.6492537313432835e-05, "loss": 1.2046, "step": 71 }, { "epoch": 0.03224181360201511, "grad_norm": 0.4355488717556, "learning_rate": 2.6865671641791047e-05, "loss": 1.2758, "step": 72 }, { "epoch": 0.032689616568709765, "grad_norm": 0.44128844141960144, "learning_rate": 2.7238805970149255e-05, "loss": 1.35, "step": 73 }, { "epoch": 0.033137419535404423, "grad_norm": 0.43141600489616394, "learning_rate": 2.7611940298507467e-05, "loss": 1.2572, "step": 74 }, { "epoch": 0.033585222502099076, "grad_norm": 0.46453097462654114, "learning_rate": 2.7985074626865672e-05, "loss": 1.1533, "step": 75 }, { "epoch": 0.03403302546879373, "grad_norm": 0.39782240986824036, "learning_rate": 2.835820895522388e-05, "loss": 1.1852, "step": 76 }, { "epoch": 0.034480828435488386, "grad_norm": 0.45064836740493774, "learning_rate": 2.8731343283582092e-05, "loss": 1.079, "step": 77 }, { "epoch": 0.03492863140218304, "grad_norm": 0.4778350591659546, "learning_rate": 2.91044776119403e-05, "loss": 1.3059, "step": 78 }, { "epoch": 0.03537643436887769, "grad_norm": 0.3985428512096405, "learning_rate": 2.9477611940298512e-05, "loss": 1.2544, "step": 79 }, { "epoch": 0.03582423733557235, "grad_norm": 0.43038174510002136, "learning_rate": 2.9850746268656714e-05, "loss": 1.3496, "step": 80 }, { "epoch": 0.03582423733557235, "eval_loss": 1.190480351448059, "eval_runtime": 1741.9424, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 80 }, { "epoch": 0.036272040302267, "grad_norm": 0.46937325596809387, "learning_rate": 3.0223880597014926e-05, "loss": 1.1186, "step": 81 }, { "epoch": 0.036719843268961654, "grad_norm": 0.45396608114242554, "learning_rate": 3.059701492537314e-05, "loss": 1.1812, "step": 82 }, { "epoch": 0.03716764623565631, "grad_norm": 0.4596857726573944, "learning_rate": 3.0970149253731346e-05, "loss": 1.1643, "step": 83 }, { "epoch": 0.037615449202350965, "grad_norm": 0.4687398076057434, "learning_rate": 3.1343283582089554e-05, "loss": 1.1045, "step": 84 }, { "epoch": 0.03806325216904562, "grad_norm": 0.40774309635162354, "learning_rate": 3.171641791044776e-05, "loss": 1.1006, "step": 85 }, { "epoch": 0.038511055135740276, "grad_norm": 0.41461247205734253, "learning_rate": 3.208955223880597e-05, "loss": 1.1628, "step": 86 }, { "epoch": 0.03895885810243493, "grad_norm": 0.5309916138648987, "learning_rate": 3.246268656716418e-05, "loss": 1.1437, "step": 87 }, { "epoch": 0.03940666106912958, "grad_norm": 0.4281834661960602, "learning_rate": 3.283582089552239e-05, "loss": 1.0383, "step": 88 }, { "epoch": 0.03985446403582424, "grad_norm": 0.4634588956832886, "learning_rate": 3.32089552238806e-05, "loss": 1.1905, "step": 89 }, { "epoch": 0.04030226700251889, "grad_norm": 0.48842886090278625, "learning_rate": 3.358208955223881e-05, "loss": 1.1012, "step": 90 }, { "epoch": 0.04030226700251889, "eval_loss": 1.1734172105789185, "eval_runtime": 1742.402, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 90 }, { "epoch": 0.04075006996921354, "grad_norm": 0.4782605767250061, "learning_rate": 3.395522388059701e-05, "loss": 1.2087, "step": 91 }, { "epoch": 0.0411978729359082, "grad_norm": 0.6223856210708618, "learning_rate": 3.432835820895522e-05, "loss": 1.1121, "step": 92 }, { "epoch": 0.041645675902602854, "grad_norm": 0.4409349858760834, "learning_rate": 3.470149253731344e-05, "loss": 1.1133, "step": 93 }, { "epoch": 0.042093478869297506, "grad_norm": 0.5995984077453613, "learning_rate": 3.5074626865671645e-05, "loss": 1.1062, "step": 94 }, { "epoch": 0.042541281835992165, "grad_norm": 0.5431959629058838, "learning_rate": 3.5447761194029854e-05, "loss": 1.0471, "step": 95 }, { "epoch": 0.04298908480268682, "grad_norm": 0.4902937114238739, "learning_rate": 3.582089552238806e-05, "loss": 1.2069, "step": 96 }, { "epoch": 0.043436887769381476, "grad_norm": 0.4660329818725586, "learning_rate": 3.619402985074627e-05, "loss": 1.1259, "step": 97 }, { "epoch": 0.04388469073607613, "grad_norm": 0.5536847114562988, "learning_rate": 3.656716417910448e-05, "loss": 1.2128, "step": 98 }, { "epoch": 0.04433249370277078, "grad_norm": 0.6095202565193176, "learning_rate": 3.694029850746269e-05, "loss": 1.0796, "step": 99 }, { "epoch": 0.04478029666946544, "grad_norm": 0.5868849754333496, "learning_rate": 3.73134328358209e-05, "loss": 1.2881, "step": 100 }, { "epoch": 0.04478029666946544, "eval_loss": 1.152649998664856, "eval_runtime": 1741.8326, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 100 }, { "epoch": 0.04522809963616009, "grad_norm": 0.5384364724159241, "learning_rate": 3.7686567164179104e-05, "loss": 1.1073, "step": 101 }, { "epoch": 0.04567590260285474, "grad_norm": 0.6125591397285461, "learning_rate": 3.805970149253731e-05, "loss": 1.0919, "step": 102 }, { "epoch": 0.0461237055695494, "grad_norm": 0.4984678030014038, "learning_rate": 3.843283582089552e-05, "loss": 1.2438, "step": 103 }, { "epoch": 0.046571508536244054, "grad_norm": 0.4974757432937622, "learning_rate": 3.8805970149253736e-05, "loss": 1.1709, "step": 104 }, { "epoch": 0.047019311502938706, "grad_norm": 0.4727949798107147, "learning_rate": 3.9179104477611945e-05, "loss": 1.2356, "step": 105 }, { "epoch": 0.047467114469633365, "grad_norm": 0.5323635935783386, "learning_rate": 3.9552238805970146e-05, "loss": 1.1092, "step": 106 }, { "epoch": 0.04791491743632802, "grad_norm": 0.4718462824821472, "learning_rate": 3.992537313432836e-05, "loss": 1.1643, "step": 107 }, { "epoch": 0.04836272040302267, "grad_norm": 0.46946635842323303, "learning_rate": 4.029850746268657e-05, "loss": 1.1401, "step": 108 }, { "epoch": 0.04881052336971733, "grad_norm": 0.5624064207077026, "learning_rate": 4.067164179104478e-05, "loss": 1.0968, "step": 109 }, { "epoch": 0.04925832633641198, "grad_norm": 0.5305824875831604, "learning_rate": 4.104477611940299e-05, "loss": 1.1556, "step": 110 }, { "epoch": 0.04925832633641198, "eval_loss": 1.1313787698745728, "eval_runtime": 1741.4825, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 110 }, { "epoch": 0.04970612930310663, "grad_norm": 0.47058171033859253, "learning_rate": 4.1417910447761195e-05, "loss": 1.1151, "step": 111 }, { "epoch": 0.05015393226980129, "grad_norm": 0.6491269469261169, "learning_rate": 4.1791044776119404e-05, "loss": 1.0862, "step": 112 }, { "epoch": 0.05060173523649594, "grad_norm": 0.5460953712463379, "learning_rate": 4.216417910447761e-05, "loss": 1.0627, "step": 113 }, { "epoch": 0.051049538203190595, "grad_norm": 0.5378393530845642, "learning_rate": 4.253731343283582e-05, "loss": 1.2173, "step": 114 }, { "epoch": 0.051497341169885254, "grad_norm": 0.6210533976554871, "learning_rate": 4.2910447761194036e-05, "loss": 1.0156, "step": 115 }, { "epoch": 0.051945144136579906, "grad_norm": 0.4818199574947357, "learning_rate": 4.328358208955224e-05, "loss": 1.1848, "step": 116 }, { "epoch": 0.05239294710327456, "grad_norm": 0.6459422707557678, "learning_rate": 4.3656716417910446e-05, "loss": 1.0064, "step": 117 }, { "epoch": 0.05284075006996922, "grad_norm": 0.5303052067756653, "learning_rate": 4.402985074626866e-05, "loss": 1.1831, "step": 118 }, { "epoch": 0.05328855303666387, "grad_norm": 0.6115245819091797, "learning_rate": 4.440298507462687e-05, "loss": 1.2621, "step": 119 }, { "epoch": 0.05373635600335852, "grad_norm": 0.5204371213912964, "learning_rate": 4.477611940298508e-05, "loss": 1.1908, "step": 120 }, { "epoch": 0.05373635600335852, "eval_loss": 1.114864468574524, "eval_runtime": 1741.2804, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 120 }, { "epoch": 0.05418415897005318, "grad_norm": 0.5293467044830322, "learning_rate": 4.5149253731343286e-05, "loss": 1.2622, "step": 121 }, { "epoch": 0.05463196193674783, "grad_norm": 0.5757855176925659, "learning_rate": 4.5522388059701495e-05, "loss": 0.9621, "step": 122 }, { "epoch": 0.055079764903442484, "grad_norm": 0.558157205581665, "learning_rate": 4.58955223880597e-05, "loss": 1.0987, "step": 123 }, { "epoch": 0.05552756787013714, "grad_norm": 0.6928860545158386, "learning_rate": 4.626865671641791e-05, "loss": 1.0925, "step": 124 }, { "epoch": 0.055975370836831795, "grad_norm": 0.6394439935684204, "learning_rate": 4.664179104477612e-05, "loss": 1.302, "step": 125 }, { "epoch": 0.05642317380352645, "grad_norm": 0.6799196004867554, "learning_rate": 4.7014925373134335e-05, "loss": 0.9764, "step": 126 }, { "epoch": 0.056870976770221106, "grad_norm": 0.5544326901435852, "learning_rate": 4.738805970149254e-05, "loss": 1.0112, "step": 127 }, { "epoch": 0.05731877973691576, "grad_norm": 0.5835520625114441, "learning_rate": 4.7761194029850745e-05, "loss": 1.0672, "step": 128 }, { "epoch": 0.05776658270361041, "grad_norm": 0.5551716089248657, "learning_rate": 4.813432835820896e-05, "loss": 1.3243, "step": 129 }, { "epoch": 0.05821438567030507, "grad_norm": 0.5320628881454468, "learning_rate": 4.850746268656717e-05, "loss": 1.0686, "step": 130 }, { "epoch": 0.05821438567030507, "eval_loss": 1.0987714529037476, "eval_runtime": 1741.4607, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 130 }, { "epoch": 0.05866218863699972, "grad_norm": 0.5460705161094666, "learning_rate": 4.888059701492538e-05, "loss": 1.3141, "step": 131 }, { "epoch": 0.05910999160369437, "grad_norm": 0.5375784039497375, "learning_rate": 4.9253731343283586e-05, "loss": 0.7433, "step": 132 }, { "epoch": 0.05955779457038903, "grad_norm": 0.6023723483085632, "learning_rate": 4.9626865671641794e-05, "loss": 1.0964, "step": 133 }, { "epoch": 0.060005597537083684, "grad_norm": 0.7922960519790649, "learning_rate": 5e-05, "loss": 1.0928, "step": 134 }, { "epoch": 0.060453400503778336, "grad_norm": 0.6330304145812988, "learning_rate": 4.9988457987072945e-05, "loss": 0.9644, "step": 135 }, { "epoch": 0.060901203470472995, "grad_norm": 0.5186926126480103, "learning_rate": 4.9976915974145894e-05, "loss": 1.3319, "step": 136 }, { "epoch": 0.06134900643716765, "grad_norm": 0.9184876084327698, "learning_rate": 4.996537396121884e-05, "loss": 0.8197, "step": 137 }, { "epoch": 0.0617968094038623, "grad_norm": 0.7908456325531006, "learning_rate": 4.9953831948291786e-05, "loss": 1.0536, "step": 138 }, { "epoch": 0.06224461237055696, "grad_norm": 0.6290323138237, "learning_rate": 4.994228993536473e-05, "loss": 1.0967, "step": 139 }, { "epoch": 0.0626924153372516, "grad_norm": 0.7094379663467407, "learning_rate": 4.993074792243768e-05, "loss": 1.1115, "step": 140 }, { "epoch": 0.0626924153372516, "eval_loss": 1.0867949724197388, "eval_runtime": 1740.7431, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 140 }, { "epoch": 0.06314021830394627, "grad_norm": 0.6611003279685974, "learning_rate": 4.991920590951062e-05, "loss": 1.0284, "step": 141 }, { "epoch": 0.06358802127064092, "grad_norm": 0.670655369758606, "learning_rate": 4.990766389658357e-05, "loss": 1.0436, "step": 142 }, { "epoch": 0.06403582423733557, "grad_norm": 0.700566828250885, "learning_rate": 4.989612188365651e-05, "loss": 0.9554, "step": 143 }, { "epoch": 0.06448362720403022, "grad_norm": 0.6415517330169678, "learning_rate": 4.9884579870729456e-05, "loss": 1.0908, "step": 144 }, { "epoch": 0.06493143017072488, "grad_norm": 0.6611580848693848, "learning_rate": 4.98730378578024e-05, "loss": 1.0711, "step": 145 }, { "epoch": 0.06537923313741953, "grad_norm": 0.6099154949188232, "learning_rate": 4.986149584487535e-05, "loss": 1.0865, "step": 146 }, { "epoch": 0.0658270361041142, "grad_norm": 0.6479048132896423, "learning_rate": 4.984995383194829e-05, "loss": 1.2223, "step": 147 }, { "epoch": 0.06627483907080885, "grad_norm": 0.7321466207504272, "learning_rate": 4.983841181902124e-05, "loss": 1.0052, "step": 148 }, { "epoch": 0.0667226420375035, "grad_norm": 0.6522185206413269, "learning_rate": 4.982686980609418e-05, "loss": 1.0421, "step": 149 }, { "epoch": 0.06717044500419815, "grad_norm": 0.633614182472229, "learning_rate": 4.981532779316713e-05, "loss": 1.0914, "step": 150 }, { "epoch": 0.06717044500419815, "eval_loss": 1.0725953578948975, "eval_runtime": 1740.6944, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 150 }, { "epoch": 0.0676182479708928, "grad_norm": 0.7060382962226868, "learning_rate": 4.9803785780240074e-05, "loss": 0.891, "step": 151 }, { "epoch": 0.06806605093758746, "grad_norm": 0.6778922080993652, "learning_rate": 4.9792243767313024e-05, "loss": 0.9576, "step": 152 }, { "epoch": 0.06851385390428212, "grad_norm": 0.54057776927948, "learning_rate": 4.9780701754385966e-05, "loss": 1.0744, "step": 153 }, { "epoch": 0.06896165687097677, "grad_norm": 0.6451223492622375, "learning_rate": 4.9769159741458916e-05, "loss": 1.0027, "step": 154 }, { "epoch": 0.06940945983767143, "grad_norm": 0.6291092038154602, "learning_rate": 4.975761772853186e-05, "loss": 1.2297, "step": 155 }, { "epoch": 0.06985726280436608, "grad_norm": 0.6851316094398499, "learning_rate": 4.974607571560481e-05, "loss": 1.1893, "step": 156 }, { "epoch": 0.07030506577106073, "grad_norm": 0.5764050483703613, "learning_rate": 4.973453370267775e-05, "loss": 1.1657, "step": 157 }, { "epoch": 0.07075286873775538, "grad_norm": 0.5566059350967407, "learning_rate": 4.97229916897507e-05, "loss": 1.0751, "step": 158 }, { "epoch": 0.07120067170445005, "grad_norm": 0.6990348696708679, "learning_rate": 4.971144967682364e-05, "loss": 1.0401, "step": 159 }, { "epoch": 0.0716484746711447, "grad_norm": 0.6392049789428711, "learning_rate": 4.9699907663896585e-05, "loss": 1.1104, "step": 160 }, { "epoch": 0.0716484746711447, "eval_loss": 1.0557206869125366, "eval_runtime": 1740.3422, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 160 }, { "epoch": 0.07209627763783935, "grad_norm": 0.5888108015060425, "learning_rate": 4.9688365650969534e-05, "loss": 1.0651, "step": 161 }, { "epoch": 0.072544080604534, "grad_norm": 0.6952875256538391, "learning_rate": 4.967682363804248e-05, "loss": 1.0902, "step": 162 }, { "epoch": 0.07299188357122866, "grad_norm": 0.6969994902610779, "learning_rate": 4.9665281625115426e-05, "loss": 0.8659, "step": 163 }, { "epoch": 0.07343968653792331, "grad_norm": 0.5647038817405701, "learning_rate": 4.965373961218837e-05, "loss": 1.0399, "step": 164 }, { "epoch": 0.07388748950461797, "grad_norm": 0.5780266523361206, "learning_rate": 4.964219759926131e-05, "loss": 1.2407, "step": 165 }, { "epoch": 0.07433529247131263, "grad_norm": 0.700043797492981, "learning_rate": 4.9630655586334254e-05, "loss": 1.1373, "step": 166 }, { "epoch": 0.07478309543800728, "grad_norm": 0.5190924406051636, "learning_rate": 4.96191135734072e-05, "loss": 1.0578, "step": 167 }, { "epoch": 0.07523089840470193, "grad_norm": 0.5897246599197388, "learning_rate": 4.9607571560480146e-05, "loss": 0.9403, "step": 168 }, { "epoch": 0.07567870137139658, "grad_norm": 0.7634034156799316, "learning_rate": 4.9596029547553095e-05, "loss": 0.9184, "step": 169 }, { "epoch": 0.07612650433809123, "grad_norm": 0.6055589914321899, "learning_rate": 4.958448753462604e-05, "loss": 1.0687, "step": 170 }, { "epoch": 0.07612650433809123, "eval_loss": 1.0446605682373047, "eval_runtime": 1740.2042, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 170 }, { "epoch": 0.0765743073047859, "grad_norm": 0.6777039766311646, "learning_rate": 4.957294552169899e-05, "loss": 1.0545, "step": 171 }, { "epoch": 0.07702211027148055, "grad_norm": 0.602261483669281, "learning_rate": 4.956140350877193e-05, "loss": 0.9887, "step": 172 }, { "epoch": 0.0774699132381752, "grad_norm": 0.6870952248573303, "learning_rate": 4.954986149584488e-05, "loss": 0.767, "step": 173 }, { "epoch": 0.07791771620486986, "grad_norm": 0.6584283709526062, "learning_rate": 4.953831948291782e-05, "loss": 1.0888, "step": 174 }, { "epoch": 0.07836551917156451, "grad_norm": 0.7014456391334534, "learning_rate": 4.952677746999077e-05, "loss": 0.9995, "step": 175 }, { "epoch": 0.07881332213825916, "grad_norm": 0.7976059913635254, "learning_rate": 4.9515235457063714e-05, "loss": 0.907, "step": 176 }, { "epoch": 0.07926112510495383, "grad_norm": 0.6465182900428772, "learning_rate": 4.950369344413666e-05, "loss": 1.0508, "step": 177 }, { "epoch": 0.07970892807164848, "grad_norm": 0.7543056607246399, "learning_rate": 4.9492151431209606e-05, "loss": 1.1151, "step": 178 }, { "epoch": 0.08015673103834313, "grad_norm": 0.729171633720398, "learning_rate": 4.9480609418282555e-05, "loss": 1.0579, "step": 179 }, { "epoch": 0.08060453400503778, "grad_norm": 0.6621066927909851, "learning_rate": 4.94690674053555e-05, "loss": 0.915, "step": 180 }, { "epoch": 0.08060453400503778, "eval_loss": 1.0335291624069214, "eval_runtime": 1740.3768, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 180 }, { "epoch": 0.08105233697173243, "grad_norm": 0.7245395183563232, "learning_rate": 4.945752539242845e-05, "loss": 1.0651, "step": 181 }, { "epoch": 0.08150013993842709, "grad_norm": 0.611437976360321, "learning_rate": 4.944598337950139e-05, "loss": 1.2536, "step": 182 }, { "epoch": 0.08194794290512175, "grad_norm": 0.6172018051147461, "learning_rate": 4.943444136657433e-05, "loss": 1.197, "step": 183 }, { "epoch": 0.0823957458718164, "grad_norm": 0.7766463160514832, "learning_rate": 4.9422899353647275e-05, "loss": 1.1375, "step": 184 }, { "epoch": 0.08284354883851106, "grad_norm": 0.6918166279792786, "learning_rate": 4.9411357340720224e-05, "loss": 0.9037, "step": 185 }, { "epoch": 0.08329135180520571, "grad_norm": 0.5888485908508301, "learning_rate": 4.939981532779317e-05, "loss": 1.0212, "step": 186 }, { "epoch": 0.08373915477190036, "grad_norm": 0.6069585680961609, "learning_rate": 4.938827331486611e-05, "loss": 1.0665, "step": 187 }, { "epoch": 0.08418695773859501, "grad_norm": 0.5719831585884094, "learning_rate": 4.937673130193906e-05, "loss": 1.0863, "step": 188 }, { "epoch": 0.08463476070528968, "grad_norm": 0.6351702213287354, "learning_rate": 4.9365189289012e-05, "loss": 1.0613, "step": 189 }, { "epoch": 0.08508256367198433, "grad_norm": 0.7906411290168762, "learning_rate": 4.935364727608495e-05, "loss": 1.0697, "step": 190 }, { "epoch": 0.08508256367198433, "eval_loss": 1.0170701742172241, "eval_runtime": 1740.2195, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 190 }, { "epoch": 0.08553036663867898, "grad_norm": 0.8668298125267029, "learning_rate": 4.9342105263157894e-05, "loss": 0.9181, "step": 191 }, { "epoch": 0.08597816960537363, "grad_norm": 0.7080042958259583, "learning_rate": 4.933056325023084e-05, "loss": 0.9964, "step": 192 }, { "epoch": 0.08642597257206829, "grad_norm": 0.5576188564300537, "learning_rate": 4.9319021237303786e-05, "loss": 0.8769, "step": 193 }, { "epoch": 0.08687377553876295, "grad_norm": 0.685297966003418, "learning_rate": 4.9307479224376735e-05, "loss": 1.1107, "step": 194 }, { "epoch": 0.0873215785054576, "grad_norm": 0.6759443879127502, "learning_rate": 4.929593721144968e-05, "loss": 1.0284, "step": 195 }, { "epoch": 0.08776938147215226, "grad_norm": 0.7277442216873169, "learning_rate": 4.928439519852263e-05, "loss": 1.0706, "step": 196 }, { "epoch": 0.08821718443884691, "grad_norm": 0.7153092622756958, "learning_rate": 4.927285318559557e-05, "loss": 1.046, "step": 197 }, { "epoch": 0.08866498740554156, "grad_norm": 0.7023780941963196, "learning_rate": 4.926131117266852e-05, "loss": 1.1325, "step": 198 }, { "epoch": 0.08911279037223621, "grad_norm": 0.6779207587242126, "learning_rate": 4.924976915974146e-05, "loss": 0.8576, "step": 199 }, { "epoch": 0.08956059333893088, "grad_norm": 0.630558967590332, "learning_rate": 4.923822714681441e-05, "loss": 0.9907, "step": 200 }, { "epoch": 0.08956059333893088, "eval_loss": 1.006293773651123, "eval_runtime": 1739.5536, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 200 }, { "epoch": 0.09000839630562553, "grad_norm": 0.59002685546875, "learning_rate": 4.9226685133887354e-05, "loss": 0.9343, "step": 201 }, { "epoch": 0.09045619927232018, "grad_norm": 0.728977382183075, "learning_rate": 4.92151431209603e-05, "loss": 0.9973, "step": 202 }, { "epoch": 0.09090400223901483, "grad_norm": 0.6520524621009827, "learning_rate": 4.9203601108033246e-05, "loss": 0.9441, "step": 203 }, { "epoch": 0.09135180520570949, "grad_norm": 0.7074622511863708, "learning_rate": 4.919205909510619e-05, "loss": 0.9559, "step": 204 }, { "epoch": 0.09179960817240414, "grad_norm": 0.7225532531738281, "learning_rate": 4.918051708217913e-05, "loss": 0.9087, "step": 205 }, { "epoch": 0.0922474111390988, "grad_norm": 0.7118654251098633, "learning_rate": 4.916897506925208e-05, "loss": 0.9611, "step": 206 }, { "epoch": 0.09269521410579346, "grad_norm": 0.6199821829795837, "learning_rate": 4.915743305632502e-05, "loss": 0.9679, "step": 207 }, { "epoch": 0.09314301707248811, "grad_norm": 0.6176968812942505, "learning_rate": 4.914589104339797e-05, "loss": 1.0421, "step": 208 }, { "epoch": 0.09359082003918276, "grad_norm": 0.811738133430481, "learning_rate": 4.9134349030470915e-05, "loss": 0.8281, "step": 209 }, { "epoch": 0.09403862300587741, "grad_norm": 0.7109337449073792, "learning_rate": 4.912280701754386e-05, "loss": 1.1078, "step": 210 }, { "epoch": 0.09403862300587741, "eval_loss": 0.996062695980072, "eval_runtime": 1737.4871, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 210 }, { "epoch": 0.09448642597257206, "grad_norm": 0.6960453987121582, "learning_rate": 4.911126500461681e-05, "loss": 0.9872, "step": 211 }, { "epoch": 0.09493422893926673, "grad_norm": 0.7350823879241943, "learning_rate": 4.909972299168975e-05, "loss": 0.8907, "step": 212 }, { "epoch": 0.09538203190596138, "grad_norm": 0.722599983215332, "learning_rate": 4.90881809787627e-05, "loss": 1.0963, "step": 213 }, { "epoch": 0.09582983487265603, "grad_norm": 0.6781536936759949, "learning_rate": 4.907663896583564e-05, "loss": 0.8545, "step": 214 }, { "epoch": 0.09627763783935069, "grad_norm": 0.7184162735939026, "learning_rate": 4.906509695290859e-05, "loss": 0.9652, "step": 215 }, { "epoch": 0.09672544080604534, "grad_norm": 0.8125225305557251, "learning_rate": 4.905355493998153e-05, "loss": 1.0223, "step": 216 }, { "epoch": 0.09717324377273999, "grad_norm": 0.7977463603019714, "learning_rate": 4.904201292705448e-05, "loss": 1.2184, "step": 217 }, { "epoch": 0.09762104673943466, "grad_norm": 0.696886420249939, "learning_rate": 4.9030470914127425e-05, "loss": 0.9601, "step": 218 }, { "epoch": 0.09806884970612931, "grad_norm": 0.6477362513542175, "learning_rate": 4.9018928901200375e-05, "loss": 1.166, "step": 219 }, { "epoch": 0.09851665267282396, "grad_norm": 0.7791169285774231, "learning_rate": 4.900738688827332e-05, "loss": 1.0317, "step": 220 }, { "epoch": 0.09851665267282396, "eval_loss": 0.9866155982017517, "eval_runtime": 1737.5401, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 220 }, { "epoch": 0.09896445563951861, "grad_norm": 0.7042242884635925, "learning_rate": 4.899584487534627e-05, "loss": 1.0636, "step": 221 }, { "epoch": 0.09941225860621326, "grad_norm": 0.6456022262573242, "learning_rate": 4.898430286241921e-05, "loss": 1.0597, "step": 222 }, { "epoch": 0.09986006157290792, "grad_norm": 0.796667754650116, "learning_rate": 4.897276084949215e-05, "loss": 1.0915, "step": 223 }, { "epoch": 0.10030786453960258, "grad_norm": 0.7900235056877136, "learning_rate": 4.89612188365651e-05, "loss": 0.9049, "step": 224 }, { "epoch": 0.10075566750629723, "grad_norm": 0.689604640007019, "learning_rate": 4.8949676823638044e-05, "loss": 0.9269, "step": 225 }, { "epoch": 0.10120347047299189, "grad_norm": 0.8840981125831604, "learning_rate": 4.8938134810710987e-05, "loss": 0.9206, "step": 226 }, { "epoch": 0.10165127343968654, "grad_norm": 0.9159313440322876, "learning_rate": 4.8926592797783936e-05, "loss": 1.0473, "step": 227 }, { "epoch": 0.10209907640638119, "grad_norm": 0.7433324456214905, "learning_rate": 4.891505078485688e-05, "loss": 0.8495, "step": 228 }, { "epoch": 0.10254687937307584, "grad_norm": 0.813946008682251, "learning_rate": 4.890350877192983e-05, "loss": 0.9191, "step": 229 }, { "epoch": 0.10299468233977051, "grad_norm": 0.7069205045700073, "learning_rate": 4.889196675900277e-05, "loss": 0.8153, "step": 230 }, { "epoch": 0.10299468233977051, "eval_loss": 0.9733306765556335, "eval_runtime": 1737.4261, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 230 }, { "epoch": 0.10344248530646516, "grad_norm": 0.7403346300125122, "learning_rate": 4.888042474607572e-05, "loss": 0.8176, "step": 231 }, { "epoch": 0.10389028827315981, "grad_norm": 0.779050886631012, "learning_rate": 4.886888273314866e-05, "loss": 1.1263, "step": 232 }, { "epoch": 0.10433809123985446, "grad_norm": 0.7971842885017395, "learning_rate": 4.885734072022161e-05, "loss": 0.7987, "step": 233 }, { "epoch": 0.10478589420654912, "grad_norm": 0.8085238933563232, "learning_rate": 4.8845798707294554e-05, "loss": 0.8065, "step": 234 }, { "epoch": 0.10523369717324377, "grad_norm": 0.7707525491714478, "learning_rate": 4.88342566943675e-05, "loss": 0.9083, "step": 235 }, { "epoch": 0.10568150013993843, "grad_norm": 0.7241260409355164, "learning_rate": 4.8822714681440446e-05, "loss": 0.9536, "step": 236 }, { "epoch": 0.10612930310663309, "grad_norm": 0.760982871055603, "learning_rate": 4.881117266851339e-05, "loss": 0.9606, "step": 237 }, { "epoch": 0.10657710607332774, "grad_norm": 0.7906852960586548, "learning_rate": 4.879963065558634e-05, "loss": 0.6995, "step": 238 }, { "epoch": 0.10702490904002239, "grad_norm": 0.8570147752761841, "learning_rate": 4.878808864265928e-05, "loss": 1.0152, "step": 239 }, { "epoch": 0.10747271200671704, "grad_norm": 0.7524833679199219, "learning_rate": 4.877654662973223e-05, "loss": 0.9486, "step": 240 }, { "epoch": 0.10747271200671704, "eval_loss": 0.962479829788208, "eval_runtime": 1737.2243, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 240 }, { "epoch": 0.1079205149734117, "grad_norm": 0.6601128578186035, "learning_rate": 4.876500461680517e-05, "loss": 0.8763, "step": 241 }, { "epoch": 0.10836831794010636, "grad_norm": 0.7229903936386108, "learning_rate": 4.875346260387812e-05, "loss": 1.0229, "step": 242 }, { "epoch": 0.10881612090680101, "grad_norm": 0.7775028944015503, "learning_rate": 4.8741920590951065e-05, "loss": 0.9157, "step": 243 }, { "epoch": 0.10926392387349566, "grad_norm": 0.8382840752601624, "learning_rate": 4.873037857802401e-05, "loss": 0.942, "step": 244 }, { "epoch": 0.10971172684019032, "grad_norm": 0.6494114995002747, "learning_rate": 4.871883656509695e-05, "loss": 1.0415, "step": 245 }, { "epoch": 0.11015952980688497, "grad_norm": 0.6916049122810364, "learning_rate": 4.87072945521699e-05, "loss": 0.9285, "step": 246 }, { "epoch": 0.11060733277357962, "grad_norm": 0.7655143737792969, "learning_rate": 4.869575253924284e-05, "loss": 0.8565, "step": 247 }, { "epoch": 0.11105513574027429, "grad_norm": 0.8227872848510742, "learning_rate": 4.868421052631579e-05, "loss": 0.933, "step": 248 }, { "epoch": 0.11150293870696894, "grad_norm": 0.6931448578834534, "learning_rate": 4.8672668513388734e-05, "loss": 0.8383, "step": 249 }, { "epoch": 0.11195074167366359, "grad_norm": 0.8759434819221497, "learning_rate": 4.8661126500461684e-05, "loss": 0.9163, "step": 250 }, { "epoch": 0.11195074167366359, "eval_loss": 0.9523711204528809, "eval_runtime": 1737.1441, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 250 }, { "epoch": 0.11239854464035824, "grad_norm": 0.7037357687950134, "learning_rate": 4.8649584487534626e-05, "loss": 1.0964, "step": 251 }, { "epoch": 0.1128463476070529, "grad_norm": 0.7379258871078491, "learning_rate": 4.8638042474607576e-05, "loss": 1.0948, "step": 252 }, { "epoch": 0.11329415057374755, "grad_norm": 0.6891792416572571, "learning_rate": 4.862650046168052e-05, "loss": 1.0872, "step": 253 }, { "epoch": 0.11374195354044221, "grad_norm": 0.6693180203437805, "learning_rate": 4.861495844875347e-05, "loss": 1.0345, "step": 254 }, { "epoch": 0.11418975650713686, "grad_norm": 0.7131677269935608, "learning_rate": 4.860341643582641e-05, "loss": 0.8185, "step": 255 }, { "epoch": 0.11463755947383152, "grad_norm": 0.6727960109710693, "learning_rate": 4.859187442289936e-05, "loss": 1.047, "step": 256 }, { "epoch": 0.11508536244052617, "grad_norm": 0.7159959673881531, "learning_rate": 4.85803324099723e-05, "loss": 1.0208, "step": 257 }, { "epoch": 0.11553316540722082, "grad_norm": 0.7980951070785522, "learning_rate": 4.856879039704525e-05, "loss": 0.829, "step": 258 }, { "epoch": 0.11598096837391547, "grad_norm": 0.6713117957115173, "learning_rate": 4.8557248384118194e-05, "loss": 0.8269, "step": 259 }, { "epoch": 0.11642877134061014, "grad_norm": 0.8814539313316345, "learning_rate": 4.854570637119114e-05, "loss": 0.8441, "step": 260 }, { "epoch": 0.11642877134061014, "eval_loss": 0.9442258477210999, "eval_runtime": 1737.8062, "eval_samples_per_second": 2.57, "eval_steps_per_second": 2.57, "step": 260 }, { "epoch": 0.11687657430730479, "grad_norm": 0.7765588164329529, "learning_rate": 4.8534164358264086e-05, "loss": 0.9471, "step": 261 }, { "epoch": 0.11732437727399944, "grad_norm": 0.7720988988876343, "learning_rate": 4.852262234533703e-05, "loss": 1.1604, "step": 262 }, { "epoch": 0.1177721802406941, "grad_norm": 0.6977170705795288, "learning_rate": 4.851108033240998e-05, "loss": 0.9949, "step": 263 }, { "epoch": 0.11821998320738875, "grad_norm": 0.7317233681678772, "learning_rate": 4.849953831948292e-05, "loss": 1.0061, "step": 264 }, { "epoch": 0.1186677861740834, "grad_norm": 0.79209303855896, "learning_rate": 4.848799630655586e-05, "loss": 0.8906, "step": 265 }, { "epoch": 0.11911558914077806, "grad_norm": 0.7325365543365479, "learning_rate": 4.8476454293628806e-05, "loss": 0.8362, "step": 266 }, { "epoch": 0.11956339210747272, "grad_norm": 0.7075287699699402, "learning_rate": 4.8464912280701755e-05, "loss": 1.0315, "step": 267 }, { "epoch": 0.12001119507416737, "grad_norm": 0.7248746752738953, "learning_rate": 4.84533702677747e-05, "loss": 1.0958, "step": 268 }, { "epoch": 0.12045899804086202, "grad_norm": 0.8397259712219238, "learning_rate": 4.844182825484765e-05, "loss": 0.8609, "step": 269 }, { "epoch": 0.12090680100755667, "grad_norm": 0.6971864700317383, "learning_rate": 4.843028624192059e-05, "loss": 1.0048, "step": 270 }, { "epoch": 0.12090680100755667, "eval_loss": 0.9325976371765137, "eval_runtime": 1737.3018, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 270 }, { "epoch": 0.12135460397425132, "grad_norm": 0.7825617790222168, "learning_rate": 4.841874422899354e-05, "loss": 1.0109, "step": 271 }, { "epoch": 0.12180240694094599, "grad_norm": 0.7868365049362183, "learning_rate": 4.840720221606648e-05, "loss": 0.8101, "step": 272 }, { "epoch": 0.12225020990764064, "grad_norm": 0.7207172513008118, "learning_rate": 4.839566020313943e-05, "loss": 0.9749, "step": 273 }, { "epoch": 0.1226980128743353, "grad_norm": 0.6478491425514221, "learning_rate": 4.8384118190212374e-05, "loss": 0.9946, "step": 274 }, { "epoch": 0.12314581584102995, "grad_norm": 0.8110407590866089, "learning_rate": 4.837257617728532e-05, "loss": 1.0575, "step": 275 }, { "epoch": 0.1235936188077246, "grad_norm": 0.8174804449081421, "learning_rate": 4.8361034164358266e-05, "loss": 1.1115, "step": 276 }, { "epoch": 0.12404142177441925, "grad_norm": 0.7011256217956543, "learning_rate": 4.8349492151431215e-05, "loss": 1.0387, "step": 277 }, { "epoch": 0.12448922474111392, "grad_norm": 0.736200213432312, "learning_rate": 4.833795013850416e-05, "loss": 0.6846, "step": 278 }, { "epoch": 0.12493702770780857, "grad_norm": 0.7827062606811523, "learning_rate": 4.832640812557711e-05, "loss": 0.8612, "step": 279 }, { "epoch": 0.1253848306745032, "grad_norm": 0.7452975511550903, "learning_rate": 4.831486611265005e-05, "loss": 0.8897, "step": 280 }, { "epoch": 0.1253848306745032, "eval_loss": 0.9226668477058411, "eval_runtime": 1737.6416, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 280 }, { "epoch": 0.12583263364119787, "grad_norm": 0.6109349131584167, "learning_rate": 4.8303324099723e-05, "loss": 0.8418, "step": 281 }, { "epoch": 0.12628043660789254, "grad_norm": 0.6637207269668579, "learning_rate": 4.829178208679594e-05, "loss": 0.949, "step": 282 }, { "epoch": 0.12672823957458718, "grad_norm": 0.8242074251174927, "learning_rate": 4.8280240073868885e-05, "loss": 0.8032, "step": 283 }, { "epoch": 0.12717604254128184, "grad_norm": 0.725492000579834, "learning_rate": 4.826869806094183e-05, "loss": 0.949, "step": 284 }, { "epoch": 0.12762384550797648, "grad_norm": 0.7447363138198853, "learning_rate": 4.8257156048014777e-05, "loss": 0.8985, "step": 285 }, { "epoch": 0.12807164847467115, "grad_norm": 0.8081015348434448, "learning_rate": 4.824561403508772e-05, "loss": 0.7457, "step": 286 }, { "epoch": 0.1285194514413658, "grad_norm": 0.735405683517456, "learning_rate": 4.823407202216066e-05, "loss": 0.9313, "step": 287 }, { "epoch": 0.12896725440806045, "grad_norm": 0.8218609690666199, "learning_rate": 4.822253000923361e-05, "loss": 0.997, "step": 288 }, { "epoch": 0.12941505737475512, "grad_norm": 0.6740204095840454, "learning_rate": 4.8210987996306554e-05, "loss": 1.0128, "step": 289 }, { "epoch": 0.12986286034144975, "grad_norm": 0.9718872904777527, "learning_rate": 4.81994459833795e-05, "loss": 0.9457, "step": 290 }, { "epoch": 0.12986286034144975, "eval_loss": 0.9150700569152832, "eval_runtime": 1737.2884, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 290 }, { "epoch": 0.13031066330814442, "grad_norm": 0.8290822505950928, "learning_rate": 4.8187903970452446e-05, "loss": 1.0297, "step": 291 }, { "epoch": 0.13075846627483906, "grad_norm": 0.9061235189437866, "learning_rate": 4.8176361957525395e-05, "loss": 0.7766, "step": 292 }, { "epoch": 0.13120626924153372, "grad_norm": 0.6609340310096741, "learning_rate": 4.816481994459834e-05, "loss": 0.9471, "step": 293 }, { "epoch": 0.1316540722082284, "grad_norm": 0.7579784989356995, "learning_rate": 4.815327793167129e-05, "loss": 0.9807, "step": 294 }, { "epoch": 0.13210187517492303, "grad_norm": 0.7428559064865112, "learning_rate": 4.814173591874423e-05, "loss": 0.9862, "step": 295 }, { "epoch": 0.1325496781416177, "grad_norm": 0.8760770559310913, "learning_rate": 4.813019390581718e-05, "loss": 0.7563, "step": 296 }, { "epoch": 0.13299748110831233, "grad_norm": 0.7334847450256348, "learning_rate": 4.811865189289012e-05, "loss": 0.9554, "step": 297 }, { "epoch": 0.133445284075007, "grad_norm": 0.8345491886138916, "learning_rate": 4.810710987996307e-05, "loss": 0.9365, "step": 298 }, { "epoch": 0.13389308704170166, "grad_norm": 0.8603995442390442, "learning_rate": 4.8095567867036014e-05, "loss": 0.8086, "step": 299 }, { "epoch": 0.1343408900083963, "grad_norm": 0.7264519333839417, "learning_rate": 4.808402585410896e-05, "loss": 0.9419, "step": 300 }, { "epoch": 0.1343408900083963, "eval_loss": 0.9055705666542053, "eval_runtime": 1737.7291, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 300 }, { "epoch": 0.13478869297509097, "grad_norm": 0.7371543645858765, "learning_rate": 4.8072483841181906e-05, "loss": 0.7665, "step": 301 }, { "epoch": 0.1352364959417856, "grad_norm": 0.6498313546180725, "learning_rate": 4.8060941828254855e-05, "loss": 0.9527, "step": 302 }, { "epoch": 0.13568429890848027, "grad_norm": 0.7187266945838928, "learning_rate": 4.80493998153278e-05, "loss": 0.9354, "step": 303 }, { "epoch": 0.1361321018751749, "grad_norm": 0.6874094605445862, "learning_rate": 4.803785780240074e-05, "loss": 0.9004, "step": 304 }, { "epoch": 0.13657990484186958, "grad_norm": 0.7286795973777771, "learning_rate": 4.802631578947368e-05, "loss": 0.7882, "step": 305 }, { "epoch": 0.13702770780856424, "grad_norm": 0.6951384544372559, "learning_rate": 4.801477377654663e-05, "loss": 0.9683, "step": 306 }, { "epoch": 0.13747551077525888, "grad_norm": 0.7582501769065857, "learning_rate": 4.8003231763619575e-05, "loss": 0.9498, "step": 307 }, { "epoch": 0.13792331374195355, "grad_norm": 0.8873469233512878, "learning_rate": 4.7991689750692524e-05, "loss": 1.0083, "step": 308 }, { "epoch": 0.13837111670864818, "grad_norm": 0.8048168420791626, "learning_rate": 4.798014773776547e-05, "loss": 0.9438, "step": 309 }, { "epoch": 0.13881891967534285, "grad_norm": 0.6849039793014526, "learning_rate": 4.7968605724838416e-05, "loss": 0.9656, "step": 310 }, { "epoch": 0.13881891967534285, "eval_loss": 0.8966045379638672, "eval_runtime": 1736.6349, "eval_samples_per_second": 2.572, "eval_steps_per_second": 2.572, "step": 310 }, { "epoch": 0.13926672264203752, "grad_norm": 0.7566764950752258, "learning_rate": 4.795706371191136e-05, "loss": 0.8435, "step": 311 }, { "epoch": 0.13971452560873215, "grad_norm": 0.8874958157539368, "learning_rate": 4.79455216989843e-05, "loss": 1.028, "step": 312 }, { "epoch": 0.14016232857542682, "grad_norm": 0.830262303352356, "learning_rate": 4.793397968605725e-05, "loss": 0.8655, "step": 313 }, { "epoch": 0.14061013154212146, "grad_norm": 0.8174196481704712, "learning_rate": 4.7922437673130193e-05, "loss": 0.9186, "step": 314 }, { "epoch": 0.14105793450881612, "grad_norm": 0.7265437841415405, "learning_rate": 4.791089566020314e-05, "loss": 1.0262, "step": 315 }, { "epoch": 0.14150573747551076, "grad_norm": 0.7852171063423157, "learning_rate": 4.7899353647276085e-05, "loss": 0.8919, "step": 316 }, { "epoch": 0.14195354044220543, "grad_norm": 0.8389447331428528, "learning_rate": 4.7887811634349035e-05, "loss": 1.0888, "step": 317 }, { "epoch": 0.1424013434089001, "grad_norm": 0.9585886597633362, "learning_rate": 4.787626962142198e-05, "loss": 0.9379, "step": 318 }, { "epoch": 0.14284914637559473, "grad_norm": 0.8709464073181152, "learning_rate": 4.786472760849493e-05, "loss": 0.8661, "step": 319 }, { "epoch": 0.1432969493422894, "grad_norm": 0.7449386715888977, "learning_rate": 4.785318559556787e-05, "loss": 0.9359, "step": 320 }, { "epoch": 0.1432969493422894, "eval_loss": 0.8892737030982971, "eval_runtime": 1737.4213, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 320 }, { "epoch": 0.14374475230898404, "grad_norm": 0.9661244750022888, "learning_rate": 4.784164358264082e-05, "loss": 0.7743, "step": 321 }, { "epoch": 0.1441925552756787, "grad_norm": 0.7624433040618896, "learning_rate": 4.783010156971376e-05, "loss": 1.0915, "step": 322 }, { "epoch": 0.14464035824237337, "grad_norm": 0.8274304270744324, "learning_rate": 4.7818559556786704e-05, "loss": 0.9038, "step": 323 }, { "epoch": 0.145088161209068, "grad_norm": 0.746432900428772, "learning_rate": 4.780701754385965e-05, "loss": 1.0964, "step": 324 }, { "epoch": 0.14553596417576267, "grad_norm": 0.8682849407196045, "learning_rate": 4.7795475530932596e-05, "loss": 0.849, "step": 325 }, { "epoch": 0.1459837671424573, "grad_norm": 0.761936604976654, "learning_rate": 4.778393351800554e-05, "loss": 0.8772, "step": 326 }, { "epoch": 0.14643157010915198, "grad_norm": 0.8509253859519958, "learning_rate": 4.777239150507849e-05, "loss": 1.1404, "step": 327 }, { "epoch": 0.14687937307584661, "grad_norm": 1.0035078525543213, "learning_rate": 4.776084949215143e-05, "loss": 0.8262, "step": 328 }, { "epoch": 0.14732717604254128, "grad_norm": 0.8431288003921509, "learning_rate": 4.774930747922438e-05, "loss": 0.9464, "step": 329 }, { "epoch": 0.14777497900923595, "grad_norm": 0.7923704385757446, "learning_rate": 4.773776546629732e-05, "loss": 1.1387, "step": 330 }, { "epoch": 0.14777497900923595, "eval_loss": 0.8819145560264587, "eval_runtime": 1737.0418, "eval_samples_per_second": 2.572, "eval_steps_per_second": 2.572, "step": 330 }, { "epoch": 0.14822278197593058, "grad_norm": 0.7270142436027527, "learning_rate": 4.772622345337027e-05, "loss": 1.0828, "step": 331 }, { "epoch": 0.14867058494262525, "grad_norm": 0.7770641446113586, "learning_rate": 4.7714681440443215e-05, "loss": 0.875, "step": 332 }, { "epoch": 0.1491183879093199, "grad_norm": 0.7159392833709717, "learning_rate": 4.7703139427516164e-05, "loss": 0.818, "step": 333 }, { "epoch": 0.14956619087601455, "grad_norm": 0.7923138737678528, "learning_rate": 4.7691597414589107e-05, "loss": 1.0655, "step": 334 }, { "epoch": 0.15001399384270922, "grad_norm": 0.7565407156944275, "learning_rate": 4.7680055401662056e-05, "loss": 0.7377, "step": 335 }, { "epoch": 0.15046179680940386, "grad_norm": 0.713832437992096, "learning_rate": 4.7668513388735e-05, "loss": 1.0459, "step": 336 }, { "epoch": 0.15090959977609852, "grad_norm": 0.774344801902771, "learning_rate": 4.765697137580794e-05, "loss": 1.0397, "step": 337 }, { "epoch": 0.15135740274279316, "grad_norm": 0.7079201936721802, "learning_rate": 4.764542936288089e-05, "loss": 0.9431, "step": 338 }, { "epoch": 0.15180520570948783, "grad_norm": 0.8966906070709229, "learning_rate": 4.763388734995383e-05, "loss": 0.8048, "step": 339 }, { "epoch": 0.15225300867618247, "grad_norm": 0.7640495300292969, "learning_rate": 4.762234533702678e-05, "loss": 1.0161, "step": 340 }, { "epoch": 0.15225300867618247, "eval_loss": 0.8745113611221313, "eval_runtime": 1737.1711, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 340 }, { "epoch": 0.15270081164287713, "grad_norm": 0.9535894393920898, "learning_rate": 4.7610803324099725e-05, "loss": 1.1023, "step": 341 }, { "epoch": 0.1531486146095718, "grad_norm": 0.7694385051727295, "learning_rate": 4.7599261311172675e-05, "loss": 1.0471, "step": 342 }, { "epoch": 0.15359641757626644, "grad_norm": 0.7791855931282043, "learning_rate": 4.758771929824562e-05, "loss": 1.0974, "step": 343 }, { "epoch": 0.1540442205429611, "grad_norm": 0.8553569912910461, "learning_rate": 4.757617728531856e-05, "loss": 1.0196, "step": 344 }, { "epoch": 0.15449202350965574, "grad_norm": 0.8217856287956238, "learning_rate": 4.75646352723915e-05, "loss": 0.6761, "step": 345 }, { "epoch": 0.1549398264763504, "grad_norm": 0.8384736776351929, "learning_rate": 4.755309325946445e-05, "loss": 1.0592, "step": 346 }, { "epoch": 0.15538762944304507, "grad_norm": 0.774459958076477, "learning_rate": 4.7541551246537394e-05, "loss": 0.7733, "step": 347 }, { "epoch": 0.1558354324097397, "grad_norm": 0.8029676675796509, "learning_rate": 4.7530009233610344e-05, "loss": 0.9376, "step": 348 }, { "epoch": 0.15628323537643438, "grad_norm": 0.7330321073532104, "learning_rate": 4.7518467220683286e-05, "loss": 0.986, "step": 349 }, { "epoch": 0.15673103834312901, "grad_norm": 0.7578117251396179, "learning_rate": 4.7506925207756236e-05, "loss": 0.7285, "step": 350 }, { "epoch": 0.15673103834312901, "eval_loss": 0.867455005645752, "eval_runtime": 1737.4591, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 350 }, { "epoch": 0.15717884130982368, "grad_norm": 0.9032571911811829, "learning_rate": 4.749538319482918e-05, "loss": 0.933, "step": 351 }, { "epoch": 0.15762664427651832, "grad_norm": 0.9054291844367981, "learning_rate": 4.748384118190213e-05, "loss": 0.7717, "step": 352 }, { "epoch": 0.15807444724321298, "grad_norm": 0.9551945328712463, "learning_rate": 4.747229916897507e-05, "loss": 0.9073, "step": 353 }, { "epoch": 0.15852225020990765, "grad_norm": 0.9689154624938965, "learning_rate": 4.746075715604802e-05, "loss": 0.8513, "step": 354 }, { "epoch": 0.1589700531766023, "grad_norm": 0.7683622241020203, "learning_rate": 4.744921514312096e-05, "loss": 0.9604, "step": 355 }, { "epoch": 0.15941785614329695, "grad_norm": 0.7971686720848083, "learning_rate": 4.743767313019391e-05, "loss": 0.3917, "step": 356 }, { "epoch": 0.1598656591099916, "grad_norm": 0.7324389815330505, "learning_rate": 4.7426131117266854e-05, "loss": 0.9423, "step": 357 }, { "epoch": 0.16031346207668626, "grad_norm": 0.8222924470901489, "learning_rate": 4.7414589104339804e-05, "loss": 0.7726, "step": 358 }, { "epoch": 0.16076126504338092, "grad_norm": 0.7495877742767334, "learning_rate": 4.7403047091412746e-05, "loss": 0.7838, "step": 359 }, { "epoch": 0.16120906801007556, "grad_norm": 0.7453298568725586, "learning_rate": 4.7391505078485696e-05, "loss": 1.0874, "step": 360 }, { "epoch": 0.16120906801007556, "eval_loss": 0.8566550612449646, "eval_runtime": 1737.1905, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 360 }, { "epoch": 0.16165687097677023, "grad_norm": 0.8878437280654907, "learning_rate": 4.737996306555864e-05, "loss": 0.7624, "step": 361 }, { "epoch": 0.16210467394346487, "grad_norm": 0.771511971950531, "learning_rate": 4.736842105263158e-05, "loss": 0.7768, "step": 362 }, { "epoch": 0.16255247691015953, "grad_norm": 0.7700636386871338, "learning_rate": 4.735687903970453e-05, "loss": 0.857, "step": 363 }, { "epoch": 0.16300027987685417, "grad_norm": 0.8953984975814819, "learning_rate": 4.734533702677747e-05, "loss": 0.8731, "step": 364 }, { "epoch": 0.16344808284354884, "grad_norm": 1.056182622909546, "learning_rate": 4.7333795013850415e-05, "loss": 0.7347, "step": 365 }, { "epoch": 0.1638958858102435, "grad_norm": 0.8084350824356079, "learning_rate": 4.732225300092336e-05, "loss": 0.8404, "step": 366 }, { "epoch": 0.16434368877693814, "grad_norm": 0.8122984766960144, "learning_rate": 4.731071098799631e-05, "loss": 0.7705, "step": 367 }, { "epoch": 0.1647914917436328, "grad_norm": 0.810514509677887, "learning_rate": 4.729916897506925e-05, "loss": 0.8933, "step": 368 }, { "epoch": 0.16523929471032744, "grad_norm": 0.8424896597862244, "learning_rate": 4.72876269621422e-05, "loss": 0.6952, "step": 369 }, { "epoch": 0.1656870976770221, "grad_norm": 0.7785488963127136, "learning_rate": 4.727608494921514e-05, "loss": 0.8959, "step": 370 }, { "epoch": 0.1656870976770221, "eval_loss": 0.8474584817886353, "eval_runtime": 1737.3041, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 370 }, { "epoch": 0.16613490064371678, "grad_norm": 0.7459981441497803, "learning_rate": 4.726454293628809e-05, "loss": 0.8296, "step": 371 }, { "epoch": 0.16658270361041141, "grad_norm": 0.8447751998901367, "learning_rate": 4.7253000923361034e-05, "loss": 0.7229, "step": 372 }, { "epoch": 0.16703050657710608, "grad_norm": 0.8873215317726135, "learning_rate": 4.7241458910433983e-05, "loss": 0.9665, "step": 373 }, { "epoch": 0.16747830954380072, "grad_norm": 0.8727468848228455, "learning_rate": 4.7229916897506926e-05, "loss": 0.8588, "step": 374 }, { "epoch": 0.16792611251049538, "grad_norm": 0.8442516326904297, "learning_rate": 4.7218374884579875e-05, "loss": 0.8735, "step": 375 }, { "epoch": 0.16837391547719002, "grad_norm": 0.741875410079956, "learning_rate": 4.720683287165282e-05, "loss": 0.7119, "step": 376 }, { "epoch": 0.1688217184438847, "grad_norm": 0.9967590570449829, "learning_rate": 4.719529085872577e-05, "loss": 0.6724, "step": 377 }, { "epoch": 0.16926952141057935, "grad_norm": 0.8461303114891052, "learning_rate": 4.718374884579871e-05, "loss": 0.8864, "step": 378 }, { "epoch": 0.169717324377274, "grad_norm": 0.8811253309249878, "learning_rate": 4.717220683287166e-05, "loss": 0.8461, "step": 379 }, { "epoch": 0.17016512734396866, "grad_norm": 0.7719868421554565, "learning_rate": 4.71606648199446e-05, "loss": 0.8755, "step": 380 }, { "epoch": 0.17016512734396866, "eval_loss": 0.8378962874412537, "eval_runtime": 1737.6913, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 380 }, { "epoch": 0.1706129303106633, "grad_norm": 0.904470682144165, "learning_rate": 4.714912280701755e-05, "loss": 0.869, "step": 381 }, { "epoch": 0.17106073327735796, "grad_norm": 0.988050639629364, "learning_rate": 4.7137580794090494e-05, "loss": 0.8023, "step": 382 }, { "epoch": 0.17150853624405263, "grad_norm": 0.7240595817565918, "learning_rate": 4.7126038781163437e-05, "loss": 0.6097, "step": 383 }, { "epoch": 0.17195633921074727, "grad_norm": 0.811393141746521, "learning_rate": 4.711449676823638e-05, "loss": 0.7306, "step": 384 }, { "epoch": 0.17240414217744193, "grad_norm": 0.867587149143219, "learning_rate": 4.710295475530933e-05, "loss": 0.7562, "step": 385 }, { "epoch": 0.17285194514413657, "grad_norm": 0.842566728591919, "learning_rate": 4.709141274238227e-05, "loss": 0.5919, "step": 386 }, { "epoch": 0.17329974811083124, "grad_norm": 0.9528062343597412, "learning_rate": 4.7079870729455214e-05, "loss": 0.8223, "step": 387 }, { "epoch": 0.1737475510775259, "grad_norm": 0.7800689935684204, "learning_rate": 4.706832871652816e-05, "loss": 0.7363, "step": 388 }, { "epoch": 0.17419535404422054, "grad_norm": 1.0290626287460327, "learning_rate": 4.7056786703601106e-05, "loss": 0.7701, "step": 389 }, { "epoch": 0.1746431570109152, "grad_norm": 0.7647327780723572, "learning_rate": 4.7045244690674055e-05, "loss": 0.877, "step": 390 }, { "epoch": 0.1746431570109152, "eval_loss": 0.8322415351867676, "eval_runtime": 1737.8875, "eval_samples_per_second": 2.57, "eval_steps_per_second": 2.57, "step": 390 }, { "epoch": 0.17509095997760984, "grad_norm": 1.024598240852356, "learning_rate": 4.7033702677747e-05, "loss": 0.9069, "step": 391 }, { "epoch": 0.1755387629443045, "grad_norm": 0.8539007902145386, "learning_rate": 4.702216066481995e-05, "loss": 0.904, "step": 392 }, { "epoch": 0.17598656591099915, "grad_norm": 0.7349446415901184, "learning_rate": 4.701061865189289e-05, "loss": 0.7279, "step": 393 }, { "epoch": 0.17643436887769381, "grad_norm": 0.9322994947433472, "learning_rate": 4.699907663896584e-05, "loss": 1.0765, "step": 394 }, { "epoch": 0.17688217184438848, "grad_norm": 0.7974610924720764, "learning_rate": 4.698753462603878e-05, "loss": 0.7591, "step": 395 }, { "epoch": 0.17732997481108312, "grad_norm": 0.9433238506317139, "learning_rate": 4.697599261311173e-05, "loss": 0.7526, "step": 396 }, { "epoch": 0.17777777777777778, "grad_norm": 0.8383938074111938, "learning_rate": 4.6964450600184674e-05, "loss": 0.725, "step": 397 }, { "epoch": 0.17822558074447242, "grad_norm": 0.7757347226142883, "learning_rate": 4.695290858725762e-05, "loss": 1.063, "step": 398 }, { "epoch": 0.1786733837111671, "grad_norm": 0.8449235558509827, "learning_rate": 4.6941366574330566e-05, "loss": 0.7043, "step": 399 }, { "epoch": 0.17912118667786175, "grad_norm": 0.7594418525695801, "learning_rate": 4.6929824561403515e-05, "loss": 0.9083, "step": 400 }, { "epoch": 0.17912118667786175, "eval_loss": 0.8224214911460876, "eval_runtime": 1738.3524, "eval_samples_per_second": 2.57, "eval_steps_per_second": 2.57, "step": 400 }, { "epoch": 0.1795689896445564, "grad_norm": 0.7965478301048279, "learning_rate": 4.691828254847646e-05, "loss": 0.9748, "step": 401 }, { "epoch": 0.18001679261125106, "grad_norm": 0.8964141607284546, "learning_rate": 4.690674053554941e-05, "loss": 1.0344, "step": 402 }, { "epoch": 0.1804645955779457, "grad_norm": 0.7425351142883301, "learning_rate": 4.689519852262235e-05, "loss": 0.6737, "step": 403 }, { "epoch": 0.18091239854464036, "grad_norm": 0.7083563208580017, "learning_rate": 4.688365650969529e-05, "loss": 0.8339, "step": 404 }, { "epoch": 0.181360201511335, "grad_norm": 0.7600324153900146, "learning_rate": 4.6872114496768235e-05, "loss": 0.8032, "step": 405 }, { "epoch": 0.18180800447802967, "grad_norm": 0.7678210735321045, "learning_rate": 4.6860572483841184e-05, "loss": 0.8703, "step": 406 }, { "epoch": 0.18225580744472433, "grad_norm": 0.8270692229270935, "learning_rate": 4.684903047091413e-05, "loss": 0.8807, "step": 407 }, { "epoch": 0.18270361041141897, "grad_norm": 0.8562888503074646, "learning_rate": 4.6837488457987076e-05, "loss": 0.9365, "step": 408 }, { "epoch": 0.18315141337811364, "grad_norm": 0.9993149042129517, "learning_rate": 4.682594644506002e-05, "loss": 0.8716, "step": 409 }, { "epoch": 0.18359921634480827, "grad_norm": 0.795988917350769, "learning_rate": 4.681440443213297e-05, "loss": 0.7853, "step": 410 }, { "epoch": 0.18359921634480827, "eval_loss": 0.8142237663269043, "eval_runtime": 1737.637, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 410 }, { "epoch": 0.18404701931150294, "grad_norm": 0.7437682151794434, "learning_rate": 4.680286241920591e-05, "loss": 0.7872, "step": 411 }, { "epoch": 0.1844948222781976, "grad_norm": 0.8209964036941528, "learning_rate": 4.6791320406278854e-05, "loss": 0.7995, "step": 412 }, { "epoch": 0.18494262524489224, "grad_norm": 0.9489561319351196, "learning_rate": 4.67797783933518e-05, "loss": 0.7761, "step": 413 }, { "epoch": 0.1853904282115869, "grad_norm": 0.8502386212348938, "learning_rate": 4.6768236380424746e-05, "loss": 0.9846, "step": 414 }, { "epoch": 0.18583823117828155, "grad_norm": 0.7570764422416687, "learning_rate": 4.6756694367497695e-05, "loss": 1.116, "step": 415 }, { "epoch": 0.18628603414497621, "grad_norm": 0.8194412589073181, "learning_rate": 4.674515235457064e-05, "loss": 0.8261, "step": 416 }, { "epoch": 0.18673383711167085, "grad_norm": 1.0201462507247925, "learning_rate": 4.673361034164359e-05, "loss": 1.0355, "step": 417 }, { "epoch": 0.18718164007836552, "grad_norm": 0.7511212229728699, "learning_rate": 4.672206832871653e-05, "loss": 0.921, "step": 418 }, { "epoch": 0.18762944304506018, "grad_norm": 0.8299571871757507, "learning_rate": 4.671052631578948e-05, "loss": 1.0033, "step": 419 }, { "epoch": 0.18807724601175482, "grad_norm": 0.9280295968055725, "learning_rate": 4.669898430286242e-05, "loss": 0.7556, "step": 420 }, { "epoch": 0.18807724601175482, "eval_loss": 0.8086807727813721, "eval_runtime": 1738.0909, "eval_samples_per_second": 2.57, "eval_steps_per_second": 2.57, "step": 420 }, { "epoch": 0.1885250489784495, "grad_norm": 0.8704283237457275, "learning_rate": 4.668744228993537e-05, "loss": 0.9123, "step": 421 }, { "epoch": 0.18897285194514413, "grad_norm": 1.0498746633529663, "learning_rate": 4.6675900277008313e-05, "loss": 0.6306, "step": 422 }, { "epoch": 0.1894206549118388, "grad_norm": 0.7668930888175964, "learning_rate": 4.6664358264081256e-05, "loss": 1.1142, "step": 423 }, { "epoch": 0.18986845787853346, "grad_norm": 1.0189166069030762, "learning_rate": 4.6652816251154205e-05, "loss": 0.8831, "step": 424 }, { "epoch": 0.1903162608452281, "grad_norm": 0.9827046990394592, "learning_rate": 4.664127423822715e-05, "loss": 0.6772, "step": 425 }, { "epoch": 0.19076406381192276, "grad_norm": 0.8419145941734314, "learning_rate": 4.662973222530009e-05, "loss": 0.8903, "step": 426 }, { "epoch": 0.1912118667786174, "grad_norm": 1.0089313983917236, "learning_rate": 4.661819021237304e-05, "loss": 0.8986, "step": 427 }, { "epoch": 0.19165966974531207, "grad_norm": 0.9125351309776306, "learning_rate": 4.660664819944598e-05, "loss": 0.8019, "step": 428 }, { "epoch": 0.1921074727120067, "grad_norm": 0.9646217823028564, "learning_rate": 4.659510618651893e-05, "loss": 0.6646, "step": 429 }, { "epoch": 0.19255527567870137, "grad_norm": 1.0330066680908203, "learning_rate": 4.6583564173591875e-05, "loss": 0.9278, "step": 430 }, { "epoch": 0.19255527567870137, "eval_loss": 0.7997502684593201, "eval_runtime": 1738.9851, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 430 }, { "epoch": 0.19300307864539604, "grad_norm": 0.7883182168006897, "learning_rate": 4.6572022160664824e-05, "loss": 0.7367, "step": 431 }, { "epoch": 0.19345088161209067, "grad_norm": 0.8542357087135315, "learning_rate": 4.656048014773777e-05, "loss": 0.8034, "step": 432 }, { "epoch": 0.19389868457878534, "grad_norm": 0.8576323390007019, "learning_rate": 4.6548938134810716e-05, "loss": 1.0407, "step": 433 }, { "epoch": 0.19434648754547998, "grad_norm": 0.8395902514457703, "learning_rate": 4.653739612188366e-05, "loss": 0.8512, "step": 434 }, { "epoch": 0.19479429051217464, "grad_norm": 0.8623055815696716, "learning_rate": 4.652585410895661e-05, "loss": 0.5575, "step": 435 }, { "epoch": 0.1952420934788693, "grad_norm": 0.711990237236023, "learning_rate": 4.651431209602955e-05, "loss": 0.9274, "step": 436 }, { "epoch": 0.19568989644556395, "grad_norm": 0.9932701587677002, "learning_rate": 4.650277008310249e-05, "loss": 0.7234, "step": 437 }, { "epoch": 0.19613769941225861, "grad_norm": 0.9657753705978394, "learning_rate": 4.649122807017544e-05, "loss": 0.8719, "step": 438 }, { "epoch": 0.19658550237895325, "grad_norm": 0.8279201984405518, "learning_rate": 4.6479686057248385e-05, "loss": 0.7891, "step": 439 }, { "epoch": 0.19703330534564792, "grad_norm": 0.9909638166427612, "learning_rate": 4.6468144044321335e-05, "loss": 0.6438, "step": 440 }, { "epoch": 0.19703330534564792, "eval_loss": 0.7911050915718079, "eval_runtime": 1739.4085, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 440 }, { "epoch": 0.19748110831234256, "grad_norm": 0.7535269260406494, "learning_rate": 4.645660203139428e-05, "loss": 0.834, "step": 441 }, { "epoch": 0.19792891127903722, "grad_norm": 0.8972837328910828, "learning_rate": 4.6445060018467227e-05, "loss": 0.7625, "step": 442 }, { "epoch": 0.1983767142457319, "grad_norm": 0.8356785774230957, "learning_rate": 4.643351800554017e-05, "loss": 0.8538, "step": 443 }, { "epoch": 0.19882451721242653, "grad_norm": 0.8921677470207214, "learning_rate": 4.642197599261311e-05, "loss": 0.9137, "step": 444 }, { "epoch": 0.1992723201791212, "grad_norm": 0.8165355324745178, "learning_rate": 4.6410433979686054e-05, "loss": 0.8285, "step": 445 }, { "epoch": 0.19972012314581583, "grad_norm": 0.8382999897003174, "learning_rate": 4.6398891966759004e-05, "loss": 1.0426, "step": 446 }, { "epoch": 0.2001679261125105, "grad_norm": 0.7926031351089478, "learning_rate": 4.6387349953831946e-05, "loss": 0.841, "step": 447 }, { "epoch": 0.20061572907920516, "grad_norm": 0.8137739300727844, "learning_rate": 4.6375807940904896e-05, "loss": 0.8253, "step": 448 }, { "epoch": 0.2010635320458998, "grad_norm": 0.8974906802177429, "learning_rate": 4.636426592797784e-05, "loss": 0.7984, "step": 449 }, { "epoch": 0.20151133501259447, "grad_norm": 0.7735929489135742, "learning_rate": 4.635272391505079e-05, "loss": 0.6582, "step": 450 }, { "epoch": 0.20151133501259447, "eval_loss": 0.7823182940483093, "eval_runtime": 1739.8443, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 450 }, { "epoch": 0.2019591379792891, "grad_norm": 0.7329652905464172, "learning_rate": 4.634118190212373e-05, "loss": 0.8273, "step": 451 }, { "epoch": 0.20240694094598377, "grad_norm": 0.7918416261672974, "learning_rate": 4.632963988919668e-05, "loss": 0.8176, "step": 452 }, { "epoch": 0.2028547439126784, "grad_norm": 0.9216796159744263, "learning_rate": 4.631809787626962e-05, "loss": 0.7303, "step": 453 }, { "epoch": 0.20330254687937308, "grad_norm": 1.078320026397705, "learning_rate": 4.630655586334257e-05, "loss": 0.5538, "step": 454 }, { "epoch": 0.20375034984606774, "grad_norm": 0.8843169808387756, "learning_rate": 4.6295013850415514e-05, "loss": 0.7284, "step": 455 }, { "epoch": 0.20419815281276238, "grad_norm": 1.031596302986145, "learning_rate": 4.6283471837488464e-05, "loss": 0.8675, "step": 456 }, { "epoch": 0.20464595577945704, "grad_norm": 1.0795527696609497, "learning_rate": 4.6271929824561406e-05, "loss": 0.8009, "step": 457 }, { "epoch": 0.20509375874615168, "grad_norm": 0.9157360196113586, "learning_rate": 4.6260387811634356e-05, "loss": 0.6112, "step": 458 }, { "epoch": 0.20554156171284635, "grad_norm": 0.8338711857795715, "learning_rate": 4.62488457987073e-05, "loss": 0.8584, "step": 459 }, { "epoch": 0.20598936467954101, "grad_norm": 0.70074063539505, "learning_rate": 4.623730378578025e-05, "loss": 0.882, "step": 460 }, { "epoch": 0.20598936467954101, "eval_loss": 0.7754072546958923, "eval_runtime": 1740.1151, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 460 }, { "epoch": 0.20643716764623565, "grad_norm": 0.8252310156822205, "learning_rate": 4.622576177285319e-05, "loss": 0.8817, "step": 461 }, { "epoch": 0.20688497061293032, "grad_norm": 0.9494065046310425, "learning_rate": 4.621421975992613e-05, "loss": 0.8243, "step": 462 }, { "epoch": 0.20733277357962496, "grad_norm": 0.74778151512146, "learning_rate": 4.620267774699908e-05, "loss": 1.1506, "step": 463 }, { "epoch": 0.20778057654631962, "grad_norm": 0.8175053596496582, "learning_rate": 4.6191135734072025e-05, "loss": 0.7699, "step": 464 }, { "epoch": 0.20822837951301426, "grad_norm": 0.7756770849227905, "learning_rate": 4.617959372114497e-05, "loss": 0.8569, "step": 465 }, { "epoch": 0.20867618247970893, "grad_norm": 0.8187942504882812, "learning_rate": 4.616805170821791e-05, "loss": 0.8084, "step": 466 }, { "epoch": 0.2091239854464036, "grad_norm": 0.7939863204956055, "learning_rate": 4.615650969529086e-05, "loss": 0.5601, "step": 467 }, { "epoch": 0.20957178841309823, "grad_norm": 0.8753362894058228, "learning_rate": 4.61449676823638e-05, "loss": 0.799, "step": 468 }, { "epoch": 0.2100195913797929, "grad_norm": 0.7502310276031494, "learning_rate": 4.613342566943675e-05, "loss": 0.78, "step": 469 }, { "epoch": 0.21046739434648754, "grad_norm": 0.933039665222168, "learning_rate": 4.6121883656509694e-05, "loss": 0.5448, "step": 470 }, { "epoch": 0.21046739434648754, "eval_loss": 0.7699535489082336, "eval_runtime": 1739.9004, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 470 }, { "epoch": 0.2109151973131822, "grad_norm": 1.137333631515503, "learning_rate": 4.6110341643582644e-05, "loss": 0.7728, "step": 471 }, { "epoch": 0.21136300027987687, "grad_norm": 0.9676854610443115, "learning_rate": 4.6098799630655586e-05, "loss": 0.8342, "step": 472 }, { "epoch": 0.2118108032465715, "grad_norm": 0.9194774627685547, "learning_rate": 4.6087257617728535e-05, "loss": 0.8051, "step": 473 }, { "epoch": 0.21225860621326617, "grad_norm": 0.9370580315589905, "learning_rate": 4.607571560480148e-05, "loss": 0.7046, "step": 474 }, { "epoch": 0.2127064091799608, "grad_norm": 0.8880735635757446, "learning_rate": 4.606417359187443e-05, "loss": 0.743, "step": 475 }, { "epoch": 0.21315421214665548, "grad_norm": 0.8558589220046997, "learning_rate": 4.605263157894737e-05, "loss": 0.9362, "step": 476 }, { "epoch": 0.2136020151133501, "grad_norm": 0.8161864876747131, "learning_rate": 4.604108956602032e-05, "loss": 0.7904, "step": 477 }, { "epoch": 0.21404981808004478, "grad_norm": 0.9424650073051453, "learning_rate": 4.602954755309326e-05, "loss": 0.7731, "step": 478 }, { "epoch": 0.21449762104673945, "grad_norm": 0.8773455023765564, "learning_rate": 4.601800554016621e-05, "loss": 0.7889, "step": 479 }, { "epoch": 0.21494542401343408, "grad_norm": 0.8586567640304565, "learning_rate": 4.6006463527239154e-05, "loss": 0.9641, "step": 480 }, { "epoch": 0.21494542401343408, "eval_loss": 0.7605552077293396, "eval_runtime": 1741.1131, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 480 }, { "epoch": 0.21539322698012875, "grad_norm": 0.768221378326416, "learning_rate": 4.5994921514312103e-05, "loss": 0.9106, "step": 481 }, { "epoch": 0.2158410299468234, "grad_norm": 1.0304666757583618, "learning_rate": 4.5983379501385046e-05, "loss": 0.6707, "step": 482 }, { "epoch": 0.21628883291351805, "grad_norm": 0.8881514072418213, "learning_rate": 4.597183748845799e-05, "loss": 0.8598, "step": 483 }, { "epoch": 0.21673663588021272, "grad_norm": 0.907448410987854, "learning_rate": 4.596029547553093e-05, "loss": 0.7609, "step": 484 }, { "epoch": 0.21718443884690736, "grad_norm": 0.9400318264961243, "learning_rate": 4.594875346260388e-05, "loss": 0.8799, "step": 485 }, { "epoch": 0.21763224181360202, "grad_norm": 1.0344361066818237, "learning_rate": 4.593721144967682e-05, "loss": 0.8069, "step": 486 }, { "epoch": 0.21808004478029666, "grad_norm": 0.8733990788459778, "learning_rate": 4.5925669436749766e-05, "loss": 0.668, "step": 487 }, { "epoch": 0.21852784774699133, "grad_norm": 0.7444555759429932, "learning_rate": 4.5914127423822715e-05, "loss": 0.6643, "step": 488 }, { "epoch": 0.21897565071368597, "grad_norm": 0.8918318152427673, "learning_rate": 4.590258541089566e-05, "loss": 0.656, "step": 489 }, { "epoch": 0.21942345368038063, "grad_norm": 0.9341442584991455, "learning_rate": 4.589104339796861e-05, "loss": 0.7254, "step": 490 }, { "epoch": 0.21942345368038063, "eval_loss": 0.7527384757995605, "eval_runtime": 1741.6165, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 490 }, { "epoch": 0.2198712566470753, "grad_norm": 0.9269483089447021, "learning_rate": 4.587950138504155e-05, "loss": 0.6856, "step": 491 }, { "epoch": 0.22031905961376994, "grad_norm": 0.9737778306007385, "learning_rate": 4.58679593721145e-05, "loss": 0.6296, "step": 492 }, { "epoch": 0.2207668625804646, "grad_norm": 1.1320279836654663, "learning_rate": 4.585641735918744e-05, "loss": 0.9568, "step": 493 }, { "epoch": 0.22121466554715924, "grad_norm": 0.9716588854789734, "learning_rate": 4.584487534626039e-05, "loss": 0.9283, "step": 494 }, { "epoch": 0.2216624685138539, "grad_norm": 0.9201183915138245, "learning_rate": 4.5833333333333334e-05, "loss": 0.5564, "step": 495 }, { "epoch": 0.22211027148054857, "grad_norm": 0.956973671913147, "learning_rate": 4.582179132040628e-05, "loss": 0.7652, "step": 496 }, { "epoch": 0.2225580744472432, "grad_norm": 0.9087381958961487, "learning_rate": 4.5810249307479226e-05, "loss": 0.5507, "step": 497 }, { "epoch": 0.22300587741393788, "grad_norm": 0.8070569038391113, "learning_rate": 4.5798707294552175e-05, "loss": 0.7761, "step": 498 }, { "epoch": 0.2234536803806325, "grad_norm": 0.9219637513160706, "learning_rate": 4.578716528162512e-05, "loss": 0.8153, "step": 499 }, { "epoch": 0.22390148334732718, "grad_norm": 0.921072781085968, "learning_rate": 4.577562326869807e-05, "loss": 0.616, "step": 500 }, { "epoch": 0.22390148334732718, "eval_loss": 0.7446861863136292, "eval_runtime": 1731.4556, "eval_samples_per_second": 2.58, "eval_steps_per_second": 2.58, "step": 500 }, { "epoch": 0.22434928631402182, "grad_norm": 0.8725545406341553, "learning_rate": 4.576408125577101e-05, "loss": 0.6828, "step": 501 }, { "epoch": 0.22479708928071648, "grad_norm": 1.0268959999084473, "learning_rate": 4.575253924284396e-05, "loss": 0.6286, "step": 502 }, { "epoch": 0.22524489224741115, "grad_norm": 0.8673756718635559, "learning_rate": 4.57409972299169e-05, "loss": 0.7195, "step": 503 }, { "epoch": 0.2256926952141058, "grad_norm": 0.7618908286094666, "learning_rate": 4.5729455216989844e-05, "loss": 0.8099, "step": 504 }, { "epoch": 0.22614049818080045, "grad_norm": 0.9637864828109741, "learning_rate": 4.571791320406279e-05, "loss": 0.5663, "step": 505 }, { "epoch": 0.2265883011474951, "grad_norm": 0.8402149081230164, "learning_rate": 4.5706371191135736e-05, "loss": 0.8883, "step": 506 }, { "epoch": 0.22703610411418976, "grad_norm": 1.0460970401763916, "learning_rate": 4.569482917820868e-05, "loss": 0.7933, "step": 507 }, { "epoch": 0.22748390708088442, "grad_norm": 1.0066616535186768, "learning_rate": 4.568328716528163e-05, "loss": 0.6868, "step": 508 }, { "epoch": 0.22793171004757906, "grad_norm": 0.933358371257782, "learning_rate": 4.567174515235457e-05, "loss": 0.6533, "step": 509 }, { "epoch": 0.22837951301427373, "grad_norm": 0.9774124622344971, "learning_rate": 4.566020313942752e-05, "loss": 0.5852, "step": 510 }, { "epoch": 0.22837951301427373, "eval_loss": 0.73744797706604, "eval_runtime": 1735.3738, "eval_samples_per_second": 2.574, "eval_steps_per_second": 2.574, "step": 510 }, { "epoch": 0.22882731598096837, "grad_norm": 0.8680145740509033, "learning_rate": 4.564866112650046e-05, "loss": 0.8256, "step": 511 }, { "epoch": 0.22927511894766303, "grad_norm": 0.9629591107368469, "learning_rate": 4.5637119113573406e-05, "loss": 0.786, "step": 512 }, { "epoch": 0.22972292191435767, "grad_norm": 0.8956183791160583, "learning_rate": 4.5625577100646355e-05, "loss": 0.6659, "step": 513 }, { "epoch": 0.23017072488105234, "grad_norm": 0.9961169958114624, "learning_rate": 4.56140350877193e-05, "loss": 0.8279, "step": 514 }, { "epoch": 0.230618527847747, "grad_norm": 0.9245657920837402, "learning_rate": 4.560249307479225e-05, "loss": 0.5056, "step": 515 }, { "epoch": 0.23106633081444164, "grad_norm": 1.0266315937042236, "learning_rate": 4.559095106186519e-05, "loss": 0.839, "step": 516 }, { "epoch": 0.2315141337811363, "grad_norm": 1.0880862474441528, "learning_rate": 4.557940904893814e-05, "loss": 0.7539, "step": 517 }, { "epoch": 0.23196193674783094, "grad_norm": 0.894558846950531, "learning_rate": 4.556786703601108e-05, "loss": 0.7628, "step": 518 }, { "epoch": 0.2324097397145256, "grad_norm": 0.9117352366447449, "learning_rate": 4.555632502308403e-05, "loss": 0.6321, "step": 519 }, { "epoch": 0.23285754268122028, "grad_norm": 0.7979990839958191, "learning_rate": 4.5544783010156974e-05, "loss": 0.9759, "step": 520 }, { "epoch": 0.23285754268122028, "eval_loss": 0.731434166431427, "eval_runtime": 1732.8361, "eval_samples_per_second": 2.578, "eval_steps_per_second": 2.578, "step": 520 }, { "epoch": 0.2333053456479149, "grad_norm": 1.0542393922805786, "learning_rate": 4.553324099722992e-05, "loss": 0.4714, "step": 521 }, { "epoch": 0.23375314861460958, "grad_norm": 1.0756099224090576, "learning_rate": 4.5521698984302866e-05, "loss": 0.6873, "step": 522 }, { "epoch": 0.23420095158130422, "grad_norm": 1.0123642683029175, "learning_rate": 4.551015697137581e-05, "loss": 0.6545, "step": 523 }, { "epoch": 0.23464875454799888, "grad_norm": 1.0468355417251587, "learning_rate": 4.549861495844876e-05, "loss": 0.7371, "step": 524 }, { "epoch": 0.23509655751469352, "grad_norm": 0.91986483335495, "learning_rate": 4.54870729455217e-05, "loss": 0.6783, "step": 525 }, { "epoch": 0.2355443604813882, "grad_norm": 0.8713955283164978, "learning_rate": 4.547553093259464e-05, "loss": 0.7152, "step": 526 }, { "epoch": 0.23599216344808285, "grad_norm": 1.010161280632019, "learning_rate": 4.546398891966759e-05, "loss": 0.7729, "step": 527 }, { "epoch": 0.2364399664147775, "grad_norm": 0.9573583602905273, "learning_rate": 4.5452446906740535e-05, "loss": 0.6187, "step": 528 }, { "epoch": 0.23688776938147216, "grad_norm": 0.9014054536819458, "learning_rate": 4.5440904893813484e-05, "loss": 0.6841, "step": 529 }, { "epoch": 0.2373355723481668, "grad_norm": 0.7979456782341003, "learning_rate": 4.542936288088643e-05, "loss": 0.6041, "step": 530 }, { "epoch": 0.2373355723481668, "eval_loss": 0.7225666046142578, "eval_runtime": 1730.4142, "eval_samples_per_second": 2.581, "eval_steps_per_second": 2.581, "step": 530 }, { "epoch": 0.23778337531486146, "grad_norm": 0.9193875193595886, "learning_rate": 4.5417820867959376e-05, "loss": 0.5635, "step": 531 }, { "epoch": 0.23823117828155613, "grad_norm": 0.9741266369819641, "learning_rate": 4.540627885503232e-05, "loss": 0.6157, "step": 532 }, { "epoch": 0.23867898124825077, "grad_norm": 0.9680760502815247, "learning_rate": 4.539473684210527e-05, "loss": 0.6919, "step": 533 }, { "epoch": 0.23912678421494543, "grad_norm": 0.9001390337944031, "learning_rate": 4.538319482917821e-05, "loss": 0.6982, "step": 534 }, { "epoch": 0.23957458718164007, "grad_norm": 0.921018660068512, "learning_rate": 4.537165281625116e-05, "loss": 0.5572, "step": 535 }, { "epoch": 0.24002239014833474, "grad_norm": 0.833634078502655, "learning_rate": 4.53601108033241e-05, "loss": 0.8303, "step": 536 }, { "epoch": 0.24047019311502937, "grad_norm": 1.0180853605270386, "learning_rate": 4.5348568790397045e-05, "loss": 0.6601, "step": 537 }, { "epoch": 0.24091799608172404, "grad_norm": 0.8896169066429138, "learning_rate": 4.5337026777469995e-05, "loss": 0.7986, "step": 538 }, { "epoch": 0.2413657990484187, "grad_norm": 1.0047816038131714, "learning_rate": 4.532548476454294e-05, "loss": 0.71, "step": 539 }, { "epoch": 0.24181360201511334, "grad_norm": 0.9067107439041138, "learning_rate": 4.531394275161589e-05, "loss": 0.9264, "step": 540 }, { "epoch": 0.24181360201511334, "eval_loss": 0.7173718214035034, "eval_runtime": 1732.183, "eval_samples_per_second": 2.579, "eval_steps_per_second": 2.579, "step": 540 }, { "epoch": 0.242261404981808, "grad_norm": 0.9225302338600159, "learning_rate": 4.530240073868883e-05, "loss": 0.7265, "step": 541 }, { "epoch": 0.24270920794850265, "grad_norm": 1.0222615003585815, "learning_rate": 4.529085872576178e-05, "loss": 0.6444, "step": 542 }, { "epoch": 0.2431570109151973, "grad_norm": 0.8889837861061096, "learning_rate": 4.527931671283472e-05, "loss": 0.6853, "step": 543 }, { "epoch": 0.24360481388189198, "grad_norm": 0.9772764444351196, "learning_rate": 4.5267774699907664e-05, "loss": 0.5919, "step": 544 }, { "epoch": 0.24405261684858662, "grad_norm": 0.794398307800293, "learning_rate": 4.5256232686980606e-05, "loss": 0.9762, "step": 545 }, { "epoch": 0.24450041981528128, "grad_norm": 0.8529706597328186, "learning_rate": 4.5244690674053556e-05, "loss": 1.0409, "step": 546 }, { "epoch": 0.24494822278197592, "grad_norm": 0.9057673811912537, "learning_rate": 4.52331486611265e-05, "loss": 0.6886, "step": 547 }, { "epoch": 0.2453960257486706, "grad_norm": 0.9492160677909851, "learning_rate": 4.522160664819945e-05, "loss": 0.8578, "step": 548 }, { "epoch": 0.24584382871536523, "grad_norm": 1.1789013147354126, "learning_rate": 4.521006463527239e-05, "loss": 0.7631, "step": 549 }, { "epoch": 0.2462916316820599, "grad_norm": 1.0071775913238525, "learning_rate": 4.519852262234534e-05, "loss": 0.6618, "step": 550 }, { "epoch": 0.2462916316820599, "eval_loss": 0.7097620368003845, "eval_runtime": 1735.094, "eval_samples_per_second": 2.575, "eval_steps_per_second": 2.575, "step": 550 }, { "epoch": 0.24673943464875456, "grad_norm": 0.8603365421295166, "learning_rate": 4.518698060941828e-05, "loss": 0.5254, "step": 551 }, { "epoch": 0.2471872376154492, "grad_norm": 1.0445563793182373, "learning_rate": 4.517543859649123e-05, "loss": 0.7521, "step": 552 }, { "epoch": 0.24763504058214386, "grad_norm": 0.8923339247703552, "learning_rate": 4.5163896583564174e-05, "loss": 0.5268, "step": 553 }, { "epoch": 0.2480828435488385, "grad_norm": 0.9316530227661133, "learning_rate": 4.5152354570637124e-05, "loss": 0.5405, "step": 554 }, { "epoch": 0.24853064651553317, "grad_norm": 0.9488258361816406, "learning_rate": 4.5140812557710066e-05, "loss": 0.6399, "step": 555 }, { "epoch": 0.24897844948222783, "grad_norm": 0.9134349226951599, "learning_rate": 4.5129270544783016e-05, "loss": 0.7562, "step": 556 }, { "epoch": 0.24942625244892247, "grad_norm": 0.8964117169380188, "learning_rate": 4.511772853185596e-05, "loss": 0.6248, "step": 557 }, { "epoch": 0.24987405541561714, "grad_norm": 0.8586876392364502, "learning_rate": 4.510618651892891e-05, "loss": 0.7153, "step": 558 }, { "epoch": 0.2503218583823118, "grad_norm": 0.9428457021713257, "learning_rate": 4.509464450600185e-05, "loss": 0.8549, "step": 559 }, { "epoch": 0.2507696613490064, "grad_norm": 0.8758842945098877, "learning_rate": 4.50831024930748e-05, "loss": 0.7171, "step": 560 }, { "epoch": 0.2507696613490064, "eval_loss": 0.703180193901062, "eval_runtime": 1745.1552, "eval_samples_per_second": 2.56, "eval_steps_per_second": 2.56, "step": 560 }, { "epoch": 0.2512174643157011, "grad_norm": 0.9924363493919373, "learning_rate": 4.507156048014774e-05, "loss": 0.3454, "step": 561 }, { "epoch": 0.25166526728239574, "grad_norm": 0.8703386187553406, "learning_rate": 4.5060018467220685e-05, "loss": 0.7339, "step": 562 }, { "epoch": 0.2521130702490904, "grad_norm": 0.9063810110092163, "learning_rate": 4.5048476454293634e-05, "loss": 0.5657, "step": 563 }, { "epoch": 0.2525608732157851, "grad_norm": 0.9477468132972717, "learning_rate": 4.503693444136658e-05, "loss": 0.8622, "step": 564 }, { "epoch": 0.2530086761824797, "grad_norm": 0.8709380030632019, "learning_rate": 4.502539242843952e-05, "loss": 0.7344, "step": 565 }, { "epoch": 0.25345647914917435, "grad_norm": 1.0082261562347412, "learning_rate": 4.501385041551246e-05, "loss": 0.8182, "step": 566 }, { "epoch": 0.253904282115869, "grad_norm": 0.891644299030304, "learning_rate": 4.500230840258541e-05, "loss": 0.6383, "step": 567 }, { "epoch": 0.2543520850825637, "grad_norm": 0.9304959774017334, "learning_rate": 4.4990766389658354e-05, "loss": 0.7813, "step": 568 }, { "epoch": 0.25479988804925835, "grad_norm": 0.9278199672698975, "learning_rate": 4.4979224376731304e-05, "loss": 0.5809, "step": 569 }, { "epoch": 0.25524769101595296, "grad_norm": 0.9517792463302612, "learning_rate": 4.4967682363804246e-05, "loss": 0.5944, "step": 570 }, { "epoch": 0.25524769101595296, "eval_loss": 0.6951916813850403, "eval_runtime": 1748.0579, "eval_samples_per_second": 2.555, "eval_steps_per_second": 2.555, "step": 570 }, { "epoch": 0.2556954939826476, "grad_norm": 0.9592674374580383, "learning_rate": 4.4956140350877196e-05, "loss": 0.5948, "step": 571 }, { "epoch": 0.2561432969493423, "grad_norm": 0.7364266514778137, "learning_rate": 4.494459833795014e-05, "loss": 0.6486, "step": 572 }, { "epoch": 0.25659109991603696, "grad_norm": 0.8261834383010864, "learning_rate": 4.493305632502309e-05, "loss": 0.55, "step": 573 }, { "epoch": 0.2570389028827316, "grad_norm": 0.8784179091453552, "learning_rate": 4.492151431209603e-05, "loss": 0.7123, "step": 574 }, { "epoch": 0.25748670584942623, "grad_norm": 0.930644154548645, "learning_rate": 4.490997229916898e-05, "loss": 0.7046, "step": 575 }, { "epoch": 0.2579345088161209, "grad_norm": 1.0528159141540527, "learning_rate": 4.489843028624192e-05, "loss": 0.6414, "step": 576 }, { "epoch": 0.25838231178281557, "grad_norm": 0.9306286573410034, "learning_rate": 4.488688827331487e-05, "loss": 0.6506, "step": 577 }, { "epoch": 0.25883011474951023, "grad_norm": 1.1190401315689087, "learning_rate": 4.4875346260387814e-05, "loss": 0.505, "step": 578 }, { "epoch": 0.2592779177162049, "grad_norm": 0.9627339839935303, "learning_rate": 4.4863804247460764e-05, "loss": 0.4562, "step": 579 }, { "epoch": 0.2597257206828995, "grad_norm": 1.0335285663604736, "learning_rate": 4.4852262234533706e-05, "loss": 0.6464, "step": 580 }, { "epoch": 0.2597257206828995, "eval_loss": 0.689763605594635, "eval_runtime": 1738.6553, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 580 }, { "epoch": 0.2601735236495942, "grad_norm": 1.1265842914581299, "learning_rate": 4.4840720221606656e-05, "loss": 0.5609, "step": 581 }, { "epoch": 0.26062132661628884, "grad_norm": 1.0893819332122803, "learning_rate": 4.48291782086796e-05, "loss": 0.8671, "step": 582 }, { "epoch": 0.2610691295829835, "grad_norm": 0.9546157717704773, "learning_rate": 4.481763619575254e-05, "loss": 0.7458, "step": 583 }, { "epoch": 0.2615169325496781, "grad_norm": 0.9704911112785339, "learning_rate": 4.480609418282548e-05, "loss": 0.8723, "step": 584 }, { "epoch": 0.2619647355163728, "grad_norm": 1.0225448608398438, "learning_rate": 4.479455216989843e-05, "loss": 0.8101, "step": 585 }, { "epoch": 0.26241253848306745, "grad_norm": 0.9812183976173401, "learning_rate": 4.4783010156971375e-05, "loss": 0.7722, "step": 586 }, { "epoch": 0.2628603414497621, "grad_norm": 0.8933651447296143, "learning_rate": 4.4771468144044325e-05, "loss": 0.6636, "step": 587 }, { "epoch": 0.2633081444164568, "grad_norm": 0.8690710067749023, "learning_rate": 4.475992613111727e-05, "loss": 0.6494, "step": 588 }, { "epoch": 0.2637559473831514, "grad_norm": 0.9920389652252197, "learning_rate": 4.474838411819021e-05, "loss": 0.3813, "step": 589 }, { "epoch": 0.26420375034984606, "grad_norm": 0.9435338973999023, "learning_rate": 4.473684210526316e-05, "loss": 0.5487, "step": 590 }, { "epoch": 0.26420375034984606, "eval_loss": 0.6839437484741211, "eval_runtime": 1734.3464, "eval_samples_per_second": 2.576, "eval_steps_per_second": 2.576, "step": 590 }, { "epoch": 0.2646515533165407, "grad_norm": 0.7974052429199219, "learning_rate": 4.47253000923361e-05, "loss": 0.6446, "step": 591 }, { "epoch": 0.2650993562832354, "grad_norm": 0.9143849015235901, "learning_rate": 4.471375807940905e-05, "loss": 0.5984, "step": 592 }, { "epoch": 0.26554715924993005, "grad_norm": 0.9549746513366699, "learning_rate": 4.4702216066481994e-05, "loss": 0.6503, "step": 593 }, { "epoch": 0.26599496221662466, "grad_norm": 1.10163414478302, "learning_rate": 4.469067405355494e-05, "loss": 0.5042, "step": 594 }, { "epoch": 0.26644276518331933, "grad_norm": 1.0820388793945312, "learning_rate": 4.4679132040627886e-05, "loss": 0.5365, "step": 595 }, { "epoch": 0.266890568150014, "grad_norm": 1.3410756587982178, "learning_rate": 4.4667590027700835e-05, "loss": 0.5666, "step": 596 }, { "epoch": 0.26733837111670866, "grad_norm": 1.1864416599273682, "learning_rate": 4.465604801477378e-05, "loss": 0.6475, "step": 597 }, { "epoch": 0.2677861740834033, "grad_norm": 1.0692484378814697, "learning_rate": 4.464450600184673e-05, "loss": 0.6315, "step": 598 }, { "epoch": 0.26823397705009794, "grad_norm": 1.0647058486938477, "learning_rate": 4.463296398891967e-05, "loss": 0.7084, "step": 599 }, { "epoch": 0.2686817800167926, "grad_norm": 0.9759196043014526, "learning_rate": 4.462142197599262e-05, "loss": 0.8468, "step": 600 }, { "epoch": 0.2686817800167926, "eval_loss": 0.6730876564979553, "eval_runtime": 1733.1552, "eval_samples_per_second": 2.577, "eval_steps_per_second": 2.577, "step": 600 }, { "epoch": 0.26912958298348727, "grad_norm": 0.8030557632446289, "learning_rate": 4.460987996306556e-05, "loss": 0.4763, "step": 601 }, { "epoch": 0.26957738595018194, "grad_norm": 1.0184237957000732, "learning_rate": 4.459833795013851e-05, "loss": 0.5457, "step": 602 }, { "epoch": 0.2700251889168766, "grad_norm": 1.0043987035751343, "learning_rate": 4.4586795937211454e-05, "loss": 0.7558, "step": 603 }, { "epoch": 0.2704729918835712, "grad_norm": 1.0344120264053345, "learning_rate": 4.4575253924284396e-05, "loss": 0.6735, "step": 604 }, { "epoch": 0.2709207948502659, "grad_norm": 0.8846050500869751, "learning_rate": 4.456371191135734e-05, "loss": 0.5962, "step": 605 }, { "epoch": 0.27136859781696054, "grad_norm": 0.9319285750389099, "learning_rate": 4.455216989843029e-05, "loss": 0.6983, "step": 606 }, { "epoch": 0.2718164007836552, "grad_norm": 0.878506600856781, "learning_rate": 4.454062788550323e-05, "loss": 0.7427, "step": 607 }, { "epoch": 0.2722642037503498, "grad_norm": 1.0764738321304321, "learning_rate": 4.452908587257618e-05, "loss": 0.5658, "step": 608 }, { "epoch": 0.2727120067170445, "grad_norm": 0.9462092518806458, "learning_rate": 4.451754385964912e-05, "loss": 0.6126, "step": 609 }, { "epoch": 0.27315980968373915, "grad_norm": 0.9421838521957397, "learning_rate": 4.450600184672207e-05, "loss": 0.8937, "step": 610 }, { "epoch": 0.27315980968373915, "eval_loss": 0.6644588112831116, "eval_runtime": 1730.9171, "eval_samples_per_second": 2.581, "eval_steps_per_second": 2.581, "step": 610 }, { "epoch": 0.2736076126504338, "grad_norm": 1.1748048067092896, "learning_rate": 4.4494459833795015e-05, "loss": 0.5851, "step": 611 }, { "epoch": 0.2740554156171285, "grad_norm": 0.882234513759613, "learning_rate": 4.4482917820867964e-05, "loss": 0.7101, "step": 612 }, { "epoch": 0.2745032185838231, "grad_norm": 0.9383102059364319, "learning_rate": 4.447137580794091e-05, "loss": 0.6291, "step": 613 }, { "epoch": 0.27495102155051776, "grad_norm": 1.1847293376922607, "learning_rate": 4.445983379501385e-05, "loss": 0.6358, "step": 614 }, { "epoch": 0.2753988245172124, "grad_norm": 1.3309704065322876, "learning_rate": 4.44482917820868e-05, "loss": 0.5924, "step": 615 }, { "epoch": 0.2758466274839071, "grad_norm": 0.8071926236152649, "learning_rate": 4.443674976915974e-05, "loss": 0.6001, "step": 616 }, { "epoch": 0.27629443045060176, "grad_norm": 0.928219199180603, "learning_rate": 4.442520775623269e-05, "loss": 0.6982, "step": 617 }, { "epoch": 0.27674223341729637, "grad_norm": 0.9898912310600281, "learning_rate": 4.4413665743305634e-05, "loss": 0.6513, "step": 618 }, { "epoch": 0.27719003638399103, "grad_norm": 1.0699115991592407, "learning_rate": 4.440212373037858e-05, "loss": 0.5662, "step": 619 }, { "epoch": 0.2776378393506857, "grad_norm": 0.9030242562294006, "learning_rate": 4.4390581717451526e-05, "loss": 0.6892, "step": 620 }, { "epoch": 0.2776378393506857, "eval_loss": 0.6576302647590637, "eval_runtime": 1732.4607, "eval_samples_per_second": 2.578, "eval_steps_per_second": 2.578, "step": 620 }, { "epoch": 0.27808564231738037, "grad_norm": 1.031383752822876, "learning_rate": 4.4379039704524475e-05, "loss": 0.4274, "step": 621 }, { "epoch": 0.27853344528407503, "grad_norm": 0.8698911070823669, "learning_rate": 4.436749769159742e-05, "loss": 0.6138, "step": 622 }, { "epoch": 0.27898124825076964, "grad_norm": 1.0288652181625366, "learning_rate": 4.435595567867036e-05, "loss": 0.6111, "step": 623 }, { "epoch": 0.2794290512174643, "grad_norm": 1.21888267993927, "learning_rate": 4.434441366574331e-05, "loss": 0.5836, "step": 624 }, { "epoch": 0.279876854184159, "grad_norm": 0.9868746399879456, "learning_rate": 4.433287165281625e-05, "loss": 0.665, "step": 625 }, { "epoch": 0.28032465715085364, "grad_norm": 1.1043651103973389, "learning_rate": 4.4321329639889195e-05, "loss": 0.6688, "step": 626 }, { "epoch": 0.2807724601175483, "grad_norm": 0.9997096657752991, "learning_rate": 4.4309787626962144e-05, "loss": 0.6676, "step": 627 }, { "epoch": 0.2812202630842429, "grad_norm": 0.8945668339729309, "learning_rate": 4.429824561403509e-05, "loss": 0.5091, "step": 628 }, { "epoch": 0.2816680660509376, "grad_norm": 0.9280533790588379, "learning_rate": 4.4286703601108036e-05, "loss": 0.3896, "step": 629 }, { "epoch": 0.28211586901763225, "grad_norm": 0.9969098567962646, "learning_rate": 4.427516158818098e-05, "loss": 0.674, "step": 630 }, { "epoch": 0.28211586901763225, "eval_loss": 0.6506349444389343, "eval_runtime": 1732.7002, "eval_samples_per_second": 2.578, "eval_steps_per_second": 2.578, "step": 630 }, { "epoch": 0.2825636719843269, "grad_norm": 0.9867515563964844, "learning_rate": 4.426361957525393e-05, "loss": 0.5049, "step": 631 }, { "epoch": 0.2830114749510215, "grad_norm": 0.9162868857383728, "learning_rate": 4.425207756232687e-05, "loss": 0.8855, "step": 632 }, { "epoch": 0.2834592779177162, "grad_norm": 1.0479141473770142, "learning_rate": 4.424053554939982e-05, "loss": 0.686, "step": 633 }, { "epoch": 0.28390708088441086, "grad_norm": 0.9793229699134827, "learning_rate": 4.422899353647276e-05, "loss": 0.8075, "step": 634 }, { "epoch": 0.2843548838511055, "grad_norm": 1.1976258754730225, "learning_rate": 4.421745152354571e-05, "loss": 0.7487, "step": 635 }, { "epoch": 0.2848026868178002, "grad_norm": 1.13443922996521, "learning_rate": 4.4205909510618655e-05, "loss": 0.7534, "step": 636 }, { "epoch": 0.2852504897844948, "grad_norm": 1.0198792219161987, "learning_rate": 4.4194367497691604e-05, "loss": 0.6477, "step": 637 }, { "epoch": 0.28569829275118946, "grad_norm": 1.0110708475112915, "learning_rate": 4.418282548476455e-05, "loss": 0.8262, "step": 638 }, { "epoch": 0.28614609571788413, "grad_norm": 0.9380364418029785, "learning_rate": 4.417128347183749e-05, "loss": 0.9128, "step": 639 }, { "epoch": 0.2865938986845788, "grad_norm": 0.9336382746696472, "learning_rate": 4.415974145891044e-05, "loss": 0.8021, "step": 640 }, { "epoch": 0.2865938986845788, "eval_loss": 0.6476196646690369, "eval_runtime": 1733.8978, "eval_samples_per_second": 2.576, "eval_steps_per_second": 2.576, "step": 640 }, { "epoch": 0.28704170165127346, "grad_norm": 1.0600725412368774, "learning_rate": 4.414819944598338e-05, "loss": 0.4586, "step": 641 }, { "epoch": 0.2874895046179681, "grad_norm": 1.0352274179458618, "learning_rate": 4.413665743305633e-05, "loss": 0.5248, "step": 642 }, { "epoch": 0.28793730758466274, "grad_norm": 0.9235619306564331, "learning_rate": 4.412511542012927e-05, "loss": 0.6809, "step": 643 }, { "epoch": 0.2883851105513574, "grad_norm": 1.0097838640213013, "learning_rate": 4.4113573407202216e-05, "loss": 0.6336, "step": 644 }, { "epoch": 0.28883291351805207, "grad_norm": 0.9870620965957642, "learning_rate": 4.410203139427516e-05, "loss": 0.6283, "step": 645 }, { "epoch": 0.28928071648474674, "grad_norm": 0.9528816938400269, "learning_rate": 4.409048938134811e-05, "loss": 0.5438, "step": 646 }, { "epoch": 0.28972851945144135, "grad_norm": 1.0778117179870605, "learning_rate": 4.407894736842105e-05, "loss": 0.6447, "step": 647 }, { "epoch": 0.290176322418136, "grad_norm": 1.1421031951904297, "learning_rate": 4.4067405355494e-05, "loss": 0.8675, "step": 648 }, { "epoch": 0.2906241253848307, "grad_norm": 1.1636042594909668, "learning_rate": 4.405586334256694e-05, "loss": 0.6389, "step": 649 }, { "epoch": 0.29107192835152534, "grad_norm": 0.926356315612793, "learning_rate": 4.404432132963989e-05, "loss": 0.4951, "step": 650 }, { "epoch": 0.29107192835152534, "eval_loss": 0.6369218230247498, "eval_runtime": 1733.9976, "eval_samples_per_second": 2.576, "eval_steps_per_second": 2.576, "step": 650 }, { "epoch": 0.29151973131822, "grad_norm": 0.835151731967926, "learning_rate": 4.4032779316712835e-05, "loss": 0.6196, "step": 651 }, { "epoch": 0.2919675342849146, "grad_norm": 0.8978967666625977, "learning_rate": 4.4021237303785784e-05, "loss": 0.8643, "step": 652 }, { "epoch": 0.2924153372516093, "grad_norm": 0.956881582736969, "learning_rate": 4.4009695290858727e-05, "loss": 0.5167, "step": 653 }, { "epoch": 0.29286314021830395, "grad_norm": 0.8161943554878235, "learning_rate": 4.3998153277931676e-05, "loss": 0.3244, "step": 654 }, { "epoch": 0.2933109431849986, "grad_norm": 1.175564169883728, "learning_rate": 4.398661126500462e-05, "loss": 0.6833, "step": 655 }, { "epoch": 0.29375874615169323, "grad_norm": 0.8921207189559937, "learning_rate": 4.397506925207757e-05, "loss": 0.5833, "step": 656 }, { "epoch": 0.2942065491183879, "grad_norm": 1.0308955907821655, "learning_rate": 4.396352723915051e-05, "loss": 0.6401, "step": 657 }, { "epoch": 0.29465435208508256, "grad_norm": 0.8860639333724976, "learning_rate": 4.395198522622346e-05, "loss": 0.6095, "step": 658 }, { "epoch": 0.2951021550517772, "grad_norm": 1.0290175676345825, "learning_rate": 4.39404432132964e-05, "loss": 0.8198, "step": 659 }, { "epoch": 0.2955499580184719, "grad_norm": 1.1758856773376465, "learning_rate": 4.392890120036935e-05, "loss": 0.5188, "step": 660 }, { "epoch": 0.2955499580184719, "eval_loss": 0.6303642988204956, "eval_runtime": 1735.5608, "eval_samples_per_second": 2.574, "eval_steps_per_second": 2.574, "step": 660 }, { "epoch": 0.2959977609851665, "grad_norm": 0.9396849274635315, "learning_rate": 4.3917359187442294e-05, "loss": 0.5156, "step": 661 }, { "epoch": 0.29644556395186117, "grad_norm": 1.129699468612671, "learning_rate": 4.390581717451524e-05, "loss": 0.6017, "step": 662 }, { "epoch": 0.29689336691855583, "grad_norm": 1.2421824932098389, "learning_rate": 4.3894275161588186e-05, "loss": 0.6726, "step": 663 }, { "epoch": 0.2973411698852505, "grad_norm": 0.9329867959022522, "learning_rate": 4.388273314866113e-05, "loss": 0.5167, "step": 664 }, { "epoch": 0.29778897285194517, "grad_norm": 1.0112805366516113, "learning_rate": 4.387119113573407e-05, "loss": 0.704, "step": 665 }, { "epoch": 0.2982367758186398, "grad_norm": 1.0268175601959229, "learning_rate": 4.3859649122807014e-05, "loss": 0.6813, "step": 666 }, { "epoch": 0.29868457878533444, "grad_norm": 0.8663952350616455, "learning_rate": 4.3848107109879964e-05, "loss": 0.7125, "step": 667 }, { "epoch": 0.2991323817520291, "grad_norm": 1.1380077600479126, "learning_rate": 4.3836565096952906e-05, "loss": 0.5561, "step": 668 }, { "epoch": 0.2995801847187238, "grad_norm": 1.1226226091384888, "learning_rate": 4.3825023084025856e-05, "loss": 0.6075, "step": 669 }, { "epoch": 0.30002798768541844, "grad_norm": 1.0129913091659546, "learning_rate": 4.38134810710988e-05, "loss": 0.6707, "step": 670 }, { "epoch": 0.30002798768541844, "eval_loss": 0.6253139972686768, "eval_runtime": 1735.9274, "eval_samples_per_second": 2.573, "eval_steps_per_second": 2.573, "step": 670 }, { "epoch": 0.30047579065211305, "grad_norm": 0.9254921674728394, "learning_rate": 4.380193905817175e-05, "loss": 0.5443, "step": 671 }, { "epoch": 0.3009235936188077, "grad_norm": 0.9654009938240051, "learning_rate": 4.379039704524469e-05, "loss": 0.6952, "step": 672 }, { "epoch": 0.3013713965855024, "grad_norm": 1.0288201570510864, "learning_rate": 4.377885503231764e-05, "loss": 0.5304, "step": 673 }, { "epoch": 0.30181919955219705, "grad_norm": 1.2195161581039429, "learning_rate": 4.376731301939058e-05, "loss": 0.7748, "step": 674 }, { "epoch": 0.3022670025188917, "grad_norm": 1.2199338674545288, "learning_rate": 4.375577100646353e-05, "loss": 0.3766, "step": 675 }, { "epoch": 0.3027148054855863, "grad_norm": 1.010966181755066, "learning_rate": 4.3744228993536474e-05, "loss": 0.8081, "step": 676 }, { "epoch": 0.303162608452281, "grad_norm": 1.1240795850753784, "learning_rate": 4.3732686980609424e-05, "loss": 0.8757, "step": 677 }, { "epoch": 0.30361041141897566, "grad_norm": 1.2343254089355469, "learning_rate": 4.3721144967682366e-05, "loss": 0.7133, "step": 678 }, { "epoch": 0.3040582143856703, "grad_norm": 0.9523788690567017, "learning_rate": 4.3709602954755316e-05, "loss": 0.6402, "step": 679 }, { "epoch": 0.30450601735236493, "grad_norm": 0.9624511003494263, "learning_rate": 4.369806094182826e-05, "loss": 0.7798, "step": 680 }, { "epoch": 0.30450601735236493, "eval_loss": 0.620413601398468, "eval_runtime": 1735.0028, "eval_samples_per_second": 2.575, "eval_steps_per_second": 2.575, "step": 680 }, { "epoch": 0.3049538203190596, "grad_norm": 1.0228697061538696, "learning_rate": 4.368651892890121e-05, "loss": 0.47, "step": 681 }, { "epoch": 0.30540162328575426, "grad_norm": 0.9654502272605896, "learning_rate": 4.367497691597415e-05, "loss": 0.597, "step": 682 }, { "epoch": 0.30584942625244893, "grad_norm": 1.1050530672073364, "learning_rate": 4.366343490304709e-05, "loss": 0.6511, "step": 683 }, { "epoch": 0.3062972292191436, "grad_norm": 0.8982037901878357, "learning_rate": 4.3651892890120035e-05, "loss": 0.4284, "step": 684 }, { "epoch": 0.3067450321858382, "grad_norm": 1.0873932838439941, "learning_rate": 4.3640350877192985e-05, "loss": 0.7394, "step": 685 }, { "epoch": 0.3071928351525329, "grad_norm": 1.0383609533309937, "learning_rate": 4.362880886426593e-05, "loss": 0.5046, "step": 686 }, { "epoch": 0.30764063811922754, "grad_norm": 1.1227695941925049, "learning_rate": 4.361726685133888e-05, "loss": 0.34, "step": 687 }, { "epoch": 0.3080884410859222, "grad_norm": 0.9949764013290405, "learning_rate": 4.360572483841182e-05, "loss": 0.4576, "step": 688 }, { "epoch": 0.30853624405261687, "grad_norm": 1.1606419086456299, "learning_rate": 4.359418282548476e-05, "loss": 0.5196, "step": 689 }, { "epoch": 0.3089840470193115, "grad_norm": 1.3154551982879639, "learning_rate": 4.358264081255771e-05, "loss": 0.5968, "step": 690 }, { "epoch": 0.3089840470193115, "eval_loss": 0.6166278123855591, "eval_runtime": 1736.0293, "eval_samples_per_second": 2.573, "eval_steps_per_second": 2.573, "step": 690 }, { "epoch": 0.30943184998600615, "grad_norm": 1.2704520225524902, "learning_rate": 4.3571098799630654e-05, "loss": 0.668, "step": 691 }, { "epoch": 0.3098796529527008, "grad_norm": 1.2325021028518677, "learning_rate": 4.3559556786703603e-05, "loss": 0.4863, "step": 692 }, { "epoch": 0.3103274559193955, "grad_norm": 1.1729905605316162, "learning_rate": 4.3548014773776546e-05, "loss": 0.7218, "step": 693 }, { "epoch": 0.31077525888609014, "grad_norm": 1.1204662322998047, "learning_rate": 4.3536472760849495e-05, "loss": 0.9501, "step": 694 }, { "epoch": 0.31122306185278475, "grad_norm": 1.0278434753417969, "learning_rate": 4.352493074792244e-05, "loss": 0.5807, "step": 695 }, { "epoch": 0.3116708648194794, "grad_norm": 0.9404197335243225, "learning_rate": 4.351338873499539e-05, "loss": 0.4889, "step": 696 }, { "epoch": 0.3121186677861741, "grad_norm": 1.0532269477844238, "learning_rate": 4.350184672206833e-05, "loss": 0.5541, "step": 697 }, { "epoch": 0.31256647075286875, "grad_norm": 1.015628695487976, "learning_rate": 4.349030470914128e-05, "loss": 0.5046, "step": 698 }, { "epoch": 0.3130142737195634, "grad_norm": 0.9070887565612793, "learning_rate": 4.347876269621422e-05, "loss": 0.6346, "step": 699 }, { "epoch": 0.31346207668625803, "grad_norm": 1.0546003580093384, "learning_rate": 4.346722068328717e-05, "loss": 0.4904, "step": 700 }, { "epoch": 0.31346207668625803, "eval_loss": 0.6045358777046204, "eval_runtime": 1734.5753, "eval_samples_per_second": 2.575, "eval_steps_per_second": 2.575, "step": 700 }, { "epoch": 0.3139098796529527, "grad_norm": 1.2544265985488892, "learning_rate": 4.3455678670360114e-05, "loss": 0.6624, "step": 701 }, { "epoch": 0.31435768261964736, "grad_norm": 0.9051249027252197, "learning_rate": 4.3444136657433057e-05, "loss": 0.5293, "step": 702 }, { "epoch": 0.314805485586342, "grad_norm": 1.036263108253479, "learning_rate": 4.3432594644506006e-05, "loss": 0.7179, "step": 703 }, { "epoch": 0.31525328855303664, "grad_norm": 1.0076425075531006, "learning_rate": 4.342105263157895e-05, "loss": 0.643, "step": 704 }, { "epoch": 0.3157010915197313, "grad_norm": 1.236649751663208, "learning_rate": 4.340951061865189e-05, "loss": 0.8407, "step": 705 }, { "epoch": 0.31614889448642597, "grad_norm": 1.3177474737167358, "learning_rate": 4.339796860572484e-05, "loss": 0.6383, "step": 706 }, { "epoch": 0.31659669745312063, "grad_norm": 1.1313387155532837, "learning_rate": 4.338642659279778e-05, "loss": 0.7257, "step": 707 }, { "epoch": 0.3170445004198153, "grad_norm": 1.0173510313034058, "learning_rate": 4.337488457987073e-05, "loss": 0.4731, "step": 708 }, { "epoch": 0.3174923033865099, "grad_norm": 1.1304107904434204, "learning_rate": 4.3363342566943675e-05, "loss": 0.5356, "step": 709 }, { "epoch": 0.3179401063532046, "grad_norm": 0.9915953278541565, "learning_rate": 4.3351800554016625e-05, "loss": 0.5619, "step": 710 }, { "epoch": 0.3179401063532046, "eval_loss": 0.598295271396637, "eval_runtime": 1735.247, "eval_samples_per_second": 2.574, "eval_steps_per_second": 2.574, "step": 710 }, { "epoch": 0.31838790931989924, "grad_norm": 1.1005866527557373, "learning_rate": 4.334025854108957e-05, "loss": 0.839, "step": 711 }, { "epoch": 0.3188357122865939, "grad_norm": 0.9614087343215942, "learning_rate": 4.3328716528162517e-05, "loss": 0.6166, "step": 712 }, { "epoch": 0.3192835152532886, "grad_norm": 1.0046594142913818, "learning_rate": 4.331717451523546e-05, "loss": 0.4292, "step": 713 }, { "epoch": 0.3197313182199832, "grad_norm": 1.0307554006576538, "learning_rate": 4.33056325023084e-05, "loss": 0.7486, "step": 714 }, { "epoch": 0.32017912118667785, "grad_norm": 1.0028913021087646, "learning_rate": 4.329409048938135e-05, "loss": 0.5175, "step": 715 }, { "epoch": 0.3206269241533725, "grad_norm": 1.2011581659317017, "learning_rate": 4.3282548476454294e-05, "loss": 0.5116, "step": 716 }, { "epoch": 0.3210747271200672, "grad_norm": 0.9902753829956055, "learning_rate": 4.327100646352724e-05, "loss": 0.3947, "step": 717 }, { "epoch": 0.32152253008676185, "grad_norm": 0.945852518081665, "learning_rate": 4.3259464450600186e-05, "loss": 0.6836, "step": 718 }, { "epoch": 0.32197033305345646, "grad_norm": 1.0151649713516235, "learning_rate": 4.3247922437673135e-05, "loss": 0.6962, "step": 719 }, { "epoch": 0.3224181360201511, "grad_norm": 1.1140705347061157, "learning_rate": 4.323638042474608e-05, "loss": 0.3008, "step": 720 }, { "epoch": 0.3224181360201511, "eval_loss": 0.5958486199378967, "eval_runtime": 1737.2477, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 720 }, { "epoch": 0.3228659389868458, "grad_norm": 1.09420907497406, "learning_rate": 4.322483841181903e-05, "loss": 0.6773, "step": 721 }, { "epoch": 0.32331374195354046, "grad_norm": 1.2701865434646606, "learning_rate": 4.321329639889197e-05, "loss": 0.6063, "step": 722 }, { "epoch": 0.3237615449202351, "grad_norm": 1.0487083196640015, "learning_rate": 4.320175438596491e-05, "loss": 0.6196, "step": 723 }, { "epoch": 0.32420934788692973, "grad_norm": 1.1093167066574097, "learning_rate": 4.3190212373037855e-05, "loss": 0.5419, "step": 724 }, { "epoch": 0.3246571508536244, "grad_norm": 1.1824114322662354, "learning_rate": 4.3178670360110804e-05, "loss": 0.6709, "step": 725 }, { "epoch": 0.32510495382031906, "grad_norm": 1.077880859375, "learning_rate": 4.316712834718375e-05, "loss": 0.6136, "step": 726 }, { "epoch": 0.32555275678701373, "grad_norm": 0.8415115475654602, "learning_rate": 4.3155586334256696e-05, "loss": 0.5034, "step": 727 }, { "epoch": 0.32600055975370834, "grad_norm": 1.0477930307388306, "learning_rate": 4.314404432132964e-05, "loss": 0.5437, "step": 728 }, { "epoch": 0.326448362720403, "grad_norm": 1.1078599691390991, "learning_rate": 4.313250230840259e-05, "loss": 0.648, "step": 729 }, { "epoch": 0.3268961656870977, "grad_norm": 0.946194589138031, "learning_rate": 4.312096029547553e-05, "loss": 0.5502, "step": 730 }, { "epoch": 0.3268961656870977, "eval_loss": 0.5906120538711548, "eval_runtime": 1737.2839, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 730 }, { "epoch": 0.32734396865379234, "grad_norm": 0.9358011484146118, "learning_rate": 4.310941828254848e-05, "loss": 0.5899, "step": 731 }, { "epoch": 0.327791771620487, "grad_norm": 1.104750633239746, "learning_rate": 4.309787626962142e-05, "loss": 0.6237, "step": 732 }, { "epoch": 0.3282395745871816, "grad_norm": 0.8731098771095276, "learning_rate": 4.308633425669437e-05, "loss": 0.5086, "step": 733 }, { "epoch": 0.3286873775538763, "grad_norm": 1.0071966648101807, "learning_rate": 4.3074792243767315e-05, "loss": 0.6828, "step": 734 }, { "epoch": 0.32913518052057095, "grad_norm": 1.203137993812561, "learning_rate": 4.3063250230840264e-05, "loss": 0.4755, "step": 735 }, { "epoch": 0.3295829834872656, "grad_norm": 0.9339362382888794, "learning_rate": 4.305170821791321e-05, "loss": 0.5706, "step": 736 }, { "epoch": 0.3300307864539603, "grad_norm": 1.1789636611938477, "learning_rate": 4.3040166204986156e-05, "loss": 0.8102, "step": 737 }, { "epoch": 0.3304785894206549, "grad_norm": 1.0512614250183105, "learning_rate": 4.30286241920591e-05, "loss": 0.3716, "step": 738 }, { "epoch": 0.33092639238734955, "grad_norm": 1.079956293106079, "learning_rate": 4.301708217913204e-05, "loss": 0.4777, "step": 739 }, { "epoch": 0.3313741953540442, "grad_norm": 0.9875457882881165, "learning_rate": 4.300554016620499e-05, "loss": 0.5379, "step": 740 }, { "epoch": 0.3313741953540442, "eval_loss": 0.5849510431289673, "eval_runtime": 1736.9612, "eval_samples_per_second": 2.572, "eval_steps_per_second": 2.572, "step": 740 }, { "epoch": 0.3318219983207389, "grad_norm": 1.170372486114502, "learning_rate": 4.2993998153277933e-05, "loss": 0.6948, "step": 741 }, { "epoch": 0.33226980128743355, "grad_norm": 1.150221347808838, "learning_rate": 4.298245614035088e-05, "loss": 0.6632, "step": 742 }, { "epoch": 0.33271760425412816, "grad_norm": 0.9060766696929932, "learning_rate": 4.2970914127423825e-05, "loss": 0.4876, "step": 743 }, { "epoch": 0.33316540722082283, "grad_norm": 1.0560237169265747, "learning_rate": 4.295937211449677e-05, "loss": 0.4003, "step": 744 }, { "epoch": 0.3336132101875175, "grad_norm": 1.204740047454834, "learning_rate": 4.294783010156971e-05, "loss": 0.5309, "step": 745 }, { "epoch": 0.33406101315421216, "grad_norm": 1.0319269895553589, "learning_rate": 4.293628808864266e-05, "loss": 0.4674, "step": 746 }, { "epoch": 0.3345088161209068, "grad_norm": 1.0303031206130981, "learning_rate": 4.29247460757156e-05, "loss": 0.7802, "step": 747 }, { "epoch": 0.33495661908760144, "grad_norm": 1.2019189596176147, "learning_rate": 4.291320406278855e-05, "loss": 0.6186, "step": 748 }, { "epoch": 0.3354044220542961, "grad_norm": 1.0149527788162231, "learning_rate": 4.2901662049861495e-05, "loss": 0.3242, "step": 749 }, { "epoch": 0.33585222502099077, "grad_norm": 1.458787441253662, "learning_rate": 4.2890120036934444e-05, "loss": 0.8334, "step": 750 }, { "epoch": 0.33585222502099077, "eval_loss": 0.5777731537818909, "eval_runtime": 1737.7233, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 750 }, { "epoch": 0.33630002798768543, "grad_norm": 0.9327331781387329, "learning_rate": 4.2878578024007387e-05, "loss": 0.494, "step": 751 }, { "epoch": 0.33674783095438005, "grad_norm": 0.9456080794334412, "learning_rate": 4.2867036011080336e-05, "loss": 0.3454, "step": 752 }, { "epoch": 0.3371956339210747, "grad_norm": 1.035635232925415, "learning_rate": 4.285549399815328e-05, "loss": 0.6425, "step": 753 }, { "epoch": 0.3376434368877694, "grad_norm": 1.1074715852737427, "learning_rate": 4.284395198522623e-05, "loss": 0.653, "step": 754 }, { "epoch": 0.33809123985446404, "grad_norm": 1.190762996673584, "learning_rate": 4.283240997229917e-05, "loss": 0.7805, "step": 755 }, { "epoch": 0.3385390428211587, "grad_norm": 0.8755838871002197, "learning_rate": 4.282086795937212e-05, "loss": 0.5877, "step": 756 }, { "epoch": 0.3389868457878533, "grad_norm": 0.9096702337265015, "learning_rate": 4.280932594644506e-05, "loss": 0.6816, "step": 757 }, { "epoch": 0.339434648754548, "grad_norm": 0.9515770673751831, "learning_rate": 4.279778393351801e-05, "loss": 0.6265, "step": 758 }, { "epoch": 0.33988245172124265, "grad_norm": 1.0240706205368042, "learning_rate": 4.2786241920590955e-05, "loss": 0.4192, "step": 759 }, { "epoch": 0.3403302546879373, "grad_norm": 1.1301640272140503, "learning_rate": 4.2774699907663904e-05, "loss": 0.4906, "step": 760 }, { "epoch": 0.3403302546879373, "eval_loss": 0.5691836476325989, "eval_runtime": 1736.0523, "eval_samples_per_second": 2.573, "eval_steps_per_second": 2.573, "step": 760 }, { "epoch": 0.340778057654632, "grad_norm": 1.1020630598068237, "learning_rate": 4.2763157894736847e-05, "loss": 0.6314, "step": 761 }, { "epoch": 0.3412258606213266, "grad_norm": 0.9853556752204895, "learning_rate": 4.275161588180979e-05, "loss": 0.288, "step": 762 }, { "epoch": 0.34167366358802126, "grad_norm": 0.965360164642334, "learning_rate": 4.274007386888273e-05, "loss": 0.5128, "step": 763 }, { "epoch": 0.3421214665547159, "grad_norm": 1.1848323345184326, "learning_rate": 4.272853185595568e-05, "loss": 0.6169, "step": 764 }, { "epoch": 0.3425692695214106, "grad_norm": 1.1289207935333252, "learning_rate": 4.2716989843028624e-05, "loss": 0.687, "step": 765 }, { "epoch": 0.34301707248810526, "grad_norm": 1.0918805599212646, "learning_rate": 4.2705447830101566e-05, "loss": 0.5477, "step": 766 }, { "epoch": 0.34346487545479987, "grad_norm": 0.9774726033210754, "learning_rate": 4.2693905817174516e-05, "loss": 0.6455, "step": 767 }, { "epoch": 0.34391267842149453, "grad_norm": 1.1208592653274536, "learning_rate": 4.268236380424746e-05, "loss": 0.492, "step": 768 }, { "epoch": 0.3443604813881892, "grad_norm": 1.0818626880645752, "learning_rate": 4.267082179132041e-05, "loss": 0.7567, "step": 769 }, { "epoch": 0.34480828435488386, "grad_norm": 1.057212471961975, "learning_rate": 4.265927977839335e-05, "loss": 0.6216, "step": 770 }, { "epoch": 0.34480828435488386, "eval_loss": 0.5637683868408203, "eval_runtime": 1737.2303, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 770 }, { "epoch": 0.34525608732157853, "grad_norm": 1.2107055187225342, "learning_rate": 4.26477377654663e-05, "loss": 0.4313, "step": 771 }, { "epoch": 0.34570389028827314, "grad_norm": 1.0803090333938599, "learning_rate": 4.263619575253924e-05, "loss": 0.7932, "step": 772 }, { "epoch": 0.3461516932549678, "grad_norm": 0.9676966667175293, "learning_rate": 4.262465373961219e-05, "loss": 0.57, "step": 773 }, { "epoch": 0.3465994962216625, "grad_norm": 1.0771076679229736, "learning_rate": 4.2613111726685134e-05, "loss": 0.591, "step": 774 }, { "epoch": 0.34704729918835714, "grad_norm": 1.1740727424621582, "learning_rate": 4.2601569713758084e-05, "loss": 0.8161, "step": 775 }, { "epoch": 0.3474951021550518, "grad_norm": 1.026063323020935, "learning_rate": 4.2590027700831026e-05, "loss": 0.6341, "step": 776 }, { "epoch": 0.3479429051217464, "grad_norm": 1.220108985900879, "learning_rate": 4.2578485687903976e-05, "loss": 0.5722, "step": 777 }, { "epoch": 0.3483907080884411, "grad_norm": 0.8669452667236328, "learning_rate": 4.256694367497692e-05, "loss": 0.6794, "step": 778 }, { "epoch": 0.34883851105513575, "grad_norm": 1.0848174095153809, "learning_rate": 4.255540166204987e-05, "loss": 0.4824, "step": 779 }, { "epoch": 0.3492863140218304, "grad_norm": 0.985701858997345, "learning_rate": 4.254385964912281e-05, "loss": 0.6193, "step": 780 }, { "epoch": 0.3492863140218304, "eval_loss": 0.5568503737449646, "eval_runtime": 1733.3312, "eval_samples_per_second": 2.577, "eval_steps_per_second": 2.577, "step": 780 }, { "epoch": 0.349734116988525, "grad_norm": 1.01187264919281, "learning_rate": 4.253231763619576e-05, "loss": 0.6011, "step": 781 }, { "epoch": 0.3501819199552197, "grad_norm": 1.1729329824447632, "learning_rate": 4.25207756232687e-05, "loss": 0.4948, "step": 782 }, { "epoch": 0.35062972292191436, "grad_norm": 1.123641014099121, "learning_rate": 4.2509233610341645e-05, "loss": 0.6441, "step": 783 }, { "epoch": 0.351077525888609, "grad_norm": 1.071698784828186, "learning_rate": 4.249769159741459e-05, "loss": 0.3689, "step": 784 }, { "epoch": 0.3515253288553037, "grad_norm": 1.2566313743591309, "learning_rate": 4.248614958448754e-05, "loss": 0.574, "step": 785 }, { "epoch": 0.3519731318219983, "grad_norm": 0.8795626163482666, "learning_rate": 4.247460757156048e-05, "loss": 0.574, "step": 786 }, { "epoch": 0.35242093478869296, "grad_norm": 1.1304922103881836, "learning_rate": 4.246306555863343e-05, "loss": 0.574, "step": 787 }, { "epoch": 0.35286873775538763, "grad_norm": 0.9006721377372742, "learning_rate": 4.245152354570637e-05, "loss": 0.6294, "step": 788 }, { "epoch": 0.3533165407220823, "grad_norm": 1.0647133588790894, "learning_rate": 4.2439981532779314e-05, "loss": 0.3783, "step": 789 }, { "epoch": 0.35376434368877696, "grad_norm": 1.190906286239624, "learning_rate": 4.2428439519852263e-05, "loss": 0.5647, "step": 790 }, { "epoch": 0.35376434368877696, "eval_loss": 0.5501430630683899, "eval_runtime": 1734.9154, "eval_samples_per_second": 2.575, "eval_steps_per_second": 2.575, "step": 790 }, { "epoch": 0.35421214665547157, "grad_norm": 1.3009206056594849, "learning_rate": 4.2416897506925206e-05, "loss": 0.5486, "step": 791 }, { "epoch": 0.35465994962216624, "grad_norm": 0.9215652346611023, "learning_rate": 4.2405355493998155e-05, "loss": 0.3632, "step": 792 }, { "epoch": 0.3551077525888609, "grad_norm": 0.9980651140213013, "learning_rate": 4.23938134810711e-05, "loss": 0.4124, "step": 793 }, { "epoch": 0.35555555555555557, "grad_norm": 1.074180006980896, "learning_rate": 4.238227146814405e-05, "loss": 0.6014, "step": 794 }, { "epoch": 0.35600335852225024, "grad_norm": 1.0638560056686401, "learning_rate": 4.237072945521699e-05, "loss": 0.454, "step": 795 }, { "epoch": 0.35645116148894485, "grad_norm": 1.1408402919769287, "learning_rate": 4.235918744228994e-05, "loss": 0.5157, "step": 796 }, { "epoch": 0.3568989644556395, "grad_norm": 1.1119613647460938, "learning_rate": 4.234764542936288e-05, "loss": 0.6754, "step": 797 }, { "epoch": 0.3573467674223342, "grad_norm": 1.1197654008865356, "learning_rate": 4.233610341643583e-05, "loss": 0.4981, "step": 798 }, { "epoch": 0.35779457038902884, "grad_norm": 1.090043306350708, "learning_rate": 4.2324561403508774e-05, "loss": 0.716, "step": 799 }, { "epoch": 0.3582423733557235, "grad_norm": 1.050110936164856, "learning_rate": 4.2313019390581723e-05, "loss": 0.4693, "step": 800 }, { "epoch": 0.3582423733557235, "eval_loss": 0.5443281531333923, "eval_runtime": 1730.4364, "eval_samples_per_second": 2.581, "eval_steps_per_second": 2.581, "step": 800 }, { "epoch": 0.3586901763224181, "grad_norm": 1.0029529333114624, "learning_rate": 4.2301477377654666e-05, "loss": 0.6834, "step": 801 }, { "epoch": 0.3591379792891128, "grad_norm": 0.8656796813011169, "learning_rate": 4.228993536472761e-05, "loss": 0.4182, "step": 802 }, { "epoch": 0.35958578225580745, "grad_norm": 0.9622914791107178, "learning_rate": 4.227839335180056e-05, "loss": 0.5761, "step": 803 }, { "epoch": 0.3600335852225021, "grad_norm": 1.291436791419983, "learning_rate": 4.22668513388735e-05, "loss": 0.7108, "step": 804 }, { "epoch": 0.3604813881891967, "grad_norm": 1.2705053091049194, "learning_rate": 4.225530932594644e-05, "loss": 0.5405, "step": 805 }, { "epoch": 0.3609291911558914, "grad_norm": 1.041524052619934, "learning_rate": 4.224376731301939e-05, "loss": 0.3357, "step": 806 }, { "epoch": 0.36137699412258606, "grad_norm": 1.0177375078201294, "learning_rate": 4.2232225300092335e-05, "loss": 0.4982, "step": 807 }, { "epoch": 0.3618247970892807, "grad_norm": 1.0229461193084717, "learning_rate": 4.2220683287165285e-05, "loss": 0.572, "step": 808 }, { "epoch": 0.3622726000559754, "grad_norm": 1.037686824798584, "learning_rate": 4.220914127423823e-05, "loss": 0.7705, "step": 809 }, { "epoch": 0.36272040302267, "grad_norm": 1.0016790628433228, "learning_rate": 4.2197599261311177e-05, "loss": 0.5448, "step": 810 }, { "epoch": 0.36272040302267, "eval_loss": 0.537259042263031, "eval_runtime": 1761.6249, "eval_samples_per_second": 2.536, "eval_steps_per_second": 2.536, "step": 810 }, { "epoch": 0.36316820598936467, "grad_norm": 1.0614540576934814, "learning_rate": 4.218605724838412e-05, "loss": 0.5356, "step": 811 }, { "epoch": 0.36361600895605933, "grad_norm": 0.9634736180305481, "learning_rate": 4.217451523545707e-05, "loss": 0.5571, "step": 812 }, { "epoch": 0.364063811922754, "grad_norm": 1.0522750616073608, "learning_rate": 4.216297322253001e-05, "loss": 0.5121, "step": 813 }, { "epoch": 0.36451161488944867, "grad_norm": 1.1789124011993408, "learning_rate": 4.215143120960296e-05, "loss": 0.5571, "step": 814 }, { "epoch": 0.3649594178561433, "grad_norm": 1.1370149850845337, "learning_rate": 4.21398891966759e-05, "loss": 0.3999, "step": 815 }, { "epoch": 0.36540722082283794, "grad_norm": 1.006212830543518, "learning_rate": 4.2128347183748846e-05, "loss": 0.5291, "step": 816 }, { "epoch": 0.3658550237895326, "grad_norm": 1.1692012548446655, "learning_rate": 4.2116805170821795e-05, "loss": 0.53, "step": 817 }, { "epoch": 0.3663028267562273, "grad_norm": 1.0599125623703003, "learning_rate": 4.210526315789474e-05, "loss": 0.8076, "step": 818 }, { "epoch": 0.36675062972292194, "grad_norm": 1.2722653150558472, "learning_rate": 4.209372114496769e-05, "loss": 0.4796, "step": 819 }, { "epoch": 0.36719843268961655, "grad_norm": 1.155556321144104, "learning_rate": 4.208217913204063e-05, "loss": 0.5004, "step": 820 }, { "epoch": 0.36719843268961655, "eval_loss": 0.5317677855491638, "eval_runtime": 1759.931, "eval_samples_per_second": 2.538, "eval_steps_per_second": 2.538, "step": 820 }, { "epoch": 0.3676462356563112, "grad_norm": 1.2540476322174072, "learning_rate": 4.207063711911358e-05, "loss": 0.6489, "step": 821 }, { "epoch": 0.3680940386230059, "grad_norm": 1.5235843658447266, "learning_rate": 4.205909510618652e-05, "loss": 0.457, "step": 822 }, { "epoch": 0.36854184158970055, "grad_norm": 1.1602377891540527, "learning_rate": 4.2047553093259464e-05, "loss": 0.5357, "step": 823 }, { "epoch": 0.3689896445563952, "grad_norm": 1.1831027269363403, "learning_rate": 4.203601108033241e-05, "loss": 0.5766, "step": 824 }, { "epoch": 0.3694374475230898, "grad_norm": 1.0105196237564087, "learning_rate": 4.2024469067405356e-05, "loss": 0.4265, "step": 825 }, { "epoch": 0.3698852504897845, "grad_norm": 1.036521315574646, "learning_rate": 4.20129270544783e-05, "loss": 0.5833, "step": 826 }, { "epoch": 0.37033305345647916, "grad_norm": 1.0582512617111206, "learning_rate": 4.200138504155125e-05, "loss": 0.279, "step": 827 }, { "epoch": 0.3707808564231738, "grad_norm": 1.1272261142730713, "learning_rate": 4.198984302862419e-05, "loss": 0.9891, "step": 828 }, { "epoch": 0.37122865938986843, "grad_norm": 0.911460816860199, "learning_rate": 4.197830101569714e-05, "loss": 0.4668, "step": 829 }, { "epoch": 0.3716764623565631, "grad_norm": 1.0097686052322388, "learning_rate": 4.196675900277008e-05, "loss": 0.4885, "step": 830 }, { "epoch": 0.3716764623565631, "eval_loss": 0.5257967114448547, "eval_runtime": 1764.0371, "eval_samples_per_second": 2.532, "eval_steps_per_second": 2.532, "step": 830 }, { "epoch": 0.37212426532325776, "grad_norm": 1.0220969915390015, "learning_rate": 4.195521698984303e-05, "loss": 0.3719, "step": 831 }, { "epoch": 0.37257206828995243, "grad_norm": 1.0850633382797241, "learning_rate": 4.1943674976915975e-05, "loss": 0.6425, "step": 832 }, { "epoch": 0.3730198712566471, "grad_norm": 1.0111445188522339, "learning_rate": 4.1932132963988924e-05, "loss": 0.5395, "step": 833 }, { "epoch": 0.3734676742233417, "grad_norm": 0.9761947989463806, "learning_rate": 4.192059095106187e-05, "loss": 0.427, "step": 834 }, { "epoch": 0.37391547719003637, "grad_norm": 1.1134892702102661, "learning_rate": 4.1909048938134816e-05, "loss": 0.4008, "step": 835 }, { "epoch": 0.37436328015673104, "grad_norm": 1.1790378093719482, "learning_rate": 4.189750692520776e-05, "loss": 0.709, "step": 836 }, { "epoch": 0.3748110831234257, "grad_norm": 1.3236595392227173, "learning_rate": 4.188596491228071e-05, "loss": 0.5734, "step": 837 }, { "epoch": 0.37525888609012037, "grad_norm": 1.2026864290237427, "learning_rate": 4.187442289935365e-05, "loss": 0.7704, "step": 838 }, { "epoch": 0.375706689056815, "grad_norm": 1.0784732103347778, "learning_rate": 4.18628808864266e-05, "loss": 0.6095, "step": 839 }, { "epoch": 0.37615449202350965, "grad_norm": 1.0638351440429688, "learning_rate": 4.185133887349954e-05, "loss": 0.8335, "step": 840 }, { "epoch": 0.37615449202350965, "eval_loss": 0.5203561782836914, "eval_runtime": 1759.4932, "eval_samples_per_second": 2.539, "eval_steps_per_second": 2.539, "step": 840 }, { "epoch": 0.3766022949902043, "grad_norm": 1.3564544916152954, "learning_rate": 4.1839796860572486e-05, "loss": 0.5154, "step": 841 }, { "epoch": 0.377050097956899, "grad_norm": 0.9023253917694092, "learning_rate": 4.1828254847645435e-05, "loss": 0.2059, "step": 842 }, { "epoch": 0.37749790092359364, "grad_norm": 0.8635544180870056, "learning_rate": 4.181671283471838e-05, "loss": 0.3199, "step": 843 }, { "epoch": 0.37794570389028825, "grad_norm": 1.0583134889602661, "learning_rate": 4.180517082179132e-05, "loss": 0.7493, "step": 844 }, { "epoch": 0.3783935068569829, "grad_norm": 1.0581653118133545, "learning_rate": 4.179362880886426e-05, "loss": 0.8315, "step": 845 }, { "epoch": 0.3788413098236776, "grad_norm": 1.022495150566101, "learning_rate": 4.178208679593721e-05, "loss": 0.5801, "step": 846 }, { "epoch": 0.37928911279037225, "grad_norm": 1.0368661880493164, "learning_rate": 4.1770544783010155e-05, "loss": 0.4629, "step": 847 }, { "epoch": 0.3797369157570669, "grad_norm": 1.0277600288391113, "learning_rate": 4.1759002770083104e-05, "loss": 0.4293, "step": 848 }, { "epoch": 0.38018471872376153, "grad_norm": 0.9926353096961975, "learning_rate": 4.174746075715605e-05, "loss": 0.545, "step": 849 }, { "epoch": 0.3806325216904562, "grad_norm": 1.0656540393829346, "learning_rate": 4.1735918744228996e-05, "loss": 0.4953, "step": 850 }, { "epoch": 0.3806325216904562, "eval_loss": 0.5157405138015747, "eval_runtime": 1765.5766, "eval_samples_per_second": 2.53, "eval_steps_per_second": 2.53, "step": 850 }, { "epoch": 0.38108032465715086, "grad_norm": 1.0566133260726929, "learning_rate": 4.172437673130194e-05, "loss": 0.3186, "step": 851 }, { "epoch": 0.3815281276238455, "grad_norm": 1.0668479204177856, "learning_rate": 4.171283471837489e-05, "loss": 0.8912, "step": 852 }, { "epoch": 0.38197593059054014, "grad_norm": 0.9448925852775574, "learning_rate": 4.170129270544783e-05, "loss": 0.5759, "step": 853 }, { "epoch": 0.3824237335572348, "grad_norm": 0.9699609279632568, "learning_rate": 4.168975069252078e-05, "loss": 0.7082, "step": 854 }, { "epoch": 0.38287153652392947, "grad_norm": 1.1473195552825928, "learning_rate": 4.167820867959372e-05, "loss": 0.3057, "step": 855 }, { "epoch": 0.38331933949062413, "grad_norm": 1.154795527458191, "learning_rate": 4.166666666666667e-05, "loss": 0.4783, "step": 856 }, { "epoch": 0.3837671424573188, "grad_norm": 1.1641485691070557, "learning_rate": 4.1655124653739615e-05, "loss": 0.428, "step": 857 }, { "epoch": 0.3842149454240134, "grad_norm": 1.0281574726104736, "learning_rate": 4.1643582640812564e-05, "loss": 0.4742, "step": 858 }, { "epoch": 0.3846627483907081, "grad_norm": 1.2137088775634766, "learning_rate": 4.163204062788551e-05, "loss": 0.6097, "step": 859 }, { "epoch": 0.38511055135740274, "grad_norm": 1.0084288120269775, "learning_rate": 4.1620498614958456e-05, "loss": 0.386, "step": 860 }, { "epoch": 0.38511055135740274, "eval_loss": 0.5101390480995178, "eval_runtime": 1766.2692, "eval_samples_per_second": 2.529, "eval_steps_per_second": 2.529, "step": 860 }, { "epoch": 0.3855583543240974, "grad_norm": 1.200697898864746, "learning_rate": 4.16089566020314e-05, "loss": 0.5233, "step": 861 }, { "epoch": 0.3860061572907921, "grad_norm": 1.0038084983825684, "learning_rate": 4.159741458910434e-05, "loss": 0.4321, "step": 862 }, { "epoch": 0.3864539602574867, "grad_norm": 1.0555037260055542, "learning_rate": 4.1585872576177284e-05, "loss": 0.4842, "step": 863 }, { "epoch": 0.38690176322418135, "grad_norm": 1.0753816366195679, "learning_rate": 4.157433056325023e-05, "loss": 0.5594, "step": 864 }, { "epoch": 0.387349566190876, "grad_norm": 0.92066490650177, "learning_rate": 4.1562788550323176e-05, "loss": 0.4383, "step": 865 }, { "epoch": 0.3877973691575707, "grad_norm": 1.1402944326400757, "learning_rate": 4.155124653739612e-05, "loss": 0.4339, "step": 866 }, { "epoch": 0.38824517212426535, "grad_norm": 1.1352843046188354, "learning_rate": 4.153970452446907e-05, "loss": 0.6557, "step": 867 }, { "epoch": 0.38869297509095996, "grad_norm": 1.0460140705108643, "learning_rate": 4.152816251154201e-05, "loss": 0.4145, "step": 868 }, { "epoch": 0.3891407780576546, "grad_norm": 1.0888867378234863, "learning_rate": 4.151662049861496e-05, "loss": 0.6837, "step": 869 }, { "epoch": 0.3895885810243493, "grad_norm": 1.1084412336349487, "learning_rate": 4.15050784856879e-05, "loss": 0.5287, "step": 870 }, { "epoch": 0.3895885810243493, "eval_loss": 0.5020670294761658, "eval_runtime": 1766.2603, "eval_samples_per_second": 2.529, "eval_steps_per_second": 2.529, "step": 870 }, { "epoch": 0.39003638399104396, "grad_norm": 1.0915954113006592, "learning_rate": 4.149353647276085e-05, "loss": 0.48, "step": 871 }, { "epoch": 0.3904841869577386, "grad_norm": 1.0187283754348755, "learning_rate": 4.1481994459833794e-05, "loss": 0.6489, "step": 872 }, { "epoch": 0.39093198992443323, "grad_norm": 1.15608549118042, "learning_rate": 4.1470452446906744e-05, "loss": 0.6752, "step": 873 }, { "epoch": 0.3913797928911279, "grad_norm": 1.2111482620239258, "learning_rate": 4.1458910433979686e-05, "loss": 0.5118, "step": 874 }, { "epoch": 0.39182759585782256, "grad_norm": 0.9627257585525513, "learning_rate": 4.1447368421052636e-05, "loss": 0.4093, "step": 875 }, { "epoch": 0.39227539882451723, "grad_norm": 0.8828468918800354, "learning_rate": 4.143582640812558e-05, "loss": 0.5358, "step": 876 }, { "epoch": 0.39272320179121184, "grad_norm": 0.963966429233551, "learning_rate": 4.142428439519853e-05, "loss": 0.8033, "step": 877 }, { "epoch": 0.3931710047579065, "grad_norm": 0.9207846522331238, "learning_rate": 4.141274238227147e-05, "loss": 0.2906, "step": 878 }, { "epoch": 0.39361880772460117, "grad_norm": 1.082098126411438, "learning_rate": 4.140120036934442e-05, "loss": 0.2977, "step": 879 }, { "epoch": 0.39406661069129584, "grad_norm": 1.2937519550323486, "learning_rate": 4.138965835641736e-05, "loss": 0.407, "step": 880 }, { "epoch": 0.39406661069129584, "eval_loss": 0.49985942244529724, "eval_runtime": 1767.4919, "eval_samples_per_second": 2.527, "eval_steps_per_second": 2.527, "step": 880 }, { "epoch": 0.3945144136579905, "grad_norm": 1.326373815536499, "learning_rate": 4.137811634349031e-05, "loss": 0.4247, "step": 881 }, { "epoch": 0.3949622166246851, "grad_norm": 1.042016625404358, "learning_rate": 4.1366574330563254e-05, "loss": 0.5957, "step": 882 }, { "epoch": 0.3954100195913798, "grad_norm": 1.083022952079773, "learning_rate": 4.13550323176362e-05, "loss": 0.5244, "step": 883 }, { "epoch": 0.39585782255807445, "grad_norm": 1.174361228942871, "learning_rate": 4.134349030470914e-05, "loss": 0.8584, "step": 884 }, { "epoch": 0.3963056255247691, "grad_norm": 1.1042940616607666, "learning_rate": 4.133194829178209e-05, "loss": 0.2712, "step": 885 }, { "epoch": 0.3967534284914638, "grad_norm": 1.1315444707870483, "learning_rate": 4.132040627885503e-05, "loss": 0.5922, "step": 886 }, { "epoch": 0.3972012314581584, "grad_norm": 1.0596696138381958, "learning_rate": 4.130886426592798e-05, "loss": 0.3525, "step": 887 }, { "epoch": 0.39764903442485305, "grad_norm": 1.0386688709259033, "learning_rate": 4.1297322253000924e-05, "loss": 0.7037, "step": 888 }, { "epoch": 0.3980968373915477, "grad_norm": 1.3697748184204102, "learning_rate": 4.128578024007387e-05, "loss": 0.3933, "step": 889 }, { "epoch": 0.3985446403582424, "grad_norm": 1.1484674215316772, "learning_rate": 4.1274238227146816e-05, "loss": 0.4183, "step": 890 }, { "epoch": 0.3985446403582424, "eval_loss": 0.4953237771987915, "eval_runtime": 1762.0817, "eval_samples_per_second": 2.535, "eval_steps_per_second": 2.535, "step": 890 }, { "epoch": 0.39899244332493705, "grad_norm": 1.1704154014587402, "learning_rate": 4.126269621421976e-05, "loss": 0.565, "step": 891 }, { "epoch": 0.39944024629163166, "grad_norm": 1.0532429218292236, "learning_rate": 4.125115420129271e-05, "loss": 0.442, "step": 892 }, { "epoch": 0.39988804925832633, "grad_norm": 1.23839271068573, "learning_rate": 4.123961218836565e-05, "loss": 0.6296, "step": 893 }, { "epoch": 0.400335852225021, "grad_norm": 1.184489369392395, "learning_rate": 4.12280701754386e-05, "loss": 0.4153, "step": 894 }, { "epoch": 0.40078365519171566, "grad_norm": 0.9506192803382874, "learning_rate": 4.121652816251154e-05, "loss": 0.4286, "step": 895 }, { "epoch": 0.4012314581584103, "grad_norm": 1.1313607692718506, "learning_rate": 4.120498614958449e-05, "loss": 0.5289, "step": 896 }, { "epoch": 0.40167926112510494, "grad_norm": 1.0801643133163452, "learning_rate": 4.1193444136657434e-05, "loss": 0.6045, "step": 897 }, { "epoch": 0.4021270640917996, "grad_norm": 1.1855746507644653, "learning_rate": 4.1181902123730384e-05, "loss": 0.5275, "step": 898 }, { "epoch": 0.40257486705849427, "grad_norm": 0.948414146900177, "learning_rate": 4.1170360110803326e-05, "loss": 0.5052, "step": 899 }, { "epoch": 0.40302267002518893, "grad_norm": 0.9570877552032471, "learning_rate": 4.1158818097876276e-05, "loss": 0.3068, "step": 900 }, { "epoch": 0.40302267002518893, "eval_loss": 0.48936161398887634, "eval_runtime": 1738.8904, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 900 }, { "epoch": 0.40347047299188354, "grad_norm": 1.0901323556900024, "learning_rate": 4.114727608494922e-05, "loss": 0.5199, "step": 901 }, { "epoch": 0.4039182759585782, "grad_norm": 1.0945810079574585, "learning_rate": 4.113573407202216e-05, "loss": 0.4492, "step": 902 }, { "epoch": 0.4043660789252729, "grad_norm": 1.1015689373016357, "learning_rate": 4.112419205909511e-05, "loss": 0.39, "step": 903 }, { "epoch": 0.40481388189196754, "grad_norm": 1.30820631980896, "learning_rate": 4.111265004616805e-05, "loss": 0.5595, "step": 904 }, { "epoch": 0.4052616848586622, "grad_norm": 1.3023115396499634, "learning_rate": 4.1101108033240995e-05, "loss": 0.7433, "step": 905 }, { "epoch": 0.4057094878253568, "grad_norm": 1.2857636213302612, "learning_rate": 4.1089566020313945e-05, "loss": 0.7139, "step": 906 }, { "epoch": 0.4061572907920515, "grad_norm": 1.0232025384902954, "learning_rate": 4.107802400738689e-05, "loss": 0.345, "step": 907 }, { "epoch": 0.40660509375874615, "grad_norm": 1.0496271848678589, "learning_rate": 4.106648199445984e-05, "loss": 0.2903, "step": 908 }, { "epoch": 0.4070528967254408, "grad_norm": 1.170128345489502, "learning_rate": 4.105493998153278e-05, "loss": 0.5301, "step": 909 }, { "epoch": 0.4075006996921355, "grad_norm": 0.9690027832984924, "learning_rate": 4.104339796860573e-05, "loss": 0.5454, "step": 910 }, { "epoch": 0.4075006996921355, "eval_loss": 0.48335975408554077, "eval_runtime": 1729.8426, "eval_samples_per_second": 2.582, "eval_steps_per_second": 2.582, "step": 910 }, { "epoch": 0.4079485026588301, "grad_norm": 1.0750856399536133, "learning_rate": 4.103185595567867e-05, "loss": 0.396, "step": 911 }, { "epoch": 0.40839630562552476, "grad_norm": 0.9750925302505493, "learning_rate": 4.102031394275162e-05, "loss": 0.3567, "step": 912 }, { "epoch": 0.4088441085922194, "grad_norm": 1.2686316967010498, "learning_rate": 4.100877192982456e-05, "loss": 0.4793, "step": 913 }, { "epoch": 0.4092919115589141, "grad_norm": 1.0663954019546509, "learning_rate": 4.099722991689751e-05, "loss": 0.7291, "step": 914 }, { "epoch": 0.40973971452560876, "grad_norm": 1.1266555786132812, "learning_rate": 4.0985687903970455e-05, "loss": 0.729, "step": 915 }, { "epoch": 0.41018751749230337, "grad_norm": 1.218075156211853, "learning_rate": 4.09741458910434e-05, "loss": 0.6157, "step": 916 }, { "epoch": 0.41063532045899803, "grad_norm": 1.142625093460083, "learning_rate": 4.096260387811635e-05, "loss": 0.5192, "step": 917 }, { "epoch": 0.4110831234256927, "grad_norm": 1.0120737552642822, "learning_rate": 4.095106186518929e-05, "loss": 0.5094, "step": 918 }, { "epoch": 0.41153092639238736, "grad_norm": 1.1946959495544434, "learning_rate": 4.093951985226224e-05, "loss": 0.4118, "step": 919 }, { "epoch": 0.41197872935908203, "grad_norm": 0.9003228545188904, "learning_rate": 4.092797783933518e-05, "loss": 0.3265, "step": 920 }, { "epoch": 0.41197872935908203, "eval_loss": 0.477049857378006, "eval_runtime": 1730.2627, "eval_samples_per_second": 2.582, "eval_steps_per_second": 2.582, "step": 920 }, { "epoch": 0.41242653232577664, "grad_norm": 1.0392440557479858, "learning_rate": 4.091643582640813e-05, "loss": 0.3326, "step": 921 }, { "epoch": 0.4128743352924713, "grad_norm": 1.0120160579681396, "learning_rate": 4.0904893813481074e-05, "loss": 0.4699, "step": 922 }, { "epoch": 0.41332213825916597, "grad_norm": 1.2628374099731445, "learning_rate": 4.0893351800554016e-05, "loss": 0.455, "step": 923 }, { "epoch": 0.41376994122586064, "grad_norm": 1.1514098644256592, "learning_rate": 4.088180978762696e-05, "loss": 0.4138, "step": 924 }, { "epoch": 0.41421774419255525, "grad_norm": 1.059490442276001, "learning_rate": 4.087026777469991e-05, "loss": 0.3612, "step": 925 }, { "epoch": 0.4146655471592499, "grad_norm": 0.92525714635849, "learning_rate": 4.085872576177285e-05, "loss": 0.4365, "step": 926 }, { "epoch": 0.4151133501259446, "grad_norm": 1.0232129096984863, "learning_rate": 4.08471837488458e-05, "loss": 0.3875, "step": 927 }, { "epoch": 0.41556115309263925, "grad_norm": 1.272973656654358, "learning_rate": 4.083564173591874e-05, "loss": 0.3881, "step": 928 }, { "epoch": 0.4160089560593339, "grad_norm": 1.49363374710083, "learning_rate": 4.082409972299169e-05, "loss": 0.6014, "step": 929 }, { "epoch": 0.4164567590260285, "grad_norm": 1.0534272193908691, "learning_rate": 4.0812557710064635e-05, "loss": 0.339, "step": 930 }, { "epoch": 0.4164567590260285, "eval_loss": 0.4716756343841553, "eval_runtime": 1730.6307, "eval_samples_per_second": 2.581, "eval_steps_per_second": 2.581, "step": 930 }, { "epoch": 0.4169045619927232, "grad_norm": 1.0245121717453003, "learning_rate": 4.0801015697137584e-05, "loss": 0.7959, "step": 931 }, { "epoch": 0.41735236495941785, "grad_norm": 1.0729509592056274, "learning_rate": 4.078947368421053e-05, "loss": 0.3776, "step": 932 }, { "epoch": 0.4178001679261125, "grad_norm": 1.1630797386169434, "learning_rate": 4.0777931671283476e-05, "loss": 0.5052, "step": 933 }, { "epoch": 0.4182479708928072, "grad_norm": 0.955295205116272, "learning_rate": 4.076638965835642e-05, "loss": 0.3525, "step": 934 }, { "epoch": 0.4186957738595018, "grad_norm": 1.0843642950057983, "learning_rate": 4.075484764542937e-05, "loss": 0.4731, "step": 935 }, { "epoch": 0.41914357682619646, "grad_norm": 0.9524621367454529, "learning_rate": 4.074330563250231e-05, "loss": 0.3274, "step": 936 }, { "epoch": 0.41959137979289113, "grad_norm": 1.1125507354736328, "learning_rate": 4.073176361957526e-05, "loss": 0.4461, "step": 937 }, { "epoch": 0.4200391827595858, "grad_norm": 1.0988569259643555, "learning_rate": 4.07202216066482e-05, "loss": 0.3417, "step": 938 }, { "epoch": 0.42048698572628046, "grad_norm": 1.270066261291504, "learning_rate": 4.070867959372115e-05, "loss": 0.5205, "step": 939 }, { "epoch": 0.42093478869297507, "grad_norm": 1.0238381624221802, "learning_rate": 4.0697137580794095e-05, "loss": 0.4402, "step": 940 }, { "epoch": 0.42093478869297507, "eval_loss": 0.4671795666217804, "eval_runtime": 1732.843, "eval_samples_per_second": 2.578, "eval_steps_per_second": 2.578, "step": 940 }, { "epoch": 0.42138259165966974, "grad_norm": 0.8779534697532654, "learning_rate": 4.068559556786704e-05, "loss": 0.3124, "step": 941 }, { "epoch": 0.4218303946263644, "grad_norm": 1.201365351676941, "learning_rate": 4.067405355493999e-05, "loss": 0.2708, "step": 942 }, { "epoch": 0.42227819759305907, "grad_norm": 1.1746056079864502, "learning_rate": 4.066251154201293e-05, "loss": 0.3772, "step": 943 }, { "epoch": 0.42272600055975373, "grad_norm": 1.3690954446792603, "learning_rate": 4.065096952908587e-05, "loss": 0.569, "step": 944 }, { "epoch": 0.42317380352644834, "grad_norm": 1.2674678564071655, "learning_rate": 4.0639427516158815e-05, "loss": 0.3943, "step": 945 }, { "epoch": 0.423621606493143, "grad_norm": 1.16683030128479, "learning_rate": 4.0627885503231764e-05, "loss": 0.5455, "step": 946 }, { "epoch": 0.4240694094598377, "grad_norm": 1.226865291595459, "learning_rate": 4.061634349030471e-05, "loss": 0.4399, "step": 947 }, { "epoch": 0.42451721242653234, "grad_norm": 0.9444255232810974, "learning_rate": 4.0604801477377656e-05, "loss": 0.5685, "step": 948 }, { "epoch": 0.42496501539322695, "grad_norm": 1.2744015455245972, "learning_rate": 4.05932594644506e-05, "loss": 0.7167, "step": 949 }, { "epoch": 0.4254128183599216, "grad_norm": 0.9263292551040649, "learning_rate": 4.058171745152355e-05, "loss": 0.2135, "step": 950 }, { "epoch": 0.4254128183599216, "eval_loss": 0.46275782585144043, "eval_runtime": 1737.1967, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 950 }, { "epoch": 0.4258606213266163, "grad_norm": 1.1970070600509644, "learning_rate": 4.057017543859649e-05, "loss": 0.4788, "step": 951 }, { "epoch": 0.42630842429331095, "grad_norm": 1.0885076522827148, "learning_rate": 4.055863342566944e-05, "loss": 0.1885, "step": 952 }, { "epoch": 0.4267562272600056, "grad_norm": 1.155806064605713, "learning_rate": 4.054709141274238e-05, "loss": 0.3639, "step": 953 }, { "epoch": 0.4272040302267002, "grad_norm": 0.9880549311637878, "learning_rate": 4.053554939981533e-05, "loss": 0.4937, "step": 954 }, { "epoch": 0.4276518331933949, "grad_norm": 1.2488168478012085, "learning_rate": 4.0524007386888275e-05, "loss": 0.3599, "step": 955 }, { "epoch": 0.42809963616008956, "grad_norm": 1.1837854385375977, "learning_rate": 4.0512465373961224e-05, "loss": 0.4442, "step": 956 }, { "epoch": 0.4285474391267842, "grad_norm": 1.3064805269241333, "learning_rate": 4.050092336103417e-05, "loss": 0.6201, "step": 957 }, { "epoch": 0.4289952420934789, "grad_norm": 1.2136608362197876, "learning_rate": 4.0489381348107116e-05, "loss": 0.6388, "step": 958 }, { "epoch": 0.4294430450601735, "grad_norm": 1.073799729347229, "learning_rate": 4.047783933518006e-05, "loss": 0.5717, "step": 959 }, { "epoch": 0.42989084802686817, "grad_norm": 1.0983150005340576, "learning_rate": 4.046629732225301e-05, "loss": 0.6641, "step": 960 }, { "epoch": 0.42989084802686817, "eval_loss": 0.457937628030777, "eval_runtime": 1737.7661, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 960 }, { "epoch": 0.43033865099356283, "grad_norm": 1.405277132987976, "learning_rate": 4.045475530932595e-05, "loss": 0.5287, "step": 961 }, { "epoch": 0.4307864539602575, "grad_norm": 0.9269359111785889, "learning_rate": 4.044321329639889e-05, "loss": 0.2857, "step": 962 }, { "epoch": 0.43123425692695216, "grad_norm": 1.2387222051620483, "learning_rate": 4.0431671283471836e-05, "loss": 0.4877, "step": 963 }, { "epoch": 0.4316820598936468, "grad_norm": 0.8296558856964111, "learning_rate": 4.0420129270544785e-05, "loss": 0.302, "step": 964 }, { "epoch": 0.43212986286034144, "grad_norm": 1.1528252363204956, "learning_rate": 4.040858725761773e-05, "loss": 0.3738, "step": 965 }, { "epoch": 0.4325776658270361, "grad_norm": 1.3704917430877686, "learning_rate": 4.039704524469067e-05, "loss": 0.5414, "step": 966 }, { "epoch": 0.4330254687937308, "grad_norm": 1.276453971862793, "learning_rate": 4.038550323176362e-05, "loss": 0.6736, "step": 967 }, { "epoch": 0.43347327176042544, "grad_norm": 0.9381370544433594, "learning_rate": 4.037396121883656e-05, "loss": 0.4799, "step": 968 }, { "epoch": 0.43392107472712005, "grad_norm": 1.047900915145874, "learning_rate": 4.036241920590951e-05, "loss": 0.4303, "step": 969 }, { "epoch": 0.4343688776938147, "grad_norm": 1.3831329345703125, "learning_rate": 4.0350877192982455e-05, "loss": 0.4318, "step": 970 }, { "epoch": 0.4343688776938147, "eval_loss": 0.4529132843017578, "eval_runtime": 1743.0645, "eval_samples_per_second": 2.563, "eval_steps_per_second": 2.563, "step": 970 }, { "epoch": 0.4348166806605094, "grad_norm": 1.0554351806640625, "learning_rate": 4.0339335180055404e-05, "loss": 0.3476, "step": 971 }, { "epoch": 0.43526448362720405, "grad_norm": 1.0319788455963135, "learning_rate": 4.0327793167128346e-05, "loss": 0.4205, "step": 972 }, { "epoch": 0.4357122865938987, "grad_norm": 1.0676701068878174, "learning_rate": 4.0316251154201296e-05, "loss": 0.4594, "step": 973 }, { "epoch": 0.4361600895605933, "grad_norm": 1.3130353689193726, "learning_rate": 4.030470914127424e-05, "loss": 0.5385, "step": 974 }, { "epoch": 0.436607892527288, "grad_norm": 1.171388030052185, "learning_rate": 4.029316712834719e-05, "loss": 0.3008, "step": 975 }, { "epoch": 0.43705569549398265, "grad_norm": 1.1871421337127686, "learning_rate": 4.028162511542013e-05, "loss": 0.518, "step": 976 }, { "epoch": 0.4375034984606773, "grad_norm": 1.2678828239440918, "learning_rate": 4.027008310249308e-05, "loss": 0.4653, "step": 977 }, { "epoch": 0.43795130142737193, "grad_norm": 1.2946516275405884, "learning_rate": 4.025854108956602e-05, "loss": 0.4609, "step": 978 }, { "epoch": 0.4383991043940666, "grad_norm": 1.4077446460723877, "learning_rate": 4.024699907663897e-05, "loss": 0.7787, "step": 979 }, { "epoch": 0.43884690736076126, "grad_norm": 1.2191566228866577, "learning_rate": 4.0235457063711914e-05, "loss": 0.4084, "step": 980 }, { "epoch": 0.43884690736076126, "eval_loss": 0.4486415684223175, "eval_runtime": 1746.9057, "eval_samples_per_second": 2.557, "eval_steps_per_second": 2.557, "step": 980 }, { "epoch": 0.43929471032745593, "grad_norm": 1.2254949808120728, "learning_rate": 4.0223915050784864e-05, "loss": 0.3569, "step": 981 }, { "epoch": 0.4397425132941506, "grad_norm": 0.9401348233222961, "learning_rate": 4.0212373037857806e-05, "loss": 0.3883, "step": 982 }, { "epoch": 0.4401903162608452, "grad_norm": 1.2795180082321167, "learning_rate": 4.020083102493075e-05, "loss": 0.4951, "step": 983 }, { "epoch": 0.44063811922753987, "grad_norm": 1.2436785697937012, "learning_rate": 4.018928901200369e-05, "loss": 0.4338, "step": 984 }, { "epoch": 0.44108592219423454, "grad_norm": 1.1067421436309814, "learning_rate": 4.017774699907664e-05, "loss": 0.4877, "step": 985 }, { "epoch": 0.4415337251609292, "grad_norm": 1.0319819450378418, "learning_rate": 4.0166204986149584e-05, "loss": 0.3879, "step": 986 }, { "epoch": 0.44198152812762387, "grad_norm": 1.2362407445907593, "learning_rate": 4.015466297322253e-05, "loss": 0.6812, "step": 987 }, { "epoch": 0.4424293310943185, "grad_norm": 1.3658016920089722, "learning_rate": 4.0143120960295476e-05, "loss": 0.5854, "step": 988 }, { "epoch": 0.44287713406101314, "grad_norm": 1.0967875719070435, "learning_rate": 4.0131578947368425e-05, "loss": 0.4034, "step": 989 }, { "epoch": 0.4433249370277078, "grad_norm": 1.025646686553955, "learning_rate": 4.012003693444137e-05, "loss": 0.2335, "step": 990 }, { "epoch": 0.4433249370277078, "eval_loss": 0.4429576098918915, "eval_runtime": 1743.9799, "eval_samples_per_second": 2.561, "eval_steps_per_second": 2.561, "step": 990 }, { "epoch": 0.4437727399944025, "grad_norm": 1.1534349918365479, "learning_rate": 4.010849492151431e-05, "loss": 0.4779, "step": 991 }, { "epoch": 0.44422054296109714, "grad_norm": 1.3908884525299072, "learning_rate": 4.009695290858726e-05, "loss": 0.4558, "step": 992 }, { "epoch": 0.44466834592779175, "grad_norm": 1.4888511896133423, "learning_rate": 4.00854108956602e-05, "loss": 0.5484, "step": 993 }, { "epoch": 0.4451161488944864, "grad_norm": 1.1760286092758179, "learning_rate": 4.007386888273315e-05, "loss": 0.3029, "step": 994 }, { "epoch": 0.4455639518611811, "grad_norm": 1.3324183225631714, "learning_rate": 4.0062326869806094e-05, "loss": 0.2873, "step": 995 }, { "epoch": 0.44601175482787575, "grad_norm": 1.3383257389068604, "learning_rate": 4.0050784856879044e-05, "loss": 0.4015, "step": 996 }, { "epoch": 0.4464595577945704, "grad_norm": 1.0451068878173828, "learning_rate": 4.0039242843951986e-05, "loss": 0.3162, "step": 997 }, { "epoch": 0.446907360761265, "grad_norm": 1.1706875562667847, "learning_rate": 4.0027700831024936e-05, "loss": 0.5261, "step": 998 }, { "epoch": 0.4473551637279597, "grad_norm": 1.3044228553771973, "learning_rate": 4.001615881809788e-05, "loss": 0.4846, "step": 999 }, { "epoch": 0.44780296669465436, "grad_norm": 1.2698777914047241, "learning_rate": 4.000461680517083e-05, "loss": 0.3915, "step": 1000 }, { "epoch": 0.44780296669465436, "eval_loss": 0.43732336163520813, "eval_runtime": 1742.5158, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 1000 }, { "epoch": 0.448250769661349, "grad_norm": 1.1517488956451416, "learning_rate": 3.999307479224377e-05, "loss": 0.4172, "step": 1001 }, { "epoch": 0.44869857262804363, "grad_norm": 0.9378765821456909, "learning_rate": 3.998153277931671e-05, "loss": 0.3321, "step": 1002 }, { "epoch": 0.4491463755947383, "grad_norm": 1.2423738241195679, "learning_rate": 3.996999076638966e-05, "loss": 0.3526, "step": 1003 }, { "epoch": 0.44959417856143297, "grad_norm": 1.476121187210083, "learning_rate": 3.9958448753462605e-05, "loss": 0.4987, "step": 1004 }, { "epoch": 0.45004198152812763, "grad_norm": 1.0502073764801025, "learning_rate": 3.994690674053555e-05, "loss": 0.351, "step": 1005 }, { "epoch": 0.4504897844948223, "grad_norm": 1.0767852067947388, "learning_rate": 3.99353647276085e-05, "loss": 0.4662, "step": 1006 }, { "epoch": 0.4509375874615169, "grad_norm": 1.2034271955490112, "learning_rate": 3.992382271468144e-05, "loss": 0.5037, "step": 1007 }, { "epoch": 0.4513853904282116, "grad_norm": 0.9691647887229919, "learning_rate": 3.991228070175439e-05, "loss": 0.3718, "step": 1008 }, { "epoch": 0.45183319339490624, "grad_norm": 1.139522910118103, "learning_rate": 3.990073868882733e-05, "loss": 0.5095, "step": 1009 }, { "epoch": 0.4522809963616009, "grad_norm": 1.2414535284042358, "learning_rate": 3.988919667590028e-05, "loss": 0.4806, "step": 1010 }, { "epoch": 0.4522809963616009, "eval_loss": 0.43430060148239136, "eval_runtime": 1742.0984, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 1010 }, { "epoch": 0.4527287993282956, "grad_norm": 1.1482878923416138, "learning_rate": 3.987765466297322e-05, "loss": 0.2714, "step": 1011 }, { "epoch": 0.4531766022949902, "grad_norm": 1.3171560764312744, "learning_rate": 3.986611265004617e-05, "loss": 0.4045, "step": 1012 }, { "epoch": 0.45362440526168485, "grad_norm": 1.0673166513442993, "learning_rate": 3.9854570637119115e-05, "loss": 0.5288, "step": 1013 }, { "epoch": 0.4540722082283795, "grad_norm": 1.1590580940246582, "learning_rate": 3.9843028624192065e-05, "loss": 0.4681, "step": 1014 }, { "epoch": 0.4545200111950742, "grad_norm": 1.1945033073425293, "learning_rate": 3.983148661126501e-05, "loss": 0.3764, "step": 1015 }, { "epoch": 0.45496781416176885, "grad_norm": 1.2974134683609009, "learning_rate": 3.981994459833795e-05, "loss": 0.3517, "step": 1016 }, { "epoch": 0.45541561712846346, "grad_norm": 1.2538046836853027, "learning_rate": 3.98084025854109e-05, "loss": 0.5615, "step": 1017 }, { "epoch": 0.4558634200951581, "grad_norm": 1.0046873092651367, "learning_rate": 3.979686057248384e-05, "loss": 0.2909, "step": 1018 }, { "epoch": 0.4563112230618528, "grad_norm": 1.049226999282837, "learning_rate": 3.978531855955679e-05, "loss": 0.4252, "step": 1019 }, { "epoch": 0.45675902602854745, "grad_norm": 1.1835418939590454, "learning_rate": 3.9773776546629734e-05, "loss": 0.3724, "step": 1020 }, { "epoch": 0.45675902602854745, "eval_loss": 0.4290771484375, "eval_runtime": 1743.8313, "eval_samples_per_second": 2.562, "eval_steps_per_second": 2.562, "step": 1020 }, { "epoch": 0.4572068289952421, "grad_norm": 0.9210574626922607, "learning_rate": 3.976223453370268e-05, "loss": 0.3855, "step": 1021 }, { "epoch": 0.45765463196193673, "grad_norm": 1.167533278465271, "learning_rate": 3.9750692520775626e-05, "loss": 0.5742, "step": 1022 }, { "epoch": 0.4581024349286314, "grad_norm": 0.9851381778717041, "learning_rate": 3.973915050784857e-05, "loss": 0.3645, "step": 1023 }, { "epoch": 0.45855023789532606, "grad_norm": 1.2704929113388062, "learning_rate": 3.972760849492151e-05, "loss": 0.4467, "step": 1024 }, { "epoch": 0.45899804086202073, "grad_norm": 1.1447492837905884, "learning_rate": 3.971606648199446e-05, "loss": 0.3171, "step": 1025 }, { "epoch": 0.45944584382871534, "grad_norm": 1.1135897636413574, "learning_rate": 3.97045244690674e-05, "loss": 0.5033, "step": 1026 }, { "epoch": 0.45989364679541, "grad_norm": 1.3386653661727905, "learning_rate": 3.969298245614035e-05, "loss": 0.6805, "step": 1027 }, { "epoch": 0.46034144976210467, "grad_norm": 0.9690244197845459, "learning_rate": 3.9681440443213295e-05, "loss": 0.2986, "step": 1028 }, { "epoch": 0.46078925272879934, "grad_norm": 1.201811671257019, "learning_rate": 3.9669898430286245e-05, "loss": 0.4149, "step": 1029 }, { "epoch": 0.461237055695494, "grad_norm": 0.9881176948547363, "learning_rate": 3.965835641735919e-05, "loss": 0.4466, "step": 1030 }, { "epoch": 0.461237055695494, "eval_loss": 0.423381507396698, "eval_runtime": 1742.3334, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 1030 }, { "epoch": 0.4616848586621886, "grad_norm": 1.0963369607925415, "learning_rate": 3.9646814404432136e-05, "loss": 0.5915, "step": 1031 }, { "epoch": 0.4621326616288833, "grad_norm": 1.0670597553253174, "learning_rate": 3.963527239150508e-05, "loss": 0.4366, "step": 1032 }, { "epoch": 0.46258046459557794, "grad_norm": 1.0526273250579834, "learning_rate": 3.962373037857803e-05, "loss": 0.3719, "step": 1033 }, { "epoch": 0.4630282675622726, "grad_norm": 1.1232683658599854, "learning_rate": 3.961218836565097e-05, "loss": 0.5496, "step": 1034 }, { "epoch": 0.4634760705289673, "grad_norm": 1.2455625534057617, "learning_rate": 3.960064635272392e-05, "loss": 0.3716, "step": 1035 }, { "epoch": 0.4639238734956619, "grad_norm": 1.116101622581482, "learning_rate": 3.958910433979686e-05, "loss": 0.2784, "step": 1036 }, { "epoch": 0.46437167646235655, "grad_norm": 1.1567442417144775, "learning_rate": 3.957756232686981e-05, "loss": 0.4029, "step": 1037 }, { "epoch": 0.4648194794290512, "grad_norm": 1.0800672769546509, "learning_rate": 3.9566020313942755e-05, "loss": 0.2335, "step": 1038 }, { "epoch": 0.4652672823957459, "grad_norm": 0.8808770775794983, "learning_rate": 3.9554478301015704e-05, "loss": 0.2624, "step": 1039 }, { "epoch": 0.46571508536244055, "grad_norm": 1.176003098487854, "learning_rate": 3.954293628808865e-05, "loss": 0.3314, "step": 1040 }, { "epoch": 0.46571508536244055, "eval_loss": 0.41852641105651855, "eval_runtime": 1741.3418, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 1040 }, { "epoch": 0.46616288832913516, "grad_norm": 1.1637083292007446, "learning_rate": 3.953139427516159e-05, "loss": 0.2736, "step": 1041 }, { "epoch": 0.4666106912958298, "grad_norm": 1.36524498462677, "learning_rate": 3.951985226223454e-05, "loss": 0.3522, "step": 1042 }, { "epoch": 0.4670584942625245, "grad_norm": 1.0998141765594482, "learning_rate": 3.950831024930748e-05, "loss": 0.4015, "step": 1043 }, { "epoch": 0.46750629722921916, "grad_norm": 1.8939387798309326, "learning_rate": 3.9496768236380424e-05, "loss": 0.4661, "step": 1044 }, { "epoch": 0.4679541001959138, "grad_norm": 1.365972876548767, "learning_rate": 3.948522622345337e-05, "loss": 0.3268, "step": 1045 }, { "epoch": 0.46840190316260844, "grad_norm": 1.078254222869873, "learning_rate": 3.9473684210526316e-05, "loss": 0.37, "step": 1046 }, { "epoch": 0.4688497061293031, "grad_norm": 1.2513341903686523, "learning_rate": 3.946214219759926e-05, "loss": 0.7105, "step": 1047 }, { "epoch": 0.46929750909599777, "grad_norm": 1.2173830270767212, "learning_rate": 3.945060018467221e-05, "loss": 0.4167, "step": 1048 }, { "epoch": 0.46974531206269243, "grad_norm": 1.5618706941604614, "learning_rate": 3.943905817174515e-05, "loss": 0.3482, "step": 1049 }, { "epoch": 0.47019311502938704, "grad_norm": 1.234503984451294, "learning_rate": 3.94275161588181e-05, "loss": 0.5212, "step": 1050 }, { "epoch": 0.47019311502938704, "eval_loss": 0.41508200764656067, "eval_runtime": 1741.1289, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 1050 }, { "epoch": 0.4706409179960817, "grad_norm": 1.360834002494812, "learning_rate": 3.941597414589104e-05, "loss": 0.2982, "step": 1051 }, { "epoch": 0.4710887209627764, "grad_norm": 1.036285638809204, "learning_rate": 3.940443213296399e-05, "loss": 0.2856, "step": 1052 }, { "epoch": 0.47153652392947104, "grad_norm": 1.3228135108947754, "learning_rate": 3.9392890120036935e-05, "loss": 0.4763, "step": 1053 }, { "epoch": 0.4719843268961657, "grad_norm": 1.2010780572891235, "learning_rate": 3.9381348107109884e-05, "loss": 0.3307, "step": 1054 }, { "epoch": 0.4724321298628603, "grad_norm": 1.1086570024490356, "learning_rate": 3.936980609418283e-05, "loss": 0.4546, "step": 1055 }, { "epoch": 0.472879932829555, "grad_norm": 1.1960604190826416, "learning_rate": 3.9358264081255776e-05, "loss": 0.405, "step": 1056 }, { "epoch": 0.47332773579624965, "grad_norm": 1.186313509941101, "learning_rate": 3.934672206832872e-05, "loss": 0.4105, "step": 1057 }, { "epoch": 0.4737755387629443, "grad_norm": 0.9901049137115479, "learning_rate": 3.933518005540167e-05, "loss": 0.4515, "step": 1058 }, { "epoch": 0.474223341729639, "grad_norm": 1.000322699546814, "learning_rate": 3.932363804247461e-05, "loss": 0.2171, "step": 1059 }, { "epoch": 0.4746711446963336, "grad_norm": 1.177179217338562, "learning_rate": 3.931209602954756e-05, "loss": 0.3213, "step": 1060 }, { "epoch": 0.4746711446963336, "eval_loss": 0.4125591516494751, "eval_runtime": 1740.7949, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 1060 }, { "epoch": 0.47511894766302826, "grad_norm": 0.945931613445282, "learning_rate": 3.93005540166205e-05, "loss": 0.4175, "step": 1061 }, { "epoch": 0.4755667506297229, "grad_norm": 1.0835816860198975, "learning_rate": 3.9289012003693445e-05, "loss": 0.2497, "step": 1062 }, { "epoch": 0.4760145535964176, "grad_norm": 1.0238778591156006, "learning_rate": 3.927746999076639e-05, "loss": 0.2807, "step": 1063 }, { "epoch": 0.47646235656311225, "grad_norm": 1.2828023433685303, "learning_rate": 3.926592797783934e-05, "loss": 0.3368, "step": 1064 }, { "epoch": 0.47691015952980687, "grad_norm": 1.323936104774475, "learning_rate": 3.925438596491228e-05, "loss": 0.4173, "step": 1065 }, { "epoch": 0.47735796249650153, "grad_norm": 1.270038366317749, "learning_rate": 3.924284395198522e-05, "loss": 0.4249, "step": 1066 }, { "epoch": 0.4778057654631962, "grad_norm": 1.2153494358062744, "learning_rate": 3.923130193905817e-05, "loss": 0.2474, "step": 1067 }, { "epoch": 0.47825356842989086, "grad_norm": 1.2241121530532837, "learning_rate": 3.9219759926131115e-05, "loss": 0.3937, "step": 1068 }, { "epoch": 0.47870137139658553, "grad_norm": 1.294264793395996, "learning_rate": 3.9208217913204064e-05, "loss": 0.7127, "step": 1069 }, { "epoch": 0.47914917436328014, "grad_norm": 1.1305440664291382, "learning_rate": 3.9196675900277007e-05, "loss": 0.552, "step": 1070 }, { "epoch": 0.47914917436328014, "eval_loss": 0.4095277786254883, "eval_runtime": 1740.4302, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 1070 }, { "epoch": 0.4795969773299748, "grad_norm": 1.3406230211257935, "learning_rate": 3.9185133887349956e-05, "loss": 0.4512, "step": 1071 }, { "epoch": 0.48004478029666947, "grad_norm": 1.2547788619995117, "learning_rate": 3.91735918744229e-05, "loss": 0.4066, "step": 1072 }, { "epoch": 0.48049258326336414, "grad_norm": 1.0894948244094849, "learning_rate": 3.916204986149585e-05, "loss": 0.5634, "step": 1073 }, { "epoch": 0.48094038623005875, "grad_norm": 1.1141157150268555, "learning_rate": 3.915050784856879e-05, "loss": 0.5119, "step": 1074 }, { "epoch": 0.4813881891967534, "grad_norm": 1.0408393144607544, "learning_rate": 3.913896583564174e-05, "loss": 0.3573, "step": 1075 }, { "epoch": 0.4818359921634481, "grad_norm": 1.20759117603302, "learning_rate": 3.912742382271468e-05, "loss": 0.3264, "step": 1076 }, { "epoch": 0.48228379513014275, "grad_norm": 1.1531039476394653, "learning_rate": 3.911588180978763e-05, "loss": 0.4826, "step": 1077 }, { "epoch": 0.4827315980968374, "grad_norm": 1.6651794910430908, "learning_rate": 3.9104339796860575e-05, "loss": 0.4274, "step": 1078 }, { "epoch": 0.483179401063532, "grad_norm": 0.916927695274353, "learning_rate": 3.9092797783933524e-05, "loss": 0.3246, "step": 1079 }, { "epoch": 0.4836272040302267, "grad_norm": 1.0637052059173584, "learning_rate": 3.9081255771006467e-05, "loss": 0.3588, "step": 1080 }, { "epoch": 0.4836272040302267, "eval_loss": 0.40346789360046387, "eval_runtime": 1739.6386, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 1080 }, { "epoch": 0.48407500699692135, "grad_norm": 1.419047236442566, "learning_rate": 3.9069713758079416e-05, "loss": 0.2933, "step": 1081 }, { "epoch": 0.484522809963616, "grad_norm": 1.3041396141052246, "learning_rate": 3.905817174515236e-05, "loss": 0.6344, "step": 1082 }, { "epoch": 0.4849706129303107, "grad_norm": 1.1709884405136108, "learning_rate": 3.90466297322253e-05, "loss": 0.7341, "step": 1083 }, { "epoch": 0.4854184158970053, "grad_norm": 1.2271945476531982, "learning_rate": 3.9035087719298244e-05, "loss": 0.6506, "step": 1084 }, { "epoch": 0.48586621886369996, "grad_norm": 1.1007139682769775, "learning_rate": 3.902354570637119e-05, "loss": 0.4708, "step": 1085 }, { "epoch": 0.4863140218303946, "grad_norm": 1.242135763168335, "learning_rate": 3.9012003693444136e-05, "loss": 0.6122, "step": 1086 }, { "epoch": 0.4867618247970893, "grad_norm": 1.1739150285720825, "learning_rate": 3.9000461680517085e-05, "loss": 0.23, "step": 1087 }, { "epoch": 0.48720962776378396, "grad_norm": 1.2357375621795654, "learning_rate": 3.898891966759003e-05, "loss": 0.3669, "step": 1088 }, { "epoch": 0.48765743073047857, "grad_norm": 0.9539027214050293, "learning_rate": 3.897737765466298e-05, "loss": 0.3564, "step": 1089 }, { "epoch": 0.48810523369717324, "grad_norm": 1.1826626062393188, "learning_rate": 3.896583564173592e-05, "loss": 0.5192, "step": 1090 }, { "epoch": 0.48810523369717324, "eval_loss": 0.39695626497268677, "eval_runtime": 1739.8638, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 1090 }, { "epoch": 0.4885530366638679, "grad_norm": 1.36544668674469, "learning_rate": 3.895429362880887e-05, "loss": 0.4346, "step": 1091 }, { "epoch": 0.48900083963056257, "grad_norm": 1.2532953023910522, "learning_rate": 3.894275161588181e-05, "loss": 0.3558, "step": 1092 }, { "epoch": 0.48944864259725723, "grad_norm": 1.2060133218765259, "learning_rate": 3.8931209602954754e-05, "loss": 0.354, "step": 1093 }, { "epoch": 0.48989644556395184, "grad_norm": 1.3531482219696045, "learning_rate": 3.8919667590027704e-05, "loss": 0.7354, "step": 1094 }, { "epoch": 0.4903442485306465, "grad_norm": 1.2521357536315918, "learning_rate": 3.8908125577100646e-05, "loss": 0.4558, "step": 1095 }, { "epoch": 0.4907920514973412, "grad_norm": 1.319419264793396, "learning_rate": 3.8896583564173596e-05, "loss": 0.4897, "step": 1096 }, { "epoch": 0.49123985446403584, "grad_norm": 1.3026233911514282, "learning_rate": 3.888504155124654e-05, "loss": 0.5194, "step": 1097 }, { "epoch": 0.49168765743073045, "grad_norm": 1.3030741214752197, "learning_rate": 3.887349953831949e-05, "loss": 0.5338, "step": 1098 }, { "epoch": 0.4921354603974251, "grad_norm": 1.2530677318572998, "learning_rate": 3.886195752539243e-05, "loss": 0.3714, "step": 1099 }, { "epoch": 0.4925832633641198, "grad_norm": 1.1098151206970215, "learning_rate": 3.885041551246538e-05, "loss": 0.3104, "step": 1100 }, { "epoch": 0.4925832633641198, "eval_loss": 0.39361706376075745, "eval_runtime": 1739.4256, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 1100 }, { "epoch": 0.49303106633081445, "grad_norm": 1.1043074131011963, "learning_rate": 3.883887349953832e-05, "loss": 0.4308, "step": 1101 }, { "epoch": 0.4934788692975091, "grad_norm": 1.0311301946640015, "learning_rate": 3.8827331486611265e-05, "loss": 0.2908, "step": 1102 }, { "epoch": 0.4939266722642037, "grad_norm": 1.250179409980774, "learning_rate": 3.8815789473684214e-05, "loss": 0.4199, "step": 1103 }, { "epoch": 0.4943744752308984, "grad_norm": 1.0847578048706055, "learning_rate": 3.880424746075716e-05, "loss": 0.474, "step": 1104 }, { "epoch": 0.49482227819759306, "grad_norm": 1.0442312955856323, "learning_rate": 3.87927054478301e-05, "loss": 0.2112, "step": 1105 }, { "epoch": 0.4952700811642877, "grad_norm": 1.09678316116333, "learning_rate": 3.878116343490305e-05, "loss": 0.3507, "step": 1106 }, { "epoch": 0.4957178841309824, "grad_norm": 0.8428501486778259, "learning_rate": 3.876962142197599e-05, "loss": 0.1935, "step": 1107 }, { "epoch": 0.496165687097677, "grad_norm": 1.255764365196228, "learning_rate": 3.875807940904894e-05, "loss": 0.4886, "step": 1108 }, { "epoch": 0.49661349006437167, "grad_norm": 1.2545994520187378, "learning_rate": 3.8746537396121883e-05, "loss": 0.4899, "step": 1109 }, { "epoch": 0.49706129303106633, "grad_norm": 0.9944257140159607, "learning_rate": 3.873499538319483e-05, "loss": 0.3231, "step": 1110 }, { "epoch": 0.49706129303106633, "eval_loss": 0.388060063123703, "eval_runtime": 1739.6113, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 1110 }, { "epoch": 0.497509095997761, "grad_norm": 1.3695605993270874, "learning_rate": 3.8723453370267775e-05, "loss": 0.3714, "step": 1111 }, { "epoch": 0.49795689896445566, "grad_norm": 1.5230897665023804, "learning_rate": 3.8711911357340725e-05, "loss": 0.8132, "step": 1112 }, { "epoch": 0.4984047019311503, "grad_norm": 1.4187443256378174, "learning_rate": 3.870036934441367e-05, "loss": 0.4506, "step": 1113 }, { "epoch": 0.49885250489784494, "grad_norm": 1.4074289798736572, "learning_rate": 3.868882733148662e-05, "loss": 0.4261, "step": 1114 }, { "epoch": 0.4993003078645396, "grad_norm": 1.161434292793274, "learning_rate": 3.867728531855956e-05, "loss": 0.4857, "step": 1115 }, { "epoch": 0.49974811083123427, "grad_norm": 1.053652048110962, "learning_rate": 3.866574330563251e-05, "loss": 0.2793, "step": 1116 }, { "epoch": 0.5001959137979289, "grad_norm": 1.2816743850708008, "learning_rate": 3.865420129270545e-05, "loss": 0.4757, "step": 1117 }, { "epoch": 0.5006437167646236, "grad_norm": 1.2309850454330444, "learning_rate": 3.8642659279778394e-05, "loss": 0.4121, "step": 1118 }, { "epoch": 0.5010915197313183, "grad_norm": 1.3217289447784424, "learning_rate": 3.8631117266851343e-05, "loss": 0.4224, "step": 1119 }, { "epoch": 0.5015393226980128, "grad_norm": 1.1132780313491821, "learning_rate": 3.8619575253924286e-05, "loss": 0.4038, "step": 1120 }, { "epoch": 0.5015393226980128, "eval_loss": 0.38312047719955444, "eval_runtime": 1739.3273, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 1120 }, { "epoch": 0.5019871256647075, "grad_norm": 1.0189847946166992, "learning_rate": 3.8608033240997235e-05, "loss": 0.3739, "step": 1121 }, { "epoch": 0.5024349286314022, "grad_norm": 0.9733580350875854, "learning_rate": 3.859649122807018e-05, "loss": 0.3766, "step": 1122 }, { "epoch": 0.5028827315980968, "grad_norm": 1.2229063510894775, "learning_rate": 3.858494921514312e-05, "loss": 0.4201, "step": 1123 }, { "epoch": 0.5033305345647915, "grad_norm": 1.1701637506484985, "learning_rate": 3.857340720221606e-05, "loss": 0.4795, "step": 1124 }, { "epoch": 0.5037783375314862, "grad_norm": 1.2924901247024536, "learning_rate": 3.856186518928901e-05, "loss": 0.4878, "step": 1125 }, { "epoch": 0.5042261404981808, "grad_norm": 1.1813278198242188, "learning_rate": 3.8550323176361955e-05, "loss": 0.402, "step": 1126 }, { "epoch": 0.5046739434648755, "grad_norm": 1.4923017024993896, "learning_rate": 3.8538781163434905e-05, "loss": 0.464, "step": 1127 }, { "epoch": 0.5051217464315702, "grad_norm": 1.1882212162017822, "learning_rate": 3.852723915050785e-05, "loss": 0.2678, "step": 1128 }, { "epoch": 0.5055695493982648, "grad_norm": 0.9495273232460022, "learning_rate": 3.8515697137580797e-05, "loss": 0.2749, "step": 1129 }, { "epoch": 0.5060173523649594, "grad_norm": 1.1850472688674927, "learning_rate": 3.850415512465374e-05, "loss": 0.2712, "step": 1130 }, { "epoch": 0.5060173523649594, "eval_loss": 0.3788580894470215, "eval_runtime": 1739.553, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 1130 }, { "epoch": 0.506465155331654, "grad_norm": 1.2247016429901123, "learning_rate": 3.849261311172669e-05, "loss": 0.5771, "step": 1131 }, { "epoch": 0.5069129582983487, "grad_norm": 1.2585265636444092, "learning_rate": 3.848107109879963e-05, "loss": 0.4273, "step": 1132 }, { "epoch": 0.5073607612650434, "grad_norm": 1.1470332145690918, "learning_rate": 3.846952908587258e-05, "loss": 0.4164, "step": 1133 }, { "epoch": 0.507808564231738, "grad_norm": 1.2961275577545166, "learning_rate": 3.845798707294552e-05, "loss": 0.4638, "step": 1134 }, { "epoch": 0.5082563671984327, "grad_norm": 1.2521647214889526, "learning_rate": 3.844644506001847e-05, "loss": 0.2843, "step": 1135 }, { "epoch": 0.5087041701651274, "grad_norm": 1.1778457164764404, "learning_rate": 3.8434903047091415e-05, "loss": 0.4627, "step": 1136 }, { "epoch": 0.509151973131822, "grad_norm": 1.1717246770858765, "learning_rate": 3.8423361034164365e-05, "loss": 0.386, "step": 1137 }, { "epoch": 0.5095997760985167, "grad_norm": 1.1839237213134766, "learning_rate": 3.841181902123731e-05, "loss": 0.35, "step": 1138 }, { "epoch": 0.5100475790652113, "grad_norm": 1.2348573207855225, "learning_rate": 3.8400277008310257e-05, "loss": 0.5258, "step": 1139 }, { "epoch": 0.5104953820319059, "grad_norm": 1.4483014345169067, "learning_rate": 3.83887349953832e-05, "loss": 0.3698, "step": 1140 }, { "epoch": 0.5104953820319059, "eval_loss": 0.3735828101634979, "eval_runtime": 1738.8301, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 1140 }, { "epoch": 0.5109431849986006, "grad_norm": 1.017661452293396, "learning_rate": 3.837719298245614e-05, "loss": 0.2918, "step": 1141 }, { "epoch": 0.5113909879652953, "grad_norm": 1.1523457765579224, "learning_rate": 3.836565096952909e-05, "loss": 0.4667, "step": 1142 }, { "epoch": 0.5118387909319899, "grad_norm": 1.0640584230422974, "learning_rate": 3.8354108956602034e-05, "loss": 0.3532, "step": 1143 }, { "epoch": 0.5122865938986846, "grad_norm": 1.2194321155548096, "learning_rate": 3.8342566943674976e-05, "loss": 0.4472, "step": 1144 }, { "epoch": 0.5127343968653792, "grad_norm": 1.1272408962249756, "learning_rate": 3.833102493074792e-05, "loss": 0.3863, "step": 1145 }, { "epoch": 0.5131821998320739, "grad_norm": 1.1771918535232544, "learning_rate": 3.831948291782087e-05, "loss": 0.3669, "step": 1146 }, { "epoch": 0.5136300027987686, "grad_norm": 1.154484748840332, "learning_rate": 3.830794090489381e-05, "loss": 0.4898, "step": 1147 }, { "epoch": 0.5140778057654632, "grad_norm": 1.1640018224716187, "learning_rate": 3.829639889196676e-05, "loss": 0.2343, "step": 1148 }, { "epoch": 0.5145256087321578, "grad_norm": 1.1481584310531616, "learning_rate": 3.82848568790397e-05, "loss": 0.2811, "step": 1149 }, { "epoch": 0.5149734116988525, "grad_norm": 1.322683334350586, "learning_rate": 3.827331486611265e-05, "loss": 0.279, "step": 1150 }, { "epoch": 0.5149734116988525, "eval_loss": 0.36834877729415894, "eval_runtime": 1738.7448, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 1150 }, { "epoch": 0.5154212146655471, "grad_norm": 1.4827194213867188, "learning_rate": 3.8261772853185595e-05, "loss": 0.4846, "step": 1151 }, { "epoch": 0.5158690176322418, "grad_norm": 1.0381073951721191, "learning_rate": 3.8250230840258544e-05, "loss": 0.2628, "step": 1152 }, { "epoch": 0.5163168205989365, "grad_norm": 1.0377421379089355, "learning_rate": 3.823868882733149e-05, "loss": 0.2416, "step": 1153 }, { "epoch": 0.5167646235656311, "grad_norm": 1.114513635635376, "learning_rate": 3.8227146814404436e-05, "loss": 0.3389, "step": 1154 }, { "epoch": 0.5172124265323258, "grad_norm": 1.1325706243515015, "learning_rate": 3.821560480147738e-05, "loss": 0.5003, "step": 1155 }, { "epoch": 0.5176602294990205, "grad_norm": 0.9513109922409058, "learning_rate": 3.820406278855033e-05, "loss": 0.5039, "step": 1156 }, { "epoch": 0.5181080324657151, "grad_norm": 1.3812943696975708, "learning_rate": 3.819252077562327e-05, "loss": 0.3025, "step": 1157 }, { "epoch": 0.5185558354324098, "grad_norm": 1.07074773311615, "learning_rate": 3.818097876269622e-05, "loss": 0.4571, "step": 1158 }, { "epoch": 0.5190036383991044, "grad_norm": 1.3915842771530151, "learning_rate": 3.816943674976916e-05, "loss": 0.3686, "step": 1159 }, { "epoch": 0.519451441365799, "grad_norm": 1.3118082284927368, "learning_rate": 3.815789473684211e-05, "loss": 0.268, "step": 1160 }, { "epoch": 0.519451441365799, "eval_loss": 0.36535876989364624, "eval_runtime": 1738.7404, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 1160 }, { "epoch": 0.5198992443324937, "grad_norm": 1.2658312320709229, "learning_rate": 3.8146352723915055e-05, "loss": 0.4729, "step": 1161 }, { "epoch": 0.5203470472991883, "grad_norm": 1.1261812448501587, "learning_rate": 3.8134810710988e-05, "loss": 0.3151, "step": 1162 }, { "epoch": 0.520794850265883, "grad_norm": 1.354509711265564, "learning_rate": 3.812326869806094e-05, "loss": 0.4913, "step": 1163 }, { "epoch": 0.5212426532325777, "grad_norm": 1.498897910118103, "learning_rate": 3.811172668513389e-05, "loss": 0.4962, "step": 1164 }, { "epoch": 0.5216904561992723, "grad_norm": 1.4946974515914917, "learning_rate": 3.810018467220683e-05, "loss": 0.6484, "step": 1165 }, { "epoch": 0.522138259165967, "grad_norm": 1.1827718019485474, "learning_rate": 3.808864265927978e-05, "loss": 0.3115, "step": 1166 }, { "epoch": 0.5225860621326617, "grad_norm": 1.1302978992462158, "learning_rate": 3.8077100646352724e-05, "loss": 0.404, "step": 1167 }, { "epoch": 0.5230338650993562, "grad_norm": 1.1893664598464966, "learning_rate": 3.806555863342567e-05, "loss": 0.4, "step": 1168 }, { "epoch": 0.5234816680660509, "grad_norm": 1.1274092197418213, "learning_rate": 3.8054016620498616e-05, "loss": 0.4143, "step": 1169 }, { "epoch": 0.5239294710327456, "grad_norm": 1.305298924446106, "learning_rate": 3.804247460757156e-05, "loss": 0.1854, "step": 1170 }, { "epoch": 0.5239294710327456, "eval_loss": 0.3615010380744934, "eval_runtime": 1738.5553, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 1170 }, { "epoch": 0.5243772739994402, "grad_norm": 1.3721306324005127, "learning_rate": 3.803093259464451e-05, "loss": 0.5466, "step": 1171 }, { "epoch": 0.5248250769661349, "grad_norm": 1.2742910385131836, "learning_rate": 3.801939058171745e-05, "loss": 0.4656, "step": 1172 }, { "epoch": 0.5252728799328296, "grad_norm": 1.2864550352096558, "learning_rate": 3.80078485687904e-05, "loss": 0.5515, "step": 1173 }, { "epoch": 0.5257206828995242, "grad_norm": 1.3748183250427246, "learning_rate": 3.799630655586334e-05, "loss": 0.4864, "step": 1174 }, { "epoch": 0.5261684858662189, "grad_norm": 1.0401967763900757, "learning_rate": 3.798476454293629e-05, "loss": 0.3758, "step": 1175 }, { "epoch": 0.5266162888329136, "grad_norm": 1.117813229560852, "learning_rate": 3.7973222530009235e-05, "loss": 0.3174, "step": 1176 }, { "epoch": 0.5270640917996082, "grad_norm": 1.2500524520874023, "learning_rate": 3.7961680517082184e-05, "loss": 0.6011, "step": 1177 }, { "epoch": 0.5275118947663028, "grad_norm": 1.2969615459442139, "learning_rate": 3.795013850415513e-05, "loss": 0.3895, "step": 1178 }, { "epoch": 0.5279596977329974, "grad_norm": 1.301548719406128, "learning_rate": 3.7938596491228076e-05, "loss": 0.5013, "step": 1179 }, { "epoch": 0.5284075006996921, "grad_norm": 1.1384559869766235, "learning_rate": 3.792705447830102e-05, "loss": 0.4033, "step": 1180 }, { "epoch": 0.5284075006996921, "eval_loss": 0.35599184036254883, "eval_runtime": 1738.2824, "eval_samples_per_second": 2.57, "eval_steps_per_second": 2.57, "step": 1180 }, { "epoch": 0.5288553036663868, "grad_norm": 1.06693434715271, "learning_rate": 3.791551246537397e-05, "loss": 0.2809, "step": 1181 }, { "epoch": 0.5293031066330814, "grad_norm": 1.5773426294326782, "learning_rate": 3.790397045244691e-05, "loss": 0.3743, "step": 1182 }, { "epoch": 0.5297509095997761, "grad_norm": 1.1038140058517456, "learning_rate": 3.789242843951985e-05, "loss": 0.15, "step": 1183 }, { "epoch": 0.5301987125664708, "grad_norm": 1.9519281387329102, "learning_rate": 3.7880886426592796e-05, "loss": 0.5218, "step": 1184 }, { "epoch": 0.5306465155331654, "grad_norm": 1.0286353826522827, "learning_rate": 3.7869344413665745e-05, "loss": 0.3094, "step": 1185 }, { "epoch": 0.5310943184998601, "grad_norm": 1.140738606452942, "learning_rate": 3.785780240073869e-05, "loss": 0.3357, "step": 1186 }, { "epoch": 0.5315421214665547, "grad_norm": 1.3645200729370117, "learning_rate": 3.784626038781164e-05, "loss": 0.4584, "step": 1187 }, { "epoch": 0.5319899244332493, "grad_norm": 1.1500276327133179, "learning_rate": 3.783471837488458e-05, "loss": 0.4634, "step": 1188 }, { "epoch": 0.532437727399944, "grad_norm": 1.3467127084732056, "learning_rate": 3.782317636195753e-05, "loss": 0.3407, "step": 1189 }, { "epoch": 0.5328855303666387, "grad_norm": 1.0754728317260742, "learning_rate": 3.781163434903047e-05, "loss": 0.4932, "step": 1190 }, { "epoch": 0.5328855303666387, "eval_loss": 0.35155370831489563, "eval_runtime": 1738.1857, "eval_samples_per_second": 2.57, "eval_steps_per_second": 2.57, "step": 1190 }, { "epoch": 0.5333333333333333, "grad_norm": 1.2793216705322266, "learning_rate": 3.780009233610342e-05, "loss": 0.3066, "step": 1191 }, { "epoch": 0.533781136300028, "grad_norm": 1.366106390953064, "learning_rate": 3.7788550323176364e-05, "loss": 0.2695, "step": 1192 }, { "epoch": 0.5342289392667227, "grad_norm": 1.2858949899673462, "learning_rate": 3.7777008310249306e-05, "loss": 0.4392, "step": 1193 }, { "epoch": 0.5346767422334173, "grad_norm": 1.2085633277893066, "learning_rate": 3.7765466297322256e-05, "loss": 0.4836, "step": 1194 }, { "epoch": 0.535124545200112, "grad_norm": 1.352075457572937, "learning_rate": 3.77539242843952e-05, "loss": 0.2062, "step": 1195 }, { "epoch": 0.5355723481668067, "grad_norm": 0.9742631912231445, "learning_rate": 3.774238227146815e-05, "loss": 0.2527, "step": 1196 }, { "epoch": 0.5360201511335012, "grad_norm": 1.268314242362976, "learning_rate": 3.773084025854109e-05, "loss": 0.387, "step": 1197 }, { "epoch": 0.5364679541001959, "grad_norm": 1.1327756643295288, "learning_rate": 3.771929824561404e-05, "loss": 0.2698, "step": 1198 }, { "epoch": 0.5369157570668905, "grad_norm": 1.1978434324264526, "learning_rate": 3.770775623268698e-05, "loss": 0.2696, "step": 1199 }, { "epoch": 0.5373635600335852, "grad_norm": 1.2355518341064453, "learning_rate": 3.769621421975993e-05, "loss": 0.3093, "step": 1200 }, { "epoch": 0.5373635600335852, "eval_loss": 0.34780725836753845, "eval_runtime": 1738.201, "eval_samples_per_second": 2.57, "eval_steps_per_second": 2.57, "step": 1200 }, { "epoch": 0.5378113630002799, "grad_norm": 0.8600719571113586, "learning_rate": 3.7684672206832874e-05, "loss": 0.1402, "step": 1201 }, { "epoch": 0.5382591659669745, "grad_norm": 1.3365446329116821, "learning_rate": 3.767313019390582e-05, "loss": 0.251, "step": 1202 }, { "epoch": 0.5387069689336692, "grad_norm": 1.1617084741592407, "learning_rate": 3.7661588180978766e-05, "loss": 0.3694, "step": 1203 }, { "epoch": 0.5391547719003639, "grad_norm": 1.145098328590393, "learning_rate": 3.765004616805171e-05, "loss": 0.4229, "step": 1204 }, { "epoch": 0.5396025748670585, "grad_norm": 1.3333810567855835, "learning_rate": 3.763850415512465e-05, "loss": 0.2195, "step": 1205 }, { "epoch": 0.5400503778337532, "grad_norm": 1.2409133911132812, "learning_rate": 3.76269621421976e-05, "loss": 0.4328, "step": 1206 }, { "epoch": 0.5404981808004478, "grad_norm": 1.194001317024231, "learning_rate": 3.7615420129270544e-05, "loss": 0.2599, "step": 1207 }, { "epoch": 0.5409459837671424, "grad_norm": 1.3317934274673462, "learning_rate": 3.760387811634349e-05, "loss": 0.3035, "step": 1208 }, { "epoch": 0.5413937867338371, "grad_norm": 1.0461061000823975, "learning_rate": 3.7592336103416436e-05, "loss": 0.1203, "step": 1209 }, { "epoch": 0.5418415897005318, "grad_norm": 1.2445271015167236, "learning_rate": 3.7580794090489385e-05, "loss": 0.2857, "step": 1210 }, { "epoch": 0.5418415897005318, "eval_loss": 0.3450431227684021, "eval_runtime": 1738.2005, "eval_samples_per_second": 2.57, "eval_steps_per_second": 2.57, "step": 1210 }, { "epoch": 0.5422893926672264, "grad_norm": 1.0992363691329956, "learning_rate": 3.756925207756233e-05, "loss": 0.2729, "step": 1211 }, { "epoch": 0.5427371956339211, "grad_norm": 1.0799282789230347, "learning_rate": 3.755771006463528e-05, "loss": 0.3273, "step": 1212 }, { "epoch": 0.5431849986006158, "grad_norm": 1.0961830615997314, "learning_rate": 3.754616805170822e-05, "loss": 0.3408, "step": 1213 }, { "epoch": 0.5436328015673104, "grad_norm": 1.1589432954788208, "learning_rate": 3.753462603878117e-05, "loss": 0.2953, "step": 1214 }, { "epoch": 0.5440806045340051, "grad_norm": 1.4813915491104126, "learning_rate": 3.752308402585411e-05, "loss": 0.5627, "step": 1215 }, { "epoch": 0.5445284075006996, "grad_norm": 1.0269968509674072, "learning_rate": 3.751154201292706e-05, "loss": 0.3505, "step": 1216 }, { "epoch": 0.5449762104673943, "grad_norm": 1.140181541442871, "learning_rate": 3.7500000000000003e-05, "loss": 0.2568, "step": 1217 }, { "epoch": 0.545424013434089, "grad_norm": 1.115614414215088, "learning_rate": 3.7488457987072946e-05, "loss": 0.2442, "step": 1218 }, { "epoch": 0.5458718164007836, "grad_norm": 1.0412161350250244, "learning_rate": 3.7476915974145895e-05, "loss": 0.2746, "step": 1219 }, { "epoch": 0.5463196193674783, "grad_norm": 1.283544659614563, "learning_rate": 3.746537396121884e-05, "loss": 0.2452, "step": 1220 }, { "epoch": 0.5463196193674783, "eval_loss": 0.34167274832725525, "eval_runtime": 1738.5376, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 1220 }, { "epoch": 0.546767422334173, "grad_norm": 1.2286393642425537, "learning_rate": 3.745383194829179e-05, "loss": 0.3005, "step": 1221 }, { "epoch": 0.5472152253008676, "grad_norm": 1.4023370742797852, "learning_rate": 3.744228993536473e-05, "loss": 0.3721, "step": 1222 }, { "epoch": 0.5476630282675623, "grad_norm": 1.4825446605682373, "learning_rate": 3.743074792243767e-05, "loss": 0.3667, "step": 1223 }, { "epoch": 0.548110831234257, "grad_norm": 1.1614080667495728, "learning_rate": 3.7419205909510615e-05, "loss": 0.249, "step": 1224 }, { "epoch": 0.5485586342009516, "grad_norm": 1.1903398036956787, "learning_rate": 3.7407663896583565e-05, "loss": 0.3123, "step": 1225 }, { "epoch": 0.5490064371676462, "grad_norm": 1.2279646396636963, "learning_rate": 3.739612188365651e-05, "loss": 0.3751, "step": 1226 }, { "epoch": 0.5494542401343409, "grad_norm": 1.272114634513855, "learning_rate": 3.738457987072946e-05, "loss": 0.2775, "step": 1227 }, { "epoch": 0.5499020431010355, "grad_norm": 1.0231918096542358, "learning_rate": 3.73730378578024e-05, "loss": 0.2722, "step": 1228 }, { "epoch": 0.5503498460677302, "grad_norm": 1.0256974697113037, "learning_rate": 3.736149584487535e-05, "loss": 0.3589, "step": 1229 }, { "epoch": 0.5507976490344249, "grad_norm": 1.0507721900939941, "learning_rate": 3.734995383194829e-05, "loss": 0.2636, "step": 1230 }, { "epoch": 0.5507976490344249, "eval_loss": 0.33938801288604736, "eval_runtime": 1737.4881, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 1230 }, { "epoch": 0.5512454520011195, "grad_norm": 1.433797836303711, "learning_rate": 3.733841181902124e-05, "loss": 0.2577, "step": 1231 }, { "epoch": 0.5516932549678142, "grad_norm": 1.160000205039978, "learning_rate": 3.732686980609418e-05, "loss": 0.3427, "step": 1232 }, { "epoch": 0.5521410579345088, "grad_norm": 1.1534769535064697, "learning_rate": 3.731532779316713e-05, "loss": 0.2084, "step": 1233 }, { "epoch": 0.5525888609012035, "grad_norm": 1.4578096866607666, "learning_rate": 3.7303785780240075e-05, "loss": 0.398, "step": 1234 }, { "epoch": 0.5530366638678981, "grad_norm": 1.6965374946594238, "learning_rate": 3.7292243767313025e-05, "loss": 0.5219, "step": 1235 }, { "epoch": 0.5534844668345927, "grad_norm": 1.1840635538101196, "learning_rate": 3.728070175438597e-05, "loss": 0.3324, "step": 1236 }, { "epoch": 0.5539322698012874, "grad_norm": 1.2262353897094727, "learning_rate": 3.726915974145892e-05, "loss": 0.3844, "step": 1237 }, { "epoch": 0.5543800727679821, "grad_norm": 1.120253086090088, "learning_rate": 3.725761772853186e-05, "loss": 0.2836, "step": 1238 }, { "epoch": 0.5548278757346767, "grad_norm": 1.3321200609207153, "learning_rate": 3.724607571560481e-05, "loss": 0.4458, "step": 1239 }, { "epoch": 0.5552756787013714, "grad_norm": 1.2431046962738037, "learning_rate": 3.723453370267775e-05, "loss": 0.3338, "step": 1240 }, { "epoch": 0.5552756787013714, "eval_loss": 0.33455634117126465, "eval_runtime": 1735.9972, "eval_samples_per_second": 2.573, "eval_steps_per_second": 2.573, "step": 1240 }, { "epoch": 0.5557234816680661, "grad_norm": 1.113531231880188, "learning_rate": 3.7222991689750694e-05, "loss": 0.1868, "step": 1241 }, { "epoch": 0.5561712846347607, "grad_norm": 1.1995195150375366, "learning_rate": 3.7211449676823636e-05, "loss": 0.5076, "step": 1242 }, { "epoch": 0.5566190876014554, "grad_norm": 1.3281294107437134, "learning_rate": 3.7199907663896586e-05, "loss": 0.3138, "step": 1243 }, { "epoch": 0.5570668905681501, "grad_norm": 1.101189136505127, "learning_rate": 3.718836565096953e-05, "loss": 0.3515, "step": 1244 }, { "epoch": 0.5575146935348446, "grad_norm": 1.8178327083587646, "learning_rate": 3.717682363804247e-05, "loss": 0.2748, "step": 1245 }, { "epoch": 0.5579624965015393, "grad_norm": 1.3038519620895386, "learning_rate": 3.716528162511542e-05, "loss": 0.2843, "step": 1246 }, { "epoch": 0.558410299468234, "grad_norm": 1.1656179428100586, "learning_rate": 3.715373961218836e-05, "loss": 0.3318, "step": 1247 }, { "epoch": 0.5588581024349286, "grad_norm": 1.280800223350525, "learning_rate": 3.714219759926131e-05, "loss": 0.4303, "step": 1248 }, { "epoch": 0.5593059054016233, "grad_norm": 1.352533221244812, "learning_rate": 3.7130655586334255e-05, "loss": 0.3739, "step": 1249 }, { "epoch": 0.559753708368318, "grad_norm": 1.1881428956985474, "learning_rate": 3.7119113573407204e-05, "loss": 0.1833, "step": 1250 }, { "epoch": 0.559753708368318, "eval_loss": 0.33061400055885315, "eval_runtime": 1730.969, "eval_samples_per_second": 2.581, "eval_steps_per_second": 2.581, "step": 1250 }, { "epoch": 0.5602015113350126, "grad_norm": 0.903508722782135, "learning_rate": 3.710757156048015e-05, "loss": 0.1424, "step": 1251 }, { "epoch": 0.5606493143017073, "grad_norm": 1.3883750438690186, "learning_rate": 3.7096029547553096e-05, "loss": 0.5074, "step": 1252 }, { "epoch": 0.561097117268402, "grad_norm": 1.451582431793213, "learning_rate": 3.708448753462604e-05, "loss": 0.4956, "step": 1253 }, { "epoch": 0.5615449202350966, "grad_norm": 1.3098926544189453, "learning_rate": 3.707294552169899e-05, "loss": 0.3404, "step": 1254 }, { "epoch": 0.5619927232017912, "grad_norm": 1.2385728359222412, "learning_rate": 3.706140350877193e-05, "loss": 0.1917, "step": 1255 }, { "epoch": 0.5624405261684858, "grad_norm": 1.0623195171356201, "learning_rate": 3.704986149584488e-05, "loss": 0.2205, "step": 1256 }, { "epoch": 0.5628883291351805, "grad_norm": 1.0155680179595947, "learning_rate": 3.703831948291782e-05, "loss": 0.3373, "step": 1257 }, { "epoch": 0.5633361321018752, "grad_norm": 1.3060396909713745, "learning_rate": 3.702677746999077e-05, "loss": 0.4576, "step": 1258 }, { "epoch": 0.5637839350685698, "grad_norm": 0.9929831027984619, "learning_rate": 3.7015235457063715e-05, "loss": 0.1533, "step": 1259 }, { "epoch": 0.5642317380352645, "grad_norm": 1.223260760307312, "learning_rate": 3.7003693444136664e-05, "loss": 0.3359, "step": 1260 }, { "epoch": 0.5642317380352645, "eval_loss": 0.32611918449401855, "eval_runtime": 1729.8713, "eval_samples_per_second": 2.582, "eval_steps_per_second": 2.582, "step": 1260 }, { "epoch": 0.5646795410019592, "grad_norm": 1.2260444164276123, "learning_rate": 3.699215143120961e-05, "loss": 0.3395, "step": 1261 }, { "epoch": 0.5651273439686538, "grad_norm": 1.2700910568237305, "learning_rate": 3.698060941828255e-05, "loss": 0.2942, "step": 1262 }, { "epoch": 0.5655751469353485, "grad_norm": 1.1794884204864502, "learning_rate": 3.696906740535549e-05, "loss": 0.3885, "step": 1263 }, { "epoch": 0.566022949902043, "grad_norm": 1.5495160818099976, "learning_rate": 3.695752539242844e-05, "loss": 0.442, "step": 1264 }, { "epoch": 0.5664707528687377, "grad_norm": 1.0414493083953857, "learning_rate": 3.6945983379501384e-05, "loss": 0.227, "step": 1265 }, { "epoch": 0.5669185558354324, "grad_norm": 1.0589414834976196, "learning_rate": 3.6934441366574334e-05, "loss": 0.211, "step": 1266 }, { "epoch": 0.567366358802127, "grad_norm": 1.1238470077514648, "learning_rate": 3.6922899353647276e-05, "loss": 0.4103, "step": 1267 }, { "epoch": 0.5678141617688217, "grad_norm": 1.4189252853393555, "learning_rate": 3.691135734072022e-05, "loss": 0.487, "step": 1268 }, { "epoch": 0.5682619647355164, "grad_norm": 1.078750491142273, "learning_rate": 3.689981532779317e-05, "loss": 0.2973, "step": 1269 }, { "epoch": 0.568709767702211, "grad_norm": 1.2650834321975708, "learning_rate": 3.688827331486611e-05, "loss": 0.2079, "step": 1270 }, { "epoch": 0.568709767702211, "eval_loss": 0.32332947850227356, "eval_runtime": 1738.8495, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 1270 }, { "epoch": 0.5691575706689057, "grad_norm": 1.3321959972381592, "learning_rate": 3.687673130193906e-05, "loss": 0.3667, "step": 1271 }, { "epoch": 0.5696053736356004, "grad_norm": 1.6029940843582153, "learning_rate": 3.6865189289012e-05, "loss": 0.4847, "step": 1272 }, { "epoch": 0.570053176602295, "grad_norm": 1.2480456829071045, "learning_rate": 3.685364727608495e-05, "loss": 0.3311, "step": 1273 }, { "epoch": 0.5705009795689896, "grad_norm": 1.4163548946380615, "learning_rate": 3.6842105263157895e-05, "loss": 0.2867, "step": 1274 }, { "epoch": 0.5709487825356843, "grad_norm": 1.233312964439392, "learning_rate": 3.6830563250230844e-05, "loss": 0.2895, "step": 1275 }, { "epoch": 0.5713965855023789, "grad_norm": 1.4446786642074585, "learning_rate": 3.681902123730379e-05, "loss": 0.3115, "step": 1276 }, { "epoch": 0.5718443884690736, "grad_norm": 1.1519650220870972, "learning_rate": 3.6807479224376736e-05, "loss": 0.2677, "step": 1277 }, { "epoch": 0.5722921914357683, "grad_norm": 1.3036447763442993, "learning_rate": 3.679593721144968e-05, "loss": 0.497, "step": 1278 }, { "epoch": 0.5727399944024629, "grad_norm": 1.2030128240585327, "learning_rate": 3.678439519852263e-05, "loss": 0.2617, "step": 1279 }, { "epoch": 0.5731877973691576, "grad_norm": 1.4113270044326782, "learning_rate": 3.677285318559557e-05, "loss": 0.3102, "step": 1280 }, { "epoch": 0.5731877973691576, "eval_loss": 0.3202439546585083, "eval_runtime": 1741.4969, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 1280 }, { "epoch": 0.5736356003358523, "grad_norm": 1.2645702362060547, "learning_rate": 3.676131117266851e-05, "loss": 0.5035, "step": 1281 }, { "epoch": 0.5740834033025469, "grad_norm": 0.8573437929153442, "learning_rate": 3.674976915974146e-05, "loss": 0.2647, "step": 1282 }, { "epoch": 0.5745312062692415, "grad_norm": 1.2316564321517944, "learning_rate": 3.6738227146814405e-05, "loss": 0.3333, "step": 1283 }, { "epoch": 0.5749790092359361, "grad_norm": 1.1025688648223877, "learning_rate": 3.672668513388735e-05, "loss": 0.4113, "step": 1284 }, { "epoch": 0.5754268122026308, "grad_norm": 1.2769172191619873, "learning_rate": 3.67151431209603e-05, "loss": 0.3261, "step": 1285 }, { "epoch": 0.5758746151693255, "grad_norm": 1.4074493646621704, "learning_rate": 3.670360110803324e-05, "loss": 0.5384, "step": 1286 }, { "epoch": 0.5763224181360201, "grad_norm": 1.248610258102417, "learning_rate": 3.669205909510619e-05, "loss": 0.3164, "step": 1287 }, { "epoch": 0.5767702211027148, "grad_norm": 1.2112908363342285, "learning_rate": 3.668051708217913e-05, "loss": 0.4445, "step": 1288 }, { "epoch": 0.5772180240694095, "grad_norm": 1.2379544973373413, "learning_rate": 3.666897506925208e-05, "loss": 0.38, "step": 1289 }, { "epoch": 0.5776658270361041, "grad_norm": 1.0747911930084229, "learning_rate": 3.6657433056325024e-05, "loss": 0.3254, "step": 1290 }, { "epoch": 0.5776658270361041, "eval_loss": 0.3167678415775299, "eval_runtime": 1742.3404, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 1290 }, { "epoch": 0.5781136300027988, "grad_norm": 0.9601860046386719, "learning_rate": 3.664589104339797e-05, "loss": 0.183, "step": 1291 }, { "epoch": 0.5785614329694935, "grad_norm": 0.8449709415435791, "learning_rate": 3.6634349030470916e-05, "loss": 0.1727, "step": 1292 }, { "epoch": 0.579009235936188, "grad_norm": 1.3243083953857422, "learning_rate": 3.662280701754386e-05, "loss": 0.2925, "step": 1293 }, { "epoch": 0.5794570389028827, "grad_norm": 1.3475184440612793, "learning_rate": 3.661126500461681e-05, "loss": 0.4296, "step": 1294 }, { "epoch": 0.5799048418695774, "grad_norm": 1.3963221311569214, "learning_rate": 3.659972299168975e-05, "loss": 0.5055, "step": 1295 }, { "epoch": 0.580352644836272, "grad_norm": 1.177854061126709, "learning_rate": 3.65881809787627e-05, "loss": 0.2422, "step": 1296 }, { "epoch": 0.5808004478029667, "grad_norm": 1.038023591041565, "learning_rate": 3.657663896583564e-05, "loss": 0.1763, "step": 1297 }, { "epoch": 0.5812482507696614, "grad_norm": 1.0623871088027954, "learning_rate": 3.656509695290859e-05, "loss": 0.2948, "step": 1298 }, { "epoch": 0.581696053736356, "grad_norm": 1.0913971662521362, "learning_rate": 3.6553554939981534e-05, "loss": 0.2271, "step": 1299 }, { "epoch": 0.5821438567030507, "grad_norm": 1.0985262393951416, "learning_rate": 3.6542012927054484e-05, "loss": 0.3481, "step": 1300 }, { "epoch": 0.5821438567030507, "eval_loss": 0.3124310374259949, "eval_runtime": 1741.5934, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 1300 }, { "epoch": 0.5825916596697454, "grad_norm": 1.1117430925369263, "learning_rate": 3.6530470914127426e-05, "loss": 0.4351, "step": 1301 }, { "epoch": 0.58303946263644, "grad_norm": 1.3771589994430542, "learning_rate": 3.651892890120037e-05, "loss": 0.2323, "step": 1302 }, { "epoch": 0.5834872656031346, "grad_norm": 1.3689393997192383, "learning_rate": 3.650738688827331e-05, "loss": 0.2608, "step": 1303 }, { "epoch": 0.5839350685698292, "grad_norm": 1.3155872821807861, "learning_rate": 3.649584487534626e-05, "loss": 0.3222, "step": 1304 }, { "epoch": 0.5843828715365239, "grad_norm": 1.2287445068359375, "learning_rate": 3.6484302862419204e-05, "loss": 0.2483, "step": 1305 }, { "epoch": 0.5848306745032186, "grad_norm": 1.4728074073791504, "learning_rate": 3.647276084949215e-05, "loss": 0.4875, "step": 1306 }, { "epoch": 0.5852784774699132, "grad_norm": 0.9676113128662109, "learning_rate": 3.6461218836565096e-05, "loss": 0.2232, "step": 1307 }, { "epoch": 0.5857262804366079, "grad_norm": 1.1757640838623047, "learning_rate": 3.6449676823638045e-05, "loss": 0.2085, "step": 1308 }, { "epoch": 0.5861740834033026, "grad_norm": 1.0415418148040771, "learning_rate": 3.643813481071099e-05, "loss": 0.259, "step": 1309 }, { "epoch": 0.5866218863699972, "grad_norm": 1.1401948928833008, "learning_rate": 3.642659279778394e-05, "loss": 0.2322, "step": 1310 }, { "epoch": 0.5866218863699972, "eval_loss": 0.30859696865081787, "eval_runtime": 1742.4133, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 1310 }, { "epoch": 0.5870696893366919, "grad_norm": 1.0934462547302246, "learning_rate": 3.641505078485688e-05, "loss": 0.1186, "step": 1311 }, { "epoch": 0.5875174923033865, "grad_norm": 1.0055665969848633, "learning_rate": 3.640350877192983e-05, "loss": 0.3582, "step": 1312 }, { "epoch": 0.5879652952700811, "grad_norm": 1.4080592393875122, "learning_rate": 3.639196675900277e-05, "loss": 0.3942, "step": 1313 }, { "epoch": 0.5884130982367758, "grad_norm": 1.1707267761230469, "learning_rate": 3.638042474607572e-05, "loss": 0.246, "step": 1314 }, { "epoch": 0.5888609012034705, "grad_norm": 1.1650632619857788, "learning_rate": 3.6368882733148664e-05, "loss": 0.177, "step": 1315 }, { "epoch": 0.5893087041701651, "grad_norm": 1.2004889249801636, "learning_rate": 3.635734072022161e-05, "loss": 0.2384, "step": 1316 }, { "epoch": 0.5897565071368598, "grad_norm": 1.0767138004302979, "learning_rate": 3.6345798707294556e-05, "loss": 0.2802, "step": 1317 }, { "epoch": 0.5902043101035545, "grad_norm": 1.1107332706451416, "learning_rate": 3.63342566943675e-05, "loss": 0.1619, "step": 1318 }, { "epoch": 0.5906521130702491, "grad_norm": 1.2549272775650024, "learning_rate": 3.632271468144045e-05, "loss": 0.2781, "step": 1319 }, { "epoch": 0.5910999160369438, "grad_norm": 1.3902312517166138, "learning_rate": 3.631117266851339e-05, "loss": 0.3365, "step": 1320 }, { "epoch": 0.5910999160369438, "eval_loss": 0.3084462285041809, "eval_runtime": 1741.5592, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 1320 }, { "epoch": 0.5915477190036385, "grad_norm": 1.1804687976837158, "learning_rate": 3.629963065558634e-05, "loss": 0.2902, "step": 1321 }, { "epoch": 0.591995521970333, "grad_norm": 0.9639450907707214, "learning_rate": 3.628808864265928e-05, "loss": 0.1469, "step": 1322 }, { "epoch": 0.5924433249370277, "grad_norm": 1.1308844089508057, "learning_rate": 3.6276546629732225e-05, "loss": 0.2995, "step": 1323 }, { "epoch": 0.5928911279037223, "grad_norm": 1.3623363971710205, "learning_rate": 3.626500461680517e-05, "loss": 0.4717, "step": 1324 }, { "epoch": 0.593338930870417, "grad_norm": 1.0320011377334595, "learning_rate": 3.625346260387812e-05, "loss": 0.2517, "step": 1325 }, { "epoch": 0.5937867338371117, "grad_norm": 1.0291073322296143, "learning_rate": 3.624192059095106e-05, "loss": 0.368, "step": 1326 }, { "epoch": 0.5942345368038063, "grad_norm": 0.9665423631668091, "learning_rate": 3.623037857802401e-05, "loss": 0.2019, "step": 1327 }, { "epoch": 0.594682339770501, "grad_norm": 1.0355550050735474, "learning_rate": 3.621883656509695e-05, "loss": 0.1839, "step": 1328 }, { "epoch": 0.5951301427371957, "grad_norm": 1.0803695917129517, "learning_rate": 3.62072945521699e-05, "loss": 0.2234, "step": 1329 }, { "epoch": 0.5955779457038903, "grad_norm": 1.0858778953552246, "learning_rate": 3.619575253924284e-05, "loss": 0.3195, "step": 1330 }, { "epoch": 0.5955779457038903, "eval_loss": 0.3059205412864685, "eval_runtime": 1742.4212, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 1330 }, { "epoch": 0.5960257486705849, "grad_norm": 1.0849155187606812, "learning_rate": 3.618421052631579e-05, "loss": 0.2967, "step": 1331 }, { "epoch": 0.5964735516372796, "grad_norm": 1.3284932374954224, "learning_rate": 3.6172668513388735e-05, "loss": 0.4775, "step": 1332 }, { "epoch": 0.5969213546039742, "grad_norm": 1.0310250520706177, "learning_rate": 3.6161126500461685e-05, "loss": 0.2639, "step": 1333 }, { "epoch": 0.5973691575706689, "grad_norm": 1.4501495361328125, "learning_rate": 3.614958448753463e-05, "loss": 0.4214, "step": 1334 }, { "epoch": 0.5978169605373636, "grad_norm": 0.9147229194641113, "learning_rate": 3.613804247460758e-05, "loss": 0.1917, "step": 1335 }, { "epoch": 0.5982647635040582, "grad_norm": 1.262824296951294, "learning_rate": 3.612650046168052e-05, "loss": 0.2901, "step": 1336 }, { "epoch": 0.5987125664707529, "grad_norm": 1.1430469751358032, "learning_rate": 3.611495844875347e-05, "loss": 0.3782, "step": 1337 }, { "epoch": 0.5991603694374475, "grad_norm": 1.228498101234436, "learning_rate": 3.610341643582641e-05, "loss": 0.3737, "step": 1338 }, { "epoch": 0.5996081724041422, "grad_norm": 1.1623265743255615, "learning_rate": 3.609187442289936e-05, "loss": 0.1474, "step": 1339 }, { "epoch": 0.6000559753708369, "grad_norm": 1.1078261137008667, "learning_rate": 3.60803324099723e-05, "loss": 0.2201, "step": 1340 }, { "epoch": 0.6000559753708369, "eval_loss": 0.3014572560787201, "eval_runtime": 1741.5342, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 1340 }, { "epoch": 0.6005037783375314, "grad_norm": 1.2032644748687744, "learning_rate": 3.6068790397045246e-05, "loss": 0.1858, "step": 1341 }, { "epoch": 0.6009515813042261, "grad_norm": 0.9365187883377075, "learning_rate": 3.605724838411819e-05, "loss": 0.2956, "step": 1342 }, { "epoch": 0.6013993842709208, "grad_norm": 1.3125098943710327, "learning_rate": 3.604570637119114e-05, "loss": 0.3214, "step": 1343 }, { "epoch": 0.6018471872376154, "grad_norm": 1.2828110456466675, "learning_rate": 3.603416435826408e-05, "loss": 0.3273, "step": 1344 }, { "epoch": 0.6022949902043101, "grad_norm": 1.0955266952514648, "learning_rate": 3.602262234533702e-05, "loss": 0.1819, "step": 1345 }, { "epoch": 0.6027427931710048, "grad_norm": 1.1152938604354858, "learning_rate": 3.601108033240997e-05, "loss": 0.2204, "step": 1346 }, { "epoch": 0.6031905961376994, "grad_norm": 1.3604631423950195, "learning_rate": 3.5999538319482915e-05, "loss": 0.2325, "step": 1347 }, { "epoch": 0.6036383991043941, "grad_norm": 1.0130641460418701, "learning_rate": 3.5987996306555864e-05, "loss": 0.2148, "step": 1348 }, { "epoch": 0.6040862020710888, "grad_norm": 0.9263889193534851, "learning_rate": 3.597645429362881e-05, "loss": 0.1883, "step": 1349 }, { "epoch": 0.6045340050377834, "grad_norm": 1.4137290716171265, "learning_rate": 3.5964912280701756e-05, "loss": 0.3185, "step": 1350 }, { "epoch": 0.6045340050377834, "eval_loss": 0.2987724840641022, "eval_runtime": 1740.5422, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 1350 }, { "epoch": 0.604981808004478, "grad_norm": 1.3324114084243774, "learning_rate": 3.59533702677747e-05, "loss": 0.2055, "step": 1351 }, { "epoch": 0.6054296109711726, "grad_norm": 1.4274815320968628, "learning_rate": 3.594182825484765e-05, "loss": 0.434, "step": 1352 }, { "epoch": 0.6058774139378673, "grad_norm": 1.538460612297058, "learning_rate": 3.593028624192059e-05, "loss": 0.4736, "step": 1353 }, { "epoch": 0.606325216904562, "grad_norm": 1.4670627117156982, "learning_rate": 3.591874422899354e-05, "loss": 0.5298, "step": 1354 }, { "epoch": 0.6067730198712566, "grad_norm": 1.2798479795455933, "learning_rate": 3.590720221606648e-05, "loss": 0.3156, "step": 1355 }, { "epoch": 0.6072208228379513, "grad_norm": 1.0595784187316895, "learning_rate": 3.589566020313943e-05, "loss": 0.2297, "step": 1356 }, { "epoch": 0.607668625804646, "grad_norm": 1.2012929916381836, "learning_rate": 3.5884118190212375e-05, "loss": 0.3495, "step": 1357 }, { "epoch": 0.6081164287713406, "grad_norm": 1.1378802061080933, "learning_rate": 3.5872576177285324e-05, "loss": 0.186, "step": 1358 }, { "epoch": 0.6085642317380353, "grad_norm": 1.5004686117172241, "learning_rate": 3.586103416435827e-05, "loss": 0.245, "step": 1359 }, { "epoch": 0.6090120347047299, "grad_norm": 1.2461063861846924, "learning_rate": 3.5849492151431216e-05, "loss": 0.3392, "step": 1360 }, { "epoch": 0.6090120347047299, "eval_loss": 0.29492369294166565, "eval_runtime": 1741.3422, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 1360 }, { "epoch": 0.6094598376714245, "grad_norm": 1.1116904020309448, "learning_rate": 3.583795013850416e-05, "loss": 0.496, "step": 1361 }, { "epoch": 0.6099076406381192, "grad_norm": 1.0401750802993774, "learning_rate": 3.58264081255771e-05, "loss": 0.2223, "step": 1362 }, { "epoch": 0.6103554436048139, "grad_norm": 1.5595612525939941, "learning_rate": 3.5814866112650044e-05, "loss": 0.2784, "step": 1363 }, { "epoch": 0.6108032465715085, "grad_norm": 1.3437434434890747, "learning_rate": 3.5803324099722994e-05, "loss": 0.3927, "step": 1364 }, { "epoch": 0.6112510495382032, "grad_norm": 1.167486548423767, "learning_rate": 3.5791782086795936e-05, "loss": 0.3245, "step": 1365 }, { "epoch": 0.6116988525048979, "grad_norm": 1.1190348863601685, "learning_rate": 3.5780240073868886e-05, "loss": 0.2274, "step": 1366 }, { "epoch": 0.6121466554715925, "grad_norm": 1.1168073415756226, "learning_rate": 3.576869806094183e-05, "loss": 0.2366, "step": 1367 }, { "epoch": 0.6125944584382872, "grad_norm": 1.1127575635910034, "learning_rate": 3.575715604801478e-05, "loss": 0.2578, "step": 1368 }, { "epoch": 0.6130422614049819, "grad_norm": 1.1270995140075684, "learning_rate": 3.574561403508772e-05, "loss": 0.1832, "step": 1369 }, { "epoch": 0.6134900643716764, "grad_norm": 1.4525073766708374, "learning_rate": 3.573407202216066e-05, "loss": 0.1798, "step": 1370 }, { "epoch": 0.6134900643716764, "eval_loss": 0.2917921543121338, "eval_runtime": 1741.2161, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 1370 }, { "epoch": 0.6139378673383711, "grad_norm": 1.565889596939087, "learning_rate": 3.572253000923361e-05, "loss": 0.3886, "step": 1371 }, { "epoch": 0.6143856703050657, "grad_norm": 1.1590737104415894, "learning_rate": 3.5710987996306555e-05, "loss": 0.262, "step": 1372 }, { "epoch": 0.6148334732717604, "grad_norm": 1.444347620010376, "learning_rate": 3.5699445983379504e-05, "loss": 0.2706, "step": 1373 }, { "epoch": 0.6152812762384551, "grad_norm": 1.4933663606643677, "learning_rate": 3.568790397045245e-05, "loss": 0.3262, "step": 1374 }, { "epoch": 0.6157290792051497, "grad_norm": 0.8957858681678772, "learning_rate": 3.5676361957525396e-05, "loss": 0.1987, "step": 1375 }, { "epoch": 0.6161768821718444, "grad_norm": 1.3272017240524292, "learning_rate": 3.566481994459834e-05, "loss": 0.2494, "step": 1376 }, { "epoch": 0.6166246851385391, "grad_norm": 1.256388545036316, "learning_rate": 3.565327793167129e-05, "loss": 0.4452, "step": 1377 }, { "epoch": 0.6170724881052337, "grad_norm": 1.1355270147323608, "learning_rate": 3.564173591874423e-05, "loss": 0.2524, "step": 1378 }, { "epoch": 0.6175202910719284, "grad_norm": 1.2793563604354858, "learning_rate": 3.563019390581718e-05, "loss": 0.3375, "step": 1379 }, { "epoch": 0.617968094038623, "grad_norm": 1.4125409126281738, "learning_rate": 3.561865189289012e-05, "loss": 0.4409, "step": 1380 }, { "epoch": 0.617968094038623, "eval_loss": 0.2876901924610138, "eval_runtime": 1737.7665, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 1380 }, { "epoch": 0.6184158970053176, "grad_norm": 0.9987519979476929, "learning_rate": 3.5607109879963065e-05, "loss": 0.2368, "step": 1381 }, { "epoch": 0.6188636999720123, "grad_norm": 1.0614521503448486, "learning_rate": 3.5595567867036015e-05, "loss": 0.2734, "step": 1382 }, { "epoch": 0.619311502938707, "grad_norm": 0.9792033433914185, "learning_rate": 3.558402585410896e-05, "loss": 0.1667, "step": 1383 }, { "epoch": 0.6197593059054016, "grad_norm": 1.4053237438201904, "learning_rate": 3.55724838411819e-05, "loss": 0.3517, "step": 1384 }, { "epoch": 0.6202071088720963, "grad_norm": 1.050905466079712, "learning_rate": 3.556094182825485e-05, "loss": 0.304, "step": 1385 }, { "epoch": 0.620654911838791, "grad_norm": 1.0837103128433228, "learning_rate": 3.554939981532779e-05, "loss": 0.1562, "step": 1386 }, { "epoch": 0.6211027148054856, "grad_norm": 1.3736618757247925, "learning_rate": 3.553785780240074e-05, "loss": 0.2442, "step": 1387 }, { "epoch": 0.6215505177721803, "grad_norm": 1.0219933986663818, "learning_rate": 3.5526315789473684e-05, "loss": 0.1763, "step": 1388 }, { "epoch": 0.6219983207388748, "grad_norm": 1.1732109785079956, "learning_rate": 3.551477377654663e-05, "loss": 0.2701, "step": 1389 }, { "epoch": 0.6224461237055695, "grad_norm": 1.294912576675415, "learning_rate": 3.5503231763619576e-05, "loss": 0.2805, "step": 1390 }, { "epoch": 0.6224461237055695, "eval_loss": 0.28569599986076355, "eval_runtime": 1737.3228, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 1390 }, { "epoch": 0.6228939266722642, "grad_norm": 0.9582709074020386, "learning_rate": 3.5491689750692525e-05, "loss": 0.2353, "step": 1391 }, { "epoch": 0.6233417296389588, "grad_norm": 1.2713261842727661, "learning_rate": 3.548014773776547e-05, "loss": 0.3263, "step": 1392 }, { "epoch": 0.6237895326056535, "grad_norm": 0.983315110206604, "learning_rate": 3.546860572483842e-05, "loss": 0.4056, "step": 1393 }, { "epoch": 0.6242373355723482, "grad_norm": 1.2925885915756226, "learning_rate": 3.545706371191136e-05, "loss": 0.3493, "step": 1394 }, { "epoch": 0.6246851385390428, "grad_norm": 1.1899527311325073, "learning_rate": 3.54455216989843e-05, "loss": 0.2079, "step": 1395 }, { "epoch": 0.6251329415057375, "grad_norm": 1.4036329984664917, "learning_rate": 3.543397968605725e-05, "loss": 0.3941, "step": 1396 }, { "epoch": 0.6255807444724322, "grad_norm": 1.3126832246780396, "learning_rate": 3.5422437673130195e-05, "loss": 0.2855, "step": 1397 }, { "epoch": 0.6260285474391268, "grad_norm": 0.9315616488456726, "learning_rate": 3.5410895660203144e-05, "loss": 0.1829, "step": 1398 }, { "epoch": 0.6264763504058214, "grad_norm": 1.2371937036514282, "learning_rate": 3.5399353647276087e-05, "loss": 0.3544, "step": 1399 }, { "epoch": 0.6269241533725161, "grad_norm": 1.414616346359253, "learning_rate": 3.5387811634349036e-05, "loss": 0.2595, "step": 1400 }, { "epoch": 0.6269241533725161, "eval_loss": 0.28282201290130615, "eval_runtime": 1742.7887, "eval_samples_per_second": 2.563, "eval_steps_per_second": 2.563, "step": 1400 }, { "epoch": 0.6273719563392107, "grad_norm": 1.3357312679290771, "learning_rate": 3.537626962142198e-05, "loss": 0.3193, "step": 1401 }, { "epoch": 0.6278197593059054, "grad_norm": 1.2589373588562012, "learning_rate": 3.536472760849492e-05, "loss": 0.2254, "step": 1402 }, { "epoch": 0.6282675622726, "grad_norm": 1.0175939798355103, "learning_rate": 3.5353185595567864e-05, "loss": 0.1809, "step": 1403 }, { "epoch": 0.6287153652392947, "grad_norm": 1.132478952407837, "learning_rate": 3.534164358264081e-05, "loss": 0.2762, "step": 1404 }, { "epoch": 0.6291631682059894, "grad_norm": 1.0150865316390991, "learning_rate": 3.5330101569713756e-05, "loss": 0.1599, "step": 1405 }, { "epoch": 0.629610971172684, "grad_norm": 1.0766525268554688, "learning_rate": 3.5318559556786705e-05, "loss": 0.4027, "step": 1406 }, { "epoch": 0.6300587741393787, "grad_norm": 1.0079785585403442, "learning_rate": 3.530701754385965e-05, "loss": 0.1424, "step": 1407 }, { "epoch": 0.6305065771060733, "grad_norm": 1.1688034534454346, "learning_rate": 3.52954755309326e-05, "loss": 0.2678, "step": 1408 }, { "epoch": 0.6309543800727679, "grad_norm": 1.4623137712478638, "learning_rate": 3.528393351800554e-05, "loss": 0.3958, "step": 1409 }, { "epoch": 0.6314021830394626, "grad_norm": 1.6712130308151245, "learning_rate": 3.527239150507849e-05, "loss": 0.4481, "step": 1410 }, { "epoch": 0.6314021830394626, "eval_loss": 0.28085920214653015, "eval_runtime": 1744.3896, "eval_samples_per_second": 2.561, "eval_steps_per_second": 2.561, "step": 1410 }, { "epoch": 0.6318499860061573, "grad_norm": 1.0096598863601685, "learning_rate": 3.526084949215143e-05, "loss": 0.1529, "step": 1411 }, { "epoch": 0.6322977889728519, "grad_norm": 1.2336546182632446, "learning_rate": 3.524930747922438e-05, "loss": 0.3192, "step": 1412 }, { "epoch": 0.6327455919395466, "grad_norm": 1.1255513429641724, "learning_rate": 3.5237765466297324e-05, "loss": 0.1696, "step": 1413 }, { "epoch": 0.6331933949062413, "grad_norm": 1.4015638828277588, "learning_rate": 3.522622345337027e-05, "loss": 0.3031, "step": 1414 }, { "epoch": 0.6336411978729359, "grad_norm": 1.450169324874878, "learning_rate": 3.5214681440443216e-05, "loss": 0.3528, "step": 1415 }, { "epoch": 0.6340890008396306, "grad_norm": 1.0433216094970703, "learning_rate": 3.5203139427516165e-05, "loss": 0.1812, "step": 1416 }, { "epoch": 0.6345368038063253, "grad_norm": 1.7097972631454468, "learning_rate": 3.519159741458911e-05, "loss": 0.4827, "step": 1417 }, { "epoch": 0.6349846067730198, "grad_norm": 1.2833137512207031, "learning_rate": 3.518005540166206e-05, "loss": 0.3412, "step": 1418 }, { "epoch": 0.6354324097397145, "grad_norm": 1.1409881114959717, "learning_rate": 3.5168513388735e-05, "loss": 0.2976, "step": 1419 }, { "epoch": 0.6358802127064092, "grad_norm": 1.182773232460022, "learning_rate": 3.515697137580794e-05, "loss": 0.2242, "step": 1420 }, { "epoch": 0.6358802127064092, "eval_loss": 0.2783552408218384, "eval_runtime": 1744.4292, "eval_samples_per_second": 2.561, "eval_steps_per_second": 2.561, "step": 1420 }, { "epoch": 0.6363280156731038, "grad_norm": 1.0328185558319092, "learning_rate": 3.514542936288089e-05, "loss": 0.2419, "step": 1421 }, { "epoch": 0.6367758186397985, "grad_norm": 1.4960310459136963, "learning_rate": 3.5133887349953834e-05, "loss": 0.3231, "step": 1422 }, { "epoch": 0.6372236216064932, "grad_norm": 0.8000389337539673, "learning_rate": 3.512234533702678e-05, "loss": 0.1323, "step": 1423 }, { "epoch": 0.6376714245731878, "grad_norm": 0.9059187173843384, "learning_rate": 3.511080332409972e-05, "loss": 0.1372, "step": 1424 }, { "epoch": 0.6381192275398825, "grad_norm": 1.1321302652359009, "learning_rate": 3.509926131117267e-05, "loss": 0.2905, "step": 1425 }, { "epoch": 0.6385670305065771, "grad_norm": 1.164314866065979, "learning_rate": 3.508771929824561e-05, "loss": 0.3124, "step": 1426 }, { "epoch": 0.6390148334732718, "grad_norm": 1.1748840808868408, "learning_rate": 3.507617728531856e-05, "loss": 0.3135, "step": 1427 }, { "epoch": 0.6394626364399664, "grad_norm": 1.3437206745147705, "learning_rate": 3.5064635272391503e-05, "loss": 0.3196, "step": 1428 }, { "epoch": 0.639910439406661, "grad_norm": 1.161334753036499, "learning_rate": 3.505309325946445e-05, "loss": 0.2674, "step": 1429 }, { "epoch": 0.6403582423733557, "grad_norm": 1.2514257431030273, "learning_rate": 3.5041551246537395e-05, "loss": 0.2372, "step": 1430 }, { "epoch": 0.6403582423733557, "eval_loss": 0.27350693941116333, "eval_runtime": 1743.9261, "eval_samples_per_second": 2.561, "eval_steps_per_second": 2.561, "step": 1430 }, { "epoch": 0.6408060453400504, "grad_norm": 1.1305255889892578, "learning_rate": 3.5030009233610345e-05, "loss": 0.2812, "step": 1431 }, { "epoch": 0.641253848306745, "grad_norm": 1.2445604801177979, "learning_rate": 3.501846722068329e-05, "loss": 0.1822, "step": 1432 }, { "epoch": 0.6417016512734397, "grad_norm": 1.1391922235488892, "learning_rate": 3.500692520775624e-05, "loss": 0.257, "step": 1433 }, { "epoch": 0.6421494542401344, "grad_norm": 1.5449109077453613, "learning_rate": 3.499538319482918e-05, "loss": 0.1946, "step": 1434 }, { "epoch": 0.642597257206829, "grad_norm": 1.9147025346755981, "learning_rate": 3.498384118190213e-05, "loss": 0.3978, "step": 1435 }, { "epoch": 0.6430450601735237, "grad_norm": 1.0643846988677979, "learning_rate": 3.497229916897507e-05, "loss": 0.2178, "step": 1436 }, { "epoch": 0.6434928631402183, "grad_norm": 1.2681896686553955, "learning_rate": 3.496075715604802e-05, "loss": 0.1863, "step": 1437 }, { "epoch": 0.6439406661069129, "grad_norm": 1.3740264177322388, "learning_rate": 3.494921514312096e-05, "loss": 0.1325, "step": 1438 }, { "epoch": 0.6443884690736076, "grad_norm": 1.1765505075454712, "learning_rate": 3.493767313019391e-05, "loss": 0.3691, "step": 1439 }, { "epoch": 0.6448362720403022, "grad_norm": 1.2055754661560059, "learning_rate": 3.4926131117266855e-05, "loss": 0.2446, "step": 1440 }, { "epoch": 0.6448362720403022, "eval_loss": 0.27113863825798035, "eval_runtime": 1744.5256, "eval_samples_per_second": 2.561, "eval_steps_per_second": 2.561, "step": 1440 }, { "epoch": 0.6452840750069969, "grad_norm": 0.9709833860397339, "learning_rate": 3.49145891043398e-05, "loss": 0.2387, "step": 1441 }, { "epoch": 0.6457318779736916, "grad_norm": 1.0405006408691406, "learning_rate": 3.490304709141274e-05, "loss": 0.1372, "step": 1442 }, { "epoch": 0.6461796809403862, "grad_norm": 0.8977200984954834, "learning_rate": 3.489150507848569e-05, "loss": 0.1927, "step": 1443 }, { "epoch": 0.6466274839070809, "grad_norm": 1.368783712387085, "learning_rate": 3.487996306555863e-05, "loss": 0.3845, "step": 1444 }, { "epoch": 0.6470752868737756, "grad_norm": 0.966835081577301, "learning_rate": 3.4868421052631575e-05, "loss": 0.2843, "step": 1445 }, { "epoch": 0.6475230898404702, "grad_norm": 1.244189977645874, "learning_rate": 3.4856879039704525e-05, "loss": 0.2754, "step": 1446 }, { "epoch": 0.6479708928071648, "grad_norm": 1.1181278228759766, "learning_rate": 3.484533702677747e-05, "loss": 0.2702, "step": 1447 }, { "epoch": 0.6484186957738595, "grad_norm": 1.1835464239120483, "learning_rate": 3.4833795013850417e-05, "loss": 0.3099, "step": 1448 }, { "epoch": 0.6488664987405541, "grad_norm": 1.0337269306182861, "learning_rate": 3.482225300092336e-05, "loss": 0.1738, "step": 1449 }, { "epoch": 0.6493143017072488, "grad_norm": 1.0362787246704102, "learning_rate": 3.481071098799631e-05, "loss": 0.3784, "step": 1450 }, { "epoch": 0.6493143017072488, "eval_loss": 0.26652002334594727, "eval_runtime": 1745.891, "eval_samples_per_second": 2.559, "eval_steps_per_second": 2.559, "step": 1450 }, { "epoch": 0.6497621046739435, "grad_norm": 1.4951701164245605, "learning_rate": 3.479916897506925e-05, "loss": 0.3286, "step": 1451 }, { "epoch": 0.6502099076406381, "grad_norm": 1.4321891069412231, "learning_rate": 3.47876269621422e-05, "loss": 0.418, "step": 1452 }, { "epoch": 0.6506577106073328, "grad_norm": 1.2720353603363037, "learning_rate": 3.477608494921514e-05, "loss": 0.2375, "step": 1453 }, { "epoch": 0.6511055135740275, "grad_norm": 0.9470394253730774, "learning_rate": 3.476454293628809e-05, "loss": 0.1657, "step": 1454 }, { "epoch": 0.6515533165407221, "grad_norm": 1.3401026725769043, "learning_rate": 3.4753000923361035e-05, "loss": 0.2846, "step": 1455 }, { "epoch": 0.6520011195074167, "grad_norm": 0.9690277576446533, "learning_rate": 3.4741458910433985e-05, "loss": 0.1218, "step": 1456 }, { "epoch": 0.6524489224741113, "grad_norm": 1.0513359308242798, "learning_rate": 3.472991689750693e-05, "loss": 0.193, "step": 1457 }, { "epoch": 0.652896725440806, "grad_norm": 1.2486155033111572, "learning_rate": 3.4718374884579877e-05, "loss": 0.4864, "step": 1458 }, { "epoch": 0.6533445284075007, "grad_norm": 1.1349366903305054, "learning_rate": 3.470683287165282e-05, "loss": 0.1879, "step": 1459 }, { "epoch": 0.6537923313741953, "grad_norm": 1.3820996284484863, "learning_rate": 3.469529085872577e-05, "loss": 0.4345, "step": 1460 }, { "epoch": 0.6537923313741953, "eval_loss": 0.2638843357563019, "eval_runtime": 1741.2113, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 1460 }, { "epoch": 0.65424013434089, "grad_norm": 1.2622342109680176, "learning_rate": 3.468374884579871e-05, "loss": 0.2013, "step": 1461 }, { "epoch": 0.6546879373075847, "grad_norm": 0.8820511698722839, "learning_rate": 3.4672206832871654e-05, "loss": 0.1695, "step": 1462 }, { "epoch": 0.6551357402742793, "grad_norm": 1.4605138301849365, "learning_rate": 3.4660664819944596e-05, "loss": 0.2984, "step": 1463 }, { "epoch": 0.655583543240974, "grad_norm": 0.8579814434051514, "learning_rate": 3.4649122807017546e-05, "loss": 0.1279, "step": 1464 }, { "epoch": 0.6560313462076687, "grad_norm": 1.398829460144043, "learning_rate": 3.463758079409049e-05, "loss": 0.3021, "step": 1465 }, { "epoch": 0.6564791491743632, "grad_norm": 1.234954833984375, "learning_rate": 3.462603878116344e-05, "loss": 0.1898, "step": 1466 }, { "epoch": 0.6569269521410579, "grad_norm": 1.1851727962493896, "learning_rate": 3.461449676823638e-05, "loss": 0.2184, "step": 1467 }, { "epoch": 0.6573747551077526, "grad_norm": 1.3027613162994385, "learning_rate": 3.460295475530933e-05, "loss": 0.218, "step": 1468 }, { "epoch": 0.6578225580744472, "grad_norm": 1.4629859924316406, "learning_rate": 3.459141274238227e-05, "loss": 0.3775, "step": 1469 }, { "epoch": 0.6582703610411419, "grad_norm": 1.2793594598770142, "learning_rate": 3.4579870729455215e-05, "loss": 0.4109, "step": 1470 }, { "epoch": 0.6582703610411419, "eval_loss": 0.2607268989086151, "eval_runtime": 1741.3184, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 1470 }, { "epoch": 0.6587181640078366, "grad_norm": 1.1653704643249512, "learning_rate": 3.4568328716528164e-05, "loss": 0.1692, "step": 1471 }, { "epoch": 0.6591659669745312, "grad_norm": 1.1862878799438477, "learning_rate": 3.455678670360111e-05, "loss": 0.2082, "step": 1472 }, { "epoch": 0.6596137699412259, "grad_norm": 1.1138640642166138, "learning_rate": 3.4545244690674056e-05, "loss": 0.1064, "step": 1473 }, { "epoch": 0.6600615729079206, "grad_norm": 1.390017032623291, "learning_rate": 3.4533702677747e-05, "loss": 0.3819, "step": 1474 }, { "epoch": 0.6605093758746152, "grad_norm": 1.2229011058807373, "learning_rate": 3.452216066481995e-05, "loss": 0.2179, "step": 1475 }, { "epoch": 0.6609571788413098, "grad_norm": 1.1169127225875854, "learning_rate": 3.451061865189289e-05, "loss": 0.212, "step": 1476 }, { "epoch": 0.6614049818080044, "grad_norm": 1.3647346496582031, "learning_rate": 3.449907663896584e-05, "loss": 0.2052, "step": 1477 }, { "epoch": 0.6618527847746991, "grad_norm": 1.423122525215149, "learning_rate": 3.448753462603878e-05, "loss": 0.1691, "step": 1478 }, { "epoch": 0.6623005877413938, "grad_norm": 1.2200878858566284, "learning_rate": 3.447599261311173e-05, "loss": 0.3275, "step": 1479 }, { "epoch": 0.6627483907080884, "grad_norm": 1.3014010190963745, "learning_rate": 3.4464450600184675e-05, "loss": 0.3449, "step": 1480 }, { "epoch": 0.6627483907080884, "eval_loss": 0.25847139954566956, "eval_runtime": 1741.5279, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 1480 }, { "epoch": 0.6631961936747831, "grad_norm": 1.3815656900405884, "learning_rate": 3.445290858725762e-05, "loss": 0.1357, "step": 1481 }, { "epoch": 0.6636439966414778, "grad_norm": 1.3851501941680908, "learning_rate": 3.444136657433057e-05, "loss": 0.388, "step": 1482 }, { "epoch": 0.6640917996081724, "grad_norm": 0.9760904908180237, "learning_rate": 3.442982456140351e-05, "loss": 0.2618, "step": 1483 }, { "epoch": 0.6645396025748671, "grad_norm": 1.1593406200408936, "learning_rate": 3.441828254847645e-05, "loss": 0.1924, "step": 1484 }, { "epoch": 0.6649874055415617, "grad_norm": 1.4297518730163574, "learning_rate": 3.44067405355494e-05, "loss": 0.2887, "step": 1485 }, { "epoch": 0.6654352085082563, "grad_norm": 1.0096676349639893, "learning_rate": 3.4395198522622344e-05, "loss": 0.1669, "step": 1486 }, { "epoch": 0.665883011474951, "grad_norm": 1.343379020690918, "learning_rate": 3.4383656509695293e-05, "loss": 0.3973, "step": 1487 }, { "epoch": 0.6663308144416457, "grad_norm": 1.2132956981658936, "learning_rate": 3.4372114496768236e-05, "loss": 0.2312, "step": 1488 }, { "epoch": 0.6667786174083403, "grad_norm": 1.2106480598449707, "learning_rate": 3.4360572483841185e-05, "loss": 0.4319, "step": 1489 }, { "epoch": 0.667226420375035, "grad_norm": 1.4173866510391235, "learning_rate": 3.434903047091413e-05, "loss": 0.3643, "step": 1490 }, { "epoch": 0.667226420375035, "eval_loss": 0.25605136156082153, "eval_runtime": 1742.0461, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 1490 }, { "epoch": 0.6676742233417297, "grad_norm": 1.229823112487793, "learning_rate": 3.433748845798708e-05, "loss": 0.2643, "step": 1491 }, { "epoch": 0.6681220263084243, "grad_norm": 1.2856589555740356, "learning_rate": 3.432594644506002e-05, "loss": 0.2516, "step": 1492 }, { "epoch": 0.668569829275119, "grad_norm": 1.4023637771606445, "learning_rate": 3.431440443213297e-05, "loss": 0.2539, "step": 1493 }, { "epoch": 0.6690176322418137, "grad_norm": 1.1964390277862549, "learning_rate": 3.430286241920591e-05, "loss": 0.4095, "step": 1494 }, { "epoch": 0.6694654352085082, "grad_norm": 1.2144992351531982, "learning_rate": 3.4291320406278855e-05, "loss": 0.3216, "step": 1495 }, { "epoch": 0.6699132381752029, "grad_norm": 1.6141177415847778, "learning_rate": 3.4279778393351804e-05, "loss": 0.3221, "step": 1496 }, { "epoch": 0.6703610411418975, "grad_norm": 1.077372431755066, "learning_rate": 3.4268236380424747e-05, "loss": 0.3164, "step": 1497 }, { "epoch": 0.6708088441085922, "grad_norm": 1.9878309965133667, "learning_rate": 3.4256694367497696e-05, "loss": 0.2299, "step": 1498 }, { "epoch": 0.6712566470752869, "grad_norm": 1.0988008975982666, "learning_rate": 3.424515235457064e-05, "loss": 0.1888, "step": 1499 }, { "epoch": 0.6717044500419815, "grad_norm": 1.2697983980178833, "learning_rate": 3.423361034164359e-05, "loss": 0.2446, "step": 1500 }, { "epoch": 0.6717044500419815, "eval_loss": 0.25102800130844116, "eval_runtime": 1741.064, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 1500 }, { "epoch": 0.6721522530086762, "grad_norm": 1.1101131439208984, "learning_rate": 3.422206832871653e-05, "loss": 0.1175, "step": 1501 }, { "epoch": 0.6726000559753709, "grad_norm": 1.8745020627975464, "learning_rate": 3.421052631578947e-05, "loss": 0.5006, "step": 1502 }, { "epoch": 0.6730478589420655, "grad_norm": 1.5899895429611206, "learning_rate": 3.4198984302862416e-05, "loss": 0.4807, "step": 1503 }, { "epoch": 0.6734956619087601, "grad_norm": 1.6755222082138062, "learning_rate": 3.4187442289935365e-05, "loss": 0.4125, "step": 1504 }, { "epoch": 0.6739434648754548, "grad_norm": 1.5390435457229614, "learning_rate": 3.417590027700831e-05, "loss": 0.524, "step": 1505 }, { "epoch": 0.6743912678421494, "grad_norm": 1.2704511880874634, "learning_rate": 3.416435826408126e-05, "loss": 0.2664, "step": 1506 }, { "epoch": 0.6748390708088441, "grad_norm": 0.9589602947235107, "learning_rate": 3.41528162511542e-05, "loss": 0.2286, "step": 1507 }, { "epoch": 0.6752868737755388, "grad_norm": 1.1151490211486816, "learning_rate": 3.414127423822715e-05, "loss": 0.3238, "step": 1508 }, { "epoch": 0.6757346767422334, "grad_norm": 1.0681778192520142, "learning_rate": 3.412973222530009e-05, "loss": 0.2079, "step": 1509 }, { "epoch": 0.6761824797089281, "grad_norm": 1.1055569648742676, "learning_rate": 3.411819021237304e-05, "loss": 0.2807, "step": 1510 }, { "epoch": 0.6761824797089281, "eval_loss": 0.24902376532554626, "eval_runtime": 1742.0489, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 1510 }, { "epoch": 0.6766302826756228, "grad_norm": 1.061822533607483, "learning_rate": 3.4106648199445984e-05, "loss": 0.2028, "step": 1511 }, { "epoch": 0.6770780856423174, "grad_norm": 1.3082396984100342, "learning_rate": 3.409510618651893e-05, "loss": 0.4129, "step": 1512 }, { "epoch": 0.6775258886090121, "grad_norm": 1.0236505270004272, "learning_rate": 3.4083564173591876e-05, "loss": 0.1439, "step": 1513 }, { "epoch": 0.6779736915757066, "grad_norm": 1.150169849395752, "learning_rate": 3.4072022160664825e-05, "loss": 0.1417, "step": 1514 }, { "epoch": 0.6784214945424013, "grad_norm": 0.9641677141189575, "learning_rate": 3.406048014773777e-05, "loss": 0.1417, "step": 1515 }, { "epoch": 0.678869297509096, "grad_norm": 1.1167751550674438, "learning_rate": 3.404893813481072e-05, "loss": 0.1216, "step": 1516 }, { "epoch": 0.6793171004757906, "grad_norm": 1.2652047872543335, "learning_rate": 3.403739612188366e-05, "loss": 0.253, "step": 1517 }, { "epoch": 0.6797649034424853, "grad_norm": 1.4064199924468994, "learning_rate": 3.402585410895661e-05, "loss": 0.3362, "step": 1518 }, { "epoch": 0.68021270640918, "grad_norm": 1.244307041168213, "learning_rate": 3.401431209602955e-05, "loss": 0.2374, "step": 1519 }, { "epoch": 0.6806605093758746, "grad_norm": 1.3351123332977295, "learning_rate": 3.4002770083102494e-05, "loss": 0.3342, "step": 1520 }, { "epoch": 0.6806605093758746, "eval_loss": 0.24461551010608673, "eval_runtime": 1740.9353, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 1520 }, { "epoch": 0.6811083123425693, "grad_norm": 1.5558222532272339, "learning_rate": 3.3991228070175444e-05, "loss": 0.1321, "step": 1521 }, { "epoch": 0.681556115309264, "grad_norm": 1.5526158809661865, "learning_rate": 3.3979686057248386e-05, "loss": 0.3917, "step": 1522 }, { "epoch": 0.6820039182759586, "grad_norm": 1.3235844373703003, "learning_rate": 3.396814404432133e-05, "loss": 0.2082, "step": 1523 }, { "epoch": 0.6824517212426532, "grad_norm": 1.226735234260559, "learning_rate": 3.395660203139427e-05, "loss": 0.1909, "step": 1524 }, { "epoch": 0.6828995242093479, "grad_norm": 1.3214516639709473, "learning_rate": 3.394506001846722e-05, "loss": 0.2406, "step": 1525 }, { "epoch": 0.6833473271760425, "grad_norm": 1.1992199420928955, "learning_rate": 3.3933518005540164e-05, "loss": 0.216, "step": 1526 }, { "epoch": 0.6837951301427372, "grad_norm": 1.197937250137329, "learning_rate": 3.392197599261311e-05, "loss": 0.193, "step": 1527 }, { "epoch": 0.6842429331094319, "grad_norm": 1.0902233123779297, "learning_rate": 3.3910433979686056e-05, "loss": 0.1702, "step": 1528 }, { "epoch": 0.6846907360761265, "grad_norm": 1.1615121364593506, "learning_rate": 3.3898891966759005e-05, "loss": 0.2149, "step": 1529 }, { "epoch": 0.6851385390428212, "grad_norm": 1.0904998779296875, "learning_rate": 3.388734995383195e-05, "loss": 0.1792, "step": 1530 }, { "epoch": 0.6851385390428212, "eval_loss": 0.24200741946697235, "eval_runtime": 1740.0794, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 1530 }, { "epoch": 0.6855863420095158, "grad_norm": 1.5489822626113892, "learning_rate": 3.38758079409049e-05, "loss": 0.3297, "step": 1531 }, { "epoch": 0.6860341449762105, "grad_norm": 1.0661543607711792, "learning_rate": 3.386426592797784e-05, "loss": 0.1182, "step": 1532 }, { "epoch": 0.6864819479429051, "grad_norm": 1.2064769268035889, "learning_rate": 3.385272391505079e-05, "loss": 0.153, "step": 1533 }, { "epoch": 0.6869297509095997, "grad_norm": 1.2851231098175049, "learning_rate": 3.384118190212373e-05, "loss": 0.4473, "step": 1534 }, { "epoch": 0.6873775538762944, "grad_norm": 1.2064697742462158, "learning_rate": 3.382963988919668e-05, "loss": 0.2223, "step": 1535 }, { "epoch": 0.6878253568429891, "grad_norm": 1.3766595125198364, "learning_rate": 3.3818097876269623e-05, "loss": 0.3017, "step": 1536 }, { "epoch": 0.6882731598096837, "grad_norm": 1.2292721271514893, "learning_rate": 3.380655586334257e-05, "loss": 0.146, "step": 1537 }, { "epoch": 0.6887209627763784, "grad_norm": 1.3682496547698975, "learning_rate": 3.3795013850415515e-05, "loss": 0.2525, "step": 1538 }, { "epoch": 0.6891687657430731, "grad_norm": 1.5135877132415771, "learning_rate": 3.3783471837488465e-05, "loss": 0.2383, "step": 1539 }, { "epoch": 0.6896165687097677, "grad_norm": 1.3297990560531616, "learning_rate": 3.377192982456141e-05, "loss": 0.2684, "step": 1540 }, { "epoch": 0.6896165687097677, "eval_loss": 0.23915019631385803, "eval_runtime": 1739.054, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 1540 }, { "epoch": 0.6900643716764624, "grad_norm": 1.10771644115448, "learning_rate": 3.376038781163435e-05, "loss": 0.1979, "step": 1541 }, { "epoch": 0.6905121746431571, "grad_norm": 0.916994571685791, "learning_rate": 3.374884579870729e-05, "loss": 0.0774, "step": 1542 }, { "epoch": 0.6909599776098516, "grad_norm": 1.2376232147216797, "learning_rate": 3.373730378578024e-05, "loss": 0.1507, "step": 1543 }, { "epoch": 0.6914077805765463, "grad_norm": 1.1735001802444458, "learning_rate": 3.3725761772853185e-05, "loss": 0.3846, "step": 1544 }, { "epoch": 0.691855583543241, "grad_norm": 1.585693359375, "learning_rate": 3.371421975992613e-05, "loss": 0.4625, "step": 1545 }, { "epoch": 0.6923033865099356, "grad_norm": 1.234339952468872, "learning_rate": 3.370267774699908e-05, "loss": 0.1853, "step": 1546 }, { "epoch": 0.6927511894766303, "grad_norm": 0.9094129204750061, "learning_rate": 3.369113573407202e-05, "loss": 0.1299, "step": 1547 }, { "epoch": 0.693198992443325, "grad_norm": 1.1160544157028198, "learning_rate": 3.367959372114497e-05, "loss": 0.1527, "step": 1548 }, { "epoch": 0.6936467954100196, "grad_norm": 1.3147051334381104, "learning_rate": 3.366805170821791e-05, "loss": 0.2995, "step": 1549 }, { "epoch": 0.6940945983767143, "grad_norm": 1.2512911558151245, "learning_rate": 3.365650969529086e-05, "loss": 0.2386, "step": 1550 }, { "epoch": 0.6940945983767143, "eval_loss": 0.23694820702075958, "eval_runtime": 1738.7269, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 1550 }, { "epoch": 0.6945424013434089, "grad_norm": 1.0559110641479492, "learning_rate": 3.36449676823638e-05, "loss": 0.2887, "step": 1551 }, { "epoch": 0.6949902043101036, "grad_norm": 1.3872283697128296, "learning_rate": 3.363342566943675e-05, "loss": 0.1972, "step": 1552 }, { "epoch": 0.6954380072767982, "grad_norm": 1.2875914573669434, "learning_rate": 3.3621883656509695e-05, "loss": 0.3023, "step": 1553 }, { "epoch": 0.6958858102434928, "grad_norm": 1.1694672107696533, "learning_rate": 3.3610341643582645e-05, "loss": 0.2716, "step": 1554 }, { "epoch": 0.6963336132101875, "grad_norm": 1.1989940404891968, "learning_rate": 3.359879963065559e-05, "loss": 0.2673, "step": 1555 }, { "epoch": 0.6967814161768822, "grad_norm": 1.1650522947311401, "learning_rate": 3.3587257617728537e-05, "loss": 0.2633, "step": 1556 }, { "epoch": 0.6972292191435768, "grad_norm": 1.125189185142517, "learning_rate": 3.357571560480148e-05, "loss": 0.1559, "step": 1557 }, { "epoch": 0.6976770221102715, "grad_norm": 0.9679086804389954, "learning_rate": 3.356417359187443e-05, "loss": 0.1219, "step": 1558 }, { "epoch": 0.6981248250769662, "grad_norm": 1.3153406381607056, "learning_rate": 3.355263157894737e-05, "loss": 0.2164, "step": 1559 }, { "epoch": 0.6985726280436608, "grad_norm": 1.0555087327957153, "learning_rate": 3.354108956602032e-05, "loss": 0.1167, "step": 1560 }, { "epoch": 0.6985726280436608, "eval_loss": 0.23435254395008087, "eval_runtime": 1735.1927, "eval_samples_per_second": 2.574, "eval_steps_per_second": 2.574, "step": 1560 }, { "epoch": 0.6990204310103555, "grad_norm": 1.350873589515686, "learning_rate": 3.352954755309326e-05, "loss": 0.2516, "step": 1561 }, { "epoch": 0.69946823397705, "grad_norm": 1.7134159803390503, "learning_rate": 3.3518005540166206e-05, "loss": 0.2061, "step": 1562 }, { "epoch": 0.6999160369437447, "grad_norm": 1.3558813333511353, "learning_rate": 3.350646352723915e-05, "loss": 0.2233, "step": 1563 }, { "epoch": 0.7003638399104394, "grad_norm": 1.2703967094421387, "learning_rate": 3.34949215143121e-05, "loss": 0.2354, "step": 1564 }, { "epoch": 0.700811642877134, "grad_norm": 1.2839316129684448, "learning_rate": 3.348337950138504e-05, "loss": 0.2084, "step": 1565 }, { "epoch": 0.7012594458438287, "grad_norm": 1.9036198854446411, "learning_rate": 3.347183748845799e-05, "loss": 0.2555, "step": 1566 }, { "epoch": 0.7017072488105234, "grad_norm": 1.2471376657485962, "learning_rate": 3.346029547553093e-05, "loss": 0.2167, "step": 1567 }, { "epoch": 0.702155051777218, "grad_norm": 1.5329457521438599, "learning_rate": 3.344875346260388e-05, "loss": 0.3522, "step": 1568 }, { "epoch": 0.7026028547439127, "grad_norm": 1.1027286052703857, "learning_rate": 3.3437211449676824e-05, "loss": 0.3117, "step": 1569 }, { "epoch": 0.7030506577106074, "grad_norm": 1.2840759754180908, "learning_rate": 3.342566943674977e-05, "loss": 0.2366, "step": 1570 }, { "epoch": 0.7030506577106074, "eval_loss": 0.23120713233947754, "eval_runtime": 1731.9849, "eval_samples_per_second": 2.579, "eval_steps_per_second": 2.579, "step": 1570 }, { "epoch": 0.703498460677302, "grad_norm": 1.1187115907669067, "learning_rate": 3.3414127423822716e-05, "loss": 0.3006, "step": 1571 }, { "epoch": 0.7039462636439966, "grad_norm": 1.0415239334106445, "learning_rate": 3.340258541089566e-05, "loss": 0.224, "step": 1572 }, { "epoch": 0.7043940666106913, "grad_norm": 1.0761853456497192, "learning_rate": 3.339104339796861e-05, "loss": 0.3569, "step": 1573 }, { "epoch": 0.7048418695773859, "grad_norm": 1.53033447265625, "learning_rate": 3.337950138504155e-05, "loss": 0.3501, "step": 1574 }, { "epoch": 0.7052896725440806, "grad_norm": 1.0321897268295288, "learning_rate": 3.33679593721145e-05, "loss": 0.194, "step": 1575 }, { "epoch": 0.7057374755107753, "grad_norm": 1.5243761539459229, "learning_rate": 3.335641735918744e-05, "loss": 0.2557, "step": 1576 }, { "epoch": 0.7061852784774699, "grad_norm": 1.264231562614441, "learning_rate": 3.334487534626039e-05, "loss": 0.179, "step": 1577 }, { "epoch": 0.7066330814441646, "grad_norm": 1.316745400428772, "learning_rate": 3.3333333333333335e-05, "loss": 0.434, "step": 1578 }, { "epoch": 0.7070808844108593, "grad_norm": 1.1383960247039795, "learning_rate": 3.3321791320406284e-05, "loss": 0.1775, "step": 1579 }, { "epoch": 0.7075286873775539, "grad_norm": 1.0016801357269287, "learning_rate": 3.331024930747923e-05, "loss": 0.1749, "step": 1580 }, { "epoch": 0.7075286873775539, "eval_loss": 0.22776223719120026, "eval_runtime": 1740.0928, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 1580 }, { "epoch": 0.7079764903442485, "grad_norm": 1.7598121166229248, "learning_rate": 3.329870729455217e-05, "loss": 0.2445, "step": 1581 }, { "epoch": 0.7084242933109431, "grad_norm": 1.9481260776519775, "learning_rate": 3.328716528162512e-05, "loss": 0.2726, "step": 1582 }, { "epoch": 0.7088720962776378, "grad_norm": 1.1749162673950195, "learning_rate": 3.327562326869806e-05, "loss": 0.103, "step": 1583 }, { "epoch": 0.7093198992443325, "grad_norm": 1.4559996128082275, "learning_rate": 3.3264081255771004e-05, "loss": 0.2297, "step": 1584 }, { "epoch": 0.7097677022110271, "grad_norm": 1.4041540622711182, "learning_rate": 3.3252539242843954e-05, "loss": 0.2676, "step": 1585 }, { "epoch": 0.7102155051777218, "grad_norm": 1.2508280277252197, "learning_rate": 3.3240997229916896e-05, "loss": 0.254, "step": 1586 }, { "epoch": 0.7106633081444165, "grad_norm": 0.9396544694900513, "learning_rate": 3.3229455216989845e-05, "loss": 0.1049, "step": 1587 }, { "epoch": 0.7111111111111111, "grad_norm": 1.3375426530838013, "learning_rate": 3.321791320406279e-05, "loss": 0.2525, "step": 1588 }, { "epoch": 0.7115589140778058, "grad_norm": 1.631959080696106, "learning_rate": 3.320637119113574e-05, "loss": 0.2584, "step": 1589 }, { "epoch": 0.7120067170445005, "grad_norm": 1.31275475025177, "learning_rate": 3.319482917820868e-05, "loss": 0.1589, "step": 1590 }, { "epoch": 0.7120067170445005, "eval_loss": 0.2232540100812912, "eval_runtime": 1739.5515, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 1590 }, { "epoch": 0.712454520011195, "grad_norm": 1.3619765043258667, "learning_rate": 3.318328716528163e-05, "loss": 0.1927, "step": 1591 }, { "epoch": 0.7129023229778897, "grad_norm": 1.2817407846450806, "learning_rate": 3.317174515235457e-05, "loss": 0.2036, "step": 1592 }, { "epoch": 0.7133501259445844, "grad_norm": 1.3498672246932983, "learning_rate": 3.316020313942752e-05, "loss": 0.1583, "step": 1593 }, { "epoch": 0.713797928911279, "grad_norm": 1.0232353210449219, "learning_rate": 3.3148661126500464e-05, "loss": 0.1753, "step": 1594 }, { "epoch": 0.7142457318779737, "grad_norm": 1.239030361175537, "learning_rate": 3.313711911357341e-05, "loss": 0.2525, "step": 1595 }, { "epoch": 0.7146935348446684, "grad_norm": 0.9768178462982178, "learning_rate": 3.3125577100646356e-05, "loss": 0.1509, "step": 1596 }, { "epoch": 0.715141337811363, "grad_norm": 1.2771555185317993, "learning_rate": 3.31140350877193e-05, "loss": 0.22, "step": 1597 }, { "epoch": 0.7155891407780577, "grad_norm": 0.8508195877075195, "learning_rate": 3.310249307479225e-05, "loss": 0.1854, "step": 1598 }, { "epoch": 0.7160369437447524, "grad_norm": 1.0700000524520874, "learning_rate": 3.309095106186519e-05, "loss": 0.2283, "step": 1599 }, { "epoch": 0.716484746711447, "grad_norm": 1.2275710105895996, "learning_rate": 3.307940904893814e-05, "loss": 0.1604, "step": 1600 }, { "epoch": 0.716484746711447, "eval_loss": 0.22277671098709106, "eval_runtime": 1739.7877, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 1600 }, { "epoch": 0.7169325496781416, "grad_norm": 0.8512494564056396, "learning_rate": 3.306786703601108e-05, "loss": 0.0913, "step": 1601 }, { "epoch": 0.7173803526448362, "grad_norm": 1.4188917875289917, "learning_rate": 3.3056325023084025e-05, "loss": 0.3004, "step": 1602 }, { "epoch": 0.7178281556115309, "grad_norm": 1.1424413919448853, "learning_rate": 3.304478301015697e-05, "loss": 0.2261, "step": 1603 }, { "epoch": 0.7182759585782256, "grad_norm": 1.630253553390503, "learning_rate": 3.303324099722992e-05, "loss": 0.3728, "step": 1604 }, { "epoch": 0.7187237615449202, "grad_norm": 1.3553986549377441, "learning_rate": 3.302169898430286e-05, "loss": 0.2097, "step": 1605 }, { "epoch": 0.7191715645116149, "grad_norm": 1.4127997159957886, "learning_rate": 3.301015697137581e-05, "loss": 0.2059, "step": 1606 }, { "epoch": 0.7196193674783096, "grad_norm": 1.2591644525527954, "learning_rate": 3.299861495844875e-05, "loss": 0.2187, "step": 1607 }, { "epoch": 0.7200671704450042, "grad_norm": 1.056017279624939, "learning_rate": 3.29870729455217e-05, "loss": 0.1817, "step": 1608 }, { "epoch": 0.7205149734116989, "grad_norm": 1.230894923210144, "learning_rate": 3.2975530932594644e-05, "loss": 0.3076, "step": 1609 }, { "epoch": 0.7209627763783935, "grad_norm": 1.300376534461975, "learning_rate": 3.296398891966759e-05, "loss": 0.2628, "step": 1610 }, { "epoch": 0.7209627763783935, "eval_loss": 0.21946477890014648, "eval_runtime": 1739.4143, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 1610 }, { "epoch": 0.7214105793450881, "grad_norm": 1.413095235824585, "learning_rate": 3.2952446906740536e-05, "loss": 0.2967, "step": 1611 }, { "epoch": 0.7218583823117828, "grad_norm": 1.1679149866104126, "learning_rate": 3.2940904893813485e-05, "loss": 0.3076, "step": 1612 }, { "epoch": 0.7223061852784775, "grad_norm": 1.018036961555481, "learning_rate": 3.292936288088643e-05, "loss": 0.1127, "step": 1613 }, { "epoch": 0.7227539882451721, "grad_norm": 0.766292929649353, "learning_rate": 3.291782086795938e-05, "loss": 0.0642, "step": 1614 }, { "epoch": 0.7232017912118668, "grad_norm": 1.2774684429168701, "learning_rate": 3.290627885503232e-05, "loss": 0.1783, "step": 1615 }, { "epoch": 0.7236495941785615, "grad_norm": 1.4727896451950073, "learning_rate": 3.289473684210527e-05, "loss": 0.2811, "step": 1616 }, { "epoch": 0.7240973971452561, "grad_norm": 1.0615532398223877, "learning_rate": 3.288319482917821e-05, "loss": 0.2028, "step": 1617 }, { "epoch": 0.7245452001119508, "grad_norm": 1.279786229133606, "learning_rate": 3.287165281625116e-05, "loss": 0.2692, "step": 1618 }, { "epoch": 0.7249930030786454, "grad_norm": 1.2355388402938843, "learning_rate": 3.2860110803324104e-05, "loss": 0.2413, "step": 1619 }, { "epoch": 0.72544080604534, "grad_norm": 1.3490946292877197, "learning_rate": 3.2848568790397046e-05, "loss": 0.2309, "step": 1620 }, { "epoch": 0.72544080604534, "eval_loss": 0.21591566503047943, "eval_runtime": 1739.1002, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 1620 }, { "epoch": 0.7258886090120347, "grad_norm": 1.1078510284423828, "learning_rate": 3.2837026777469996e-05, "loss": 0.1262, "step": 1621 }, { "epoch": 0.7263364119787293, "grad_norm": 1.130131483078003, "learning_rate": 3.282548476454294e-05, "loss": 0.146, "step": 1622 }, { "epoch": 0.726784214945424, "grad_norm": 1.545876383781433, "learning_rate": 3.281394275161588e-05, "loss": 0.3484, "step": 1623 }, { "epoch": 0.7272320179121187, "grad_norm": 1.292954444885254, "learning_rate": 3.2802400738688824e-05, "loss": 0.2202, "step": 1624 }, { "epoch": 0.7276798208788133, "grad_norm": 1.3415828943252563, "learning_rate": 3.279085872576177e-05, "loss": 0.2863, "step": 1625 }, { "epoch": 0.728127623845508, "grad_norm": 1.084219217300415, "learning_rate": 3.2779316712834716e-05, "loss": 0.2006, "step": 1626 }, { "epoch": 0.7285754268122027, "grad_norm": 1.0170210599899292, "learning_rate": 3.2767774699907665e-05, "loss": 0.1657, "step": 1627 }, { "epoch": 0.7290232297788973, "grad_norm": 1.1863627433776855, "learning_rate": 3.275623268698061e-05, "loss": 0.1701, "step": 1628 }, { "epoch": 0.7294710327455919, "grad_norm": 1.0183345079421997, "learning_rate": 3.274469067405356e-05, "loss": 0.13, "step": 1629 }, { "epoch": 0.7299188357122866, "grad_norm": 1.2636232376098633, "learning_rate": 3.27331486611265e-05, "loss": 0.3096, "step": 1630 }, { "epoch": 0.7299188357122866, "eval_loss": 0.21282623708248138, "eval_runtime": 1738.9211, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 1630 }, { "epoch": 0.7303666386789812, "grad_norm": 1.625278115272522, "learning_rate": 3.272160664819945e-05, "loss": 0.1581, "step": 1631 }, { "epoch": 0.7308144416456759, "grad_norm": 1.0692945718765259, "learning_rate": 3.271006463527239e-05, "loss": 0.1466, "step": 1632 }, { "epoch": 0.7312622446123705, "grad_norm": 1.545180320739746, "learning_rate": 3.269852262234534e-05, "loss": 0.2468, "step": 1633 }, { "epoch": 0.7317100475790652, "grad_norm": 1.2902580499649048, "learning_rate": 3.2686980609418284e-05, "loss": 0.1939, "step": 1634 }, { "epoch": 0.7321578505457599, "grad_norm": 1.4147045612335205, "learning_rate": 3.267543859649123e-05, "loss": 0.3051, "step": 1635 }, { "epoch": 0.7326056535124545, "grad_norm": 1.821250557899475, "learning_rate": 3.2663896583564176e-05, "loss": 0.5277, "step": 1636 }, { "epoch": 0.7330534564791492, "grad_norm": 1.1706265211105347, "learning_rate": 3.2652354570637125e-05, "loss": 0.2537, "step": 1637 }, { "epoch": 0.7335012594458439, "grad_norm": 1.3979747295379639, "learning_rate": 3.264081255771007e-05, "loss": 0.1647, "step": 1638 }, { "epoch": 0.7339490624125384, "grad_norm": 1.1551800966262817, "learning_rate": 3.262927054478302e-05, "loss": 0.2547, "step": 1639 }, { "epoch": 0.7343968653792331, "grad_norm": 0.986116349697113, "learning_rate": 3.261772853185596e-05, "loss": 0.1303, "step": 1640 }, { "epoch": 0.7343968653792331, "eval_loss": 0.21062257885932922, "eval_runtime": 1739.0011, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 1640 }, { "epoch": 0.7348446683459278, "grad_norm": 0.9228246808052063, "learning_rate": 3.26061865189289e-05, "loss": 0.1241, "step": 1641 }, { "epoch": 0.7352924713126224, "grad_norm": 1.1444522142410278, "learning_rate": 3.2594644506001845e-05, "loss": 0.2111, "step": 1642 }, { "epoch": 0.7357402742793171, "grad_norm": 1.3221601247787476, "learning_rate": 3.2583102493074794e-05, "loss": 0.5413, "step": 1643 }, { "epoch": 0.7361880772460118, "grad_norm": 1.3881961107254028, "learning_rate": 3.257156048014774e-05, "loss": 0.2686, "step": 1644 }, { "epoch": 0.7366358802127064, "grad_norm": 1.500131368637085, "learning_rate": 3.2560018467220686e-05, "loss": 0.3149, "step": 1645 }, { "epoch": 0.7370836831794011, "grad_norm": 1.0553194284439087, "learning_rate": 3.254847645429363e-05, "loss": 0.1628, "step": 1646 }, { "epoch": 0.7375314861460958, "grad_norm": 0.9254956841468811, "learning_rate": 3.253693444136657e-05, "loss": 0.1078, "step": 1647 }, { "epoch": 0.7379792891127904, "grad_norm": 1.1450732946395874, "learning_rate": 3.252539242843952e-05, "loss": 0.2339, "step": 1648 }, { "epoch": 0.738427092079485, "grad_norm": 0.9290254712104797, "learning_rate": 3.251385041551246e-05, "loss": 0.1774, "step": 1649 }, { "epoch": 0.7388748950461796, "grad_norm": 0.8868831396102905, "learning_rate": 3.250230840258541e-05, "loss": 0.1535, "step": 1650 }, { "epoch": 0.7388748950461796, "eval_loss": 0.2098311334848404, "eval_runtime": 1738.0385, "eval_samples_per_second": 2.57, "eval_steps_per_second": 2.57, "step": 1650 }, { "epoch": 0.7393226980128743, "grad_norm": 1.1324498653411865, "learning_rate": 3.2490766389658355e-05, "loss": 0.1097, "step": 1651 }, { "epoch": 0.739770500979569, "grad_norm": 1.3903108835220337, "learning_rate": 3.2479224376731305e-05, "loss": 0.24, "step": 1652 }, { "epoch": 0.7402183039462636, "grad_norm": 1.320954442024231, "learning_rate": 3.246768236380425e-05, "loss": 0.2528, "step": 1653 }, { "epoch": 0.7406661069129583, "grad_norm": 1.5540986061096191, "learning_rate": 3.24561403508772e-05, "loss": 0.2835, "step": 1654 }, { "epoch": 0.741113909879653, "grad_norm": 1.2737802267074585, "learning_rate": 3.244459833795014e-05, "loss": 0.2013, "step": 1655 }, { "epoch": 0.7415617128463476, "grad_norm": 1.185079574584961, "learning_rate": 3.243305632502309e-05, "loss": 0.1936, "step": 1656 }, { "epoch": 0.7420095158130423, "grad_norm": 1.8175549507141113, "learning_rate": 3.242151431209603e-05, "loss": 0.3241, "step": 1657 }, { "epoch": 0.7424573187797369, "grad_norm": 0.9585305452346802, "learning_rate": 3.240997229916898e-05, "loss": 0.1021, "step": 1658 }, { "epoch": 0.7429051217464315, "grad_norm": 1.0971755981445312, "learning_rate": 3.239843028624192e-05, "loss": 0.1455, "step": 1659 }, { "epoch": 0.7433529247131262, "grad_norm": 1.2596510648727417, "learning_rate": 3.238688827331487e-05, "loss": 0.1561, "step": 1660 }, { "epoch": 0.7433529247131262, "eval_loss": 0.2056712806224823, "eval_runtime": 1736.8376, "eval_samples_per_second": 2.572, "eval_steps_per_second": 2.572, "step": 1660 }, { "epoch": 0.7438007276798209, "grad_norm": 1.399214506149292, "learning_rate": 3.2375346260387815e-05, "loss": 0.1979, "step": 1661 }, { "epoch": 0.7442485306465155, "grad_norm": 1.5399458408355713, "learning_rate": 3.236380424746076e-05, "loss": 0.2839, "step": 1662 }, { "epoch": 0.7446963336132102, "grad_norm": 1.3398548364639282, "learning_rate": 3.23522622345337e-05, "loss": 0.2559, "step": 1663 }, { "epoch": 0.7451441365799049, "grad_norm": 1.3190560340881348, "learning_rate": 3.234072022160665e-05, "loss": 0.123, "step": 1664 }, { "epoch": 0.7455919395465995, "grad_norm": 0.886492908000946, "learning_rate": 3.232917820867959e-05, "loss": 0.1186, "step": 1665 }, { "epoch": 0.7460397425132942, "grad_norm": 1.5858815908432007, "learning_rate": 3.231763619575254e-05, "loss": 0.2775, "step": 1666 }, { "epoch": 0.7464875454799889, "grad_norm": 1.419754147529602, "learning_rate": 3.2306094182825484e-05, "loss": 0.2343, "step": 1667 }, { "epoch": 0.7469353484466834, "grad_norm": 1.1208510398864746, "learning_rate": 3.2294552169898434e-05, "loss": 0.1272, "step": 1668 }, { "epoch": 0.7473831514133781, "grad_norm": 0.9657753705978394, "learning_rate": 3.2283010156971376e-05, "loss": 0.1537, "step": 1669 }, { "epoch": 0.7478309543800727, "grad_norm": 1.117485761642456, "learning_rate": 3.2271468144044326e-05, "loss": 0.2412, "step": 1670 }, { "epoch": 0.7478309543800727, "eval_loss": 0.2037663459777832, "eval_runtime": 1736.9404, "eval_samples_per_second": 2.572, "eval_steps_per_second": 2.572, "step": 1670 }, { "epoch": 0.7482787573467674, "grad_norm": 1.138968825340271, "learning_rate": 3.225992613111727e-05, "loss": 0.221, "step": 1671 }, { "epoch": 0.7487265603134621, "grad_norm": 1.2097197771072388, "learning_rate": 3.224838411819021e-05, "loss": 0.1445, "step": 1672 }, { "epoch": 0.7491743632801567, "grad_norm": 1.4289298057556152, "learning_rate": 3.223684210526316e-05, "loss": 0.2555, "step": 1673 }, { "epoch": 0.7496221662468514, "grad_norm": 1.1956260204315186, "learning_rate": 3.22253000923361e-05, "loss": 0.2039, "step": 1674 }, { "epoch": 0.7500699692135461, "grad_norm": 1.2446436882019043, "learning_rate": 3.221375807940905e-05, "loss": 0.2794, "step": 1675 }, { "epoch": 0.7505177721802407, "grad_norm": 1.1709624528884888, "learning_rate": 3.2202216066481995e-05, "loss": 0.1036, "step": 1676 }, { "epoch": 0.7509655751469353, "grad_norm": 1.0143916606903076, "learning_rate": 3.2190674053554944e-05, "loss": 0.1106, "step": 1677 }, { "epoch": 0.75141337811363, "grad_norm": 1.3979700803756714, "learning_rate": 3.217913204062789e-05, "loss": 0.2883, "step": 1678 }, { "epoch": 0.7518611810803246, "grad_norm": 1.49893319606781, "learning_rate": 3.2167590027700836e-05, "loss": 0.3638, "step": 1679 }, { "epoch": 0.7523089840470193, "grad_norm": 1.2170891761779785, "learning_rate": 3.215604801477378e-05, "loss": 0.2359, "step": 1680 }, { "epoch": 0.7523089840470193, "eval_loss": 0.20157921314239502, "eval_runtime": 1735.3712, "eval_samples_per_second": 2.574, "eval_steps_per_second": 2.574, "step": 1680 }, { "epoch": 0.752756787013714, "grad_norm": 1.215842366218567, "learning_rate": 3.214450600184672e-05, "loss": 0.3012, "step": 1681 }, { "epoch": 0.7532045899804086, "grad_norm": 1.4815665483474731, "learning_rate": 3.213296398891967e-05, "loss": 0.1521, "step": 1682 }, { "epoch": 0.7536523929471033, "grad_norm": 1.3670876026153564, "learning_rate": 3.2121421975992614e-05, "loss": 0.1288, "step": 1683 }, { "epoch": 0.754100195913798, "grad_norm": 1.0420467853546143, "learning_rate": 3.2109879963065556e-05, "loss": 0.1539, "step": 1684 }, { "epoch": 0.7545479988804926, "grad_norm": 1.1879098415374756, "learning_rate": 3.2098337950138506e-05, "loss": 0.1517, "step": 1685 }, { "epoch": 0.7549958018471873, "grad_norm": 1.3479251861572266, "learning_rate": 3.208679593721145e-05, "loss": 0.26, "step": 1686 }, { "epoch": 0.7554436048138818, "grad_norm": 2.0593390464782715, "learning_rate": 3.20752539242844e-05, "loss": 0.5097, "step": 1687 }, { "epoch": 0.7558914077805765, "grad_norm": 1.2824532985687256, "learning_rate": 3.206371191135734e-05, "loss": 0.2245, "step": 1688 }, { "epoch": 0.7563392107472712, "grad_norm": 1.1562902927398682, "learning_rate": 3.205216989843029e-05, "loss": 0.1307, "step": 1689 }, { "epoch": 0.7567870137139658, "grad_norm": 0.6965445280075073, "learning_rate": 3.204062788550323e-05, "loss": 0.0786, "step": 1690 }, { "epoch": 0.7567870137139658, "eval_loss": 0.1975894570350647, "eval_runtime": 1736.2667, "eval_samples_per_second": 2.573, "eval_steps_per_second": 2.573, "step": 1690 }, { "epoch": 0.7572348166806605, "grad_norm": 1.2004364728927612, "learning_rate": 3.202908587257618e-05, "loss": 0.2382, "step": 1691 }, { "epoch": 0.7576826196473552, "grad_norm": 0.8067834377288818, "learning_rate": 3.2017543859649124e-05, "loss": 0.081, "step": 1692 }, { "epoch": 0.7581304226140498, "grad_norm": 1.2369734048843384, "learning_rate": 3.2006001846722074e-05, "loss": 0.1239, "step": 1693 }, { "epoch": 0.7585782255807445, "grad_norm": 1.0439966917037964, "learning_rate": 3.1994459833795016e-05, "loss": 0.1213, "step": 1694 }, { "epoch": 0.7590260285474392, "grad_norm": 0.9752621650695801, "learning_rate": 3.1982917820867966e-05, "loss": 0.1779, "step": 1695 }, { "epoch": 0.7594738315141338, "grad_norm": 1.333712100982666, "learning_rate": 3.197137580794091e-05, "loss": 0.1714, "step": 1696 }, { "epoch": 0.7599216344808284, "grad_norm": 1.0253794193267822, "learning_rate": 3.195983379501385e-05, "loss": 0.1056, "step": 1697 }, { "epoch": 0.7603694374475231, "grad_norm": 1.007927656173706, "learning_rate": 3.19482917820868e-05, "loss": 0.1837, "step": 1698 }, { "epoch": 0.7608172404142177, "grad_norm": 1.5209026336669922, "learning_rate": 3.193674976915974e-05, "loss": 0.1403, "step": 1699 }, { "epoch": 0.7612650433809124, "grad_norm": 1.6739745140075684, "learning_rate": 3.192520775623269e-05, "loss": 0.2188, "step": 1700 }, { "epoch": 0.7612650433809124, "eval_loss": 0.19731631875038147, "eval_runtime": 1732.9245, "eval_samples_per_second": 2.578, "eval_steps_per_second": 2.578, "step": 1700 }, { "epoch": 0.761712846347607, "grad_norm": 1.293473482131958, "learning_rate": 3.1913665743305635e-05, "loss": 0.1678, "step": 1701 }, { "epoch": 0.7621606493143017, "grad_norm": 1.1437653303146362, "learning_rate": 3.190212373037858e-05, "loss": 0.2575, "step": 1702 }, { "epoch": 0.7626084522809964, "grad_norm": 1.0263285636901855, "learning_rate": 3.189058171745152e-05, "loss": 0.2893, "step": 1703 }, { "epoch": 0.763056255247691, "grad_norm": 1.3024674654006958, "learning_rate": 3.187903970452447e-05, "loss": 0.1215, "step": 1704 }, { "epoch": 0.7635040582143857, "grad_norm": 1.4158358573913574, "learning_rate": 3.186749769159741e-05, "loss": 0.2106, "step": 1705 }, { "epoch": 0.7639518611810803, "grad_norm": 0.9089943170547485, "learning_rate": 3.185595567867036e-05, "loss": 0.1623, "step": 1706 }, { "epoch": 0.7643996641477749, "grad_norm": 1.3617576360702515, "learning_rate": 3.1844413665743304e-05, "loss": 0.2851, "step": 1707 }, { "epoch": 0.7648474671144696, "grad_norm": 1.1946085691452026, "learning_rate": 3.183287165281625e-05, "loss": 0.17, "step": 1708 }, { "epoch": 0.7652952700811643, "grad_norm": 1.0318683385849, "learning_rate": 3.1821329639889196e-05, "loss": 0.212, "step": 1709 }, { "epoch": 0.7657430730478589, "grad_norm": 2.042214870452881, "learning_rate": 3.1809787626962145e-05, "loss": 0.2701, "step": 1710 }, { "epoch": 0.7657430730478589, "eval_loss": 0.19515901803970337, "eval_runtime": 1728.648, "eval_samples_per_second": 2.584, "eval_steps_per_second": 2.584, "step": 1710 }, { "epoch": 0.7661908760145536, "grad_norm": 0.8775007724761963, "learning_rate": 3.179824561403509e-05, "loss": 0.2296, "step": 1711 }, { "epoch": 0.7666386789812483, "grad_norm": 0.797558605670929, "learning_rate": 3.178670360110804e-05, "loss": 0.1135, "step": 1712 }, { "epoch": 0.7670864819479429, "grad_norm": 1.1293439865112305, "learning_rate": 3.177516158818098e-05, "loss": 0.1872, "step": 1713 }, { "epoch": 0.7675342849146376, "grad_norm": 1.4189733266830444, "learning_rate": 3.176361957525393e-05, "loss": 0.2343, "step": 1714 }, { "epoch": 0.7679820878813323, "grad_norm": 1.0621099472045898, "learning_rate": 3.175207756232687e-05, "loss": 0.2167, "step": 1715 }, { "epoch": 0.7684298908480268, "grad_norm": 1.3043614625930786, "learning_rate": 3.174053554939982e-05, "loss": 0.3389, "step": 1716 }, { "epoch": 0.7688776938147215, "grad_norm": 0.9801878333091736, "learning_rate": 3.1728993536472764e-05, "loss": 0.102, "step": 1717 }, { "epoch": 0.7693254967814162, "grad_norm": 1.019263744354248, "learning_rate": 3.171745152354571e-05, "loss": 0.1738, "step": 1718 }, { "epoch": 0.7697732997481108, "grad_norm": 1.5536470413208008, "learning_rate": 3.1705909510618656e-05, "loss": 0.2715, "step": 1719 }, { "epoch": 0.7702211027148055, "grad_norm": 1.2832285165786743, "learning_rate": 3.16943674976916e-05, "loss": 0.1412, "step": 1720 }, { "epoch": 0.7702211027148055, "eval_loss": 0.19498112797737122, "eval_runtime": 1726.0687, "eval_samples_per_second": 2.588, "eval_steps_per_second": 2.588, "step": 1720 }, { "epoch": 0.7706689056815001, "grad_norm": 1.2903026342391968, "learning_rate": 3.168282548476455e-05, "loss": 0.175, "step": 1721 }, { "epoch": 0.7711167086481948, "grad_norm": 1.4443283081054688, "learning_rate": 3.167128347183749e-05, "loss": 0.245, "step": 1722 }, { "epoch": 0.7715645116148895, "grad_norm": 1.4279271364212036, "learning_rate": 3.165974145891043e-05, "loss": 0.2023, "step": 1723 }, { "epoch": 0.7720123145815841, "grad_norm": 0.9861289858818054, "learning_rate": 3.1648199445983376e-05, "loss": 0.2625, "step": 1724 }, { "epoch": 0.7724601175482787, "grad_norm": 1.038965106010437, "learning_rate": 3.1636657433056325e-05, "loss": 0.0921, "step": 1725 }, { "epoch": 0.7729079205149734, "grad_norm": 1.2597821950912476, "learning_rate": 3.162511542012927e-05, "loss": 0.1168, "step": 1726 }, { "epoch": 0.773355723481668, "grad_norm": 1.182401418685913, "learning_rate": 3.161357340720222e-05, "loss": 0.1085, "step": 1727 }, { "epoch": 0.7738035264483627, "grad_norm": 1.0120970010757446, "learning_rate": 3.160203139427516e-05, "loss": 0.0935, "step": 1728 }, { "epoch": 0.7742513294150574, "grad_norm": 1.0572340488433838, "learning_rate": 3.159048938134811e-05, "loss": 0.1821, "step": 1729 }, { "epoch": 0.774699132381752, "grad_norm": 1.1466655731201172, "learning_rate": 3.157894736842105e-05, "loss": 0.1782, "step": 1730 }, { "epoch": 0.774699132381752, "eval_loss": 0.19265195727348328, "eval_runtime": 1728.1783, "eval_samples_per_second": 2.585, "eval_steps_per_second": 2.585, "step": 1730 }, { "epoch": 0.7751469353484467, "grad_norm": 0.9281951189041138, "learning_rate": 3.1567405355494e-05, "loss": 0.0994, "step": 1731 }, { "epoch": 0.7755947383151414, "grad_norm": 1.089085578918457, "learning_rate": 3.1555863342566944e-05, "loss": 0.1445, "step": 1732 }, { "epoch": 0.776042541281836, "grad_norm": 1.286909580230713, "learning_rate": 3.154432132963989e-05, "loss": 0.2776, "step": 1733 }, { "epoch": 0.7764903442485307, "grad_norm": 1.348159909248352, "learning_rate": 3.1532779316712836e-05, "loss": 0.3719, "step": 1734 }, { "epoch": 0.7769381472152252, "grad_norm": 1.3471921682357788, "learning_rate": 3.1521237303785785e-05, "loss": 0.1505, "step": 1735 }, { "epoch": 0.7773859501819199, "grad_norm": 0.8467832207679749, "learning_rate": 3.150969529085873e-05, "loss": 0.0806, "step": 1736 }, { "epoch": 0.7778337531486146, "grad_norm": 1.291153907775879, "learning_rate": 3.149815327793168e-05, "loss": 0.2666, "step": 1737 }, { "epoch": 0.7782815561153092, "grad_norm": 1.6741520166397095, "learning_rate": 3.148661126500462e-05, "loss": 0.2447, "step": 1738 }, { "epoch": 0.7787293590820039, "grad_norm": 1.1718069314956665, "learning_rate": 3.147506925207757e-05, "loss": 0.2378, "step": 1739 }, { "epoch": 0.7791771620486986, "grad_norm": 1.080367088317871, "learning_rate": 3.146352723915051e-05, "loss": 0.2847, "step": 1740 }, { "epoch": 0.7791771620486986, "eval_loss": 0.19086259603500366, "eval_runtime": 1723.5914, "eval_samples_per_second": 2.592, "eval_steps_per_second": 2.592, "step": 1740 }, { "epoch": 0.7796249650153932, "grad_norm": 0.9889196753501892, "learning_rate": 3.1451985226223454e-05, "loss": 0.0902, "step": 1741 }, { "epoch": 0.7800727679820879, "grad_norm": 1.2149578332901, "learning_rate": 3.14404432132964e-05, "loss": 0.3468, "step": 1742 }, { "epoch": 0.7805205709487826, "grad_norm": 1.186657190322876, "learning_rate": 3.1428901200369346e-05, "loss": 0.1753, "step": 1743 }, { "epoch": 0.7809683739154772, "grad_norm": 0.8789498805999756, "learning_rate": 3.141735918744229e-05, "loss": 0.1154, "step": 1744 }, { "epoch": 0.7814161768821718, "grad_norm": 0.8804444074630737, "learning_rate": 3.140581717451524e-05, "loss": 0.1065, "step": 1745 }, { "epoch": 0.7818639798488665, "grad_norm": 1.1631603240966797, "learning_rate": 3.139427516158818e-05, "loss": 0.2699, "step": 1746 }, { "epoch": 0.7823117828155611, "grad_norm": 1.3118648529052734, "learning_rate": 3.1382733148661123e-05, "loss": 0.2221, "step": 1747 }, { "epoch": 0.7827595857822558, "grad_norm": 1.4373090267181396, "learning_rate": 3.137119113573407e-05, "loss": 0.1979, "step": 1748 }, { "epoch": 0.7832073887489505, "grad_norm": 1.4228724241256714, "learning_rate": 3.1359649122807015e-05, "loss": 0.2494, "step": 1749 }, { "epoch": 0.7836551917156451, "grad_norm": 1.2517611980438232, "learning_rate": 3.1348107109879965e-05, "loss": 0.1731, "step": 1750 }, { "epoch": 0.7836551917156451, "eval_loss": 0.18768282234668732, "eval_runtime": 1728.9333, "eval_samples_per_second": 2.584, "eval_steps_per_second": 2.584, "step": 1750 }, { "epoch": 0.7841029946823398, "grad_norm": 1.355423927307129, "learning_rate": 3.133656509695291e-05, "loss": 0.2274, "step": 1751 }, { "epoch": 0.7845507976490345, "grad_norm": 1.472356915473938, "learning_rate": 3.132502308402586e-05, "loss": 0.2816, "step": 1752 }, { "epoch": 0.7849986006157291, "grad_norm": 1.4461036920547485, "learning_rate": 3.13134810710988e-05, "loss": 0.2883, "step": 1753 }, { "epoch": 0.7854464035824237, "grad_norm": 1.3502665758132935, "learning_rate": 3.130193905817175e-05, "loss": 0.1779, "step": 1754 }, { "epoch": 0.7858942065491183, "grad_norm": 0.954616904258728, "learning_rate": 3.129039704524469e-05, "loss": 0.1198, "step": 1755 }, { "epoch": 0.786342009515813, "grad_norm": 0.916275143623352, "learning_rate": 3.127885503231764e-05, "loss": 0.1592, "step": 1756 }, { "epoch": 0.7867898124825077, "grad_norm": 1.4052035808563232, "learning_rate": 3.126731301939058e-05, "loss": 0.4243, "step": 1757 }, { "epoch": 0.7872376154492023, "grad_norm": 1.2046221494674683, "learning_rate": 3.125577100646353e-05, "loss": 0.2051, "step": 1758 }, { "epoch": 0.787685418415897, "grad_norm": 0.7101848125457764, "learning_rate": 3.1244228993536475e-05, "loss": 0.086, "step": 1759 }, { "epoch": 0.7881332213825917, "grad_norm": 1.0928889513015747, "learning_rate": 3.123268698060942e-05, "loss": 0.2118, "step": 1760 }, { "epoch": 0.7881332213825917, "eval_loss": 0.18407580256462097, "eval_runtime": 1725.0622, "eval_samples_per_second": 2.589, "eval_steps_per_second": 2.589, "step": 1760 }, { "epoch": 0.7885810243492863, "grad_norm": 1.032731294631958, "learning_rate": 3.122114496768237e-05, "loss": 0.1622, "step": 1761 }, { "epoch": 0.789028827315981, "grad_norm": 1.4532630443572998, "learning_rate": 3.120960295475531e-05, "loss": 0.1498, "step": 1762 }, { "epoch": 0.7894766302826757, "grad_norm": 0.9604041576385498, "learning_rate": 3.119806094182825e-05, "loss": 0.0749, "step": 1763 }, { "epoch": 0.7899244332493702, "grad_norm": 1.3322830200195312, "learning_rate": 3.11865189289012e-05, "loss": 0.2035, "step": 1764 }, { "epoch": 0.7903722362160649, "grad_norm": 0.9661667943000793, "learning_rate": 3.1174976915974145e-05, "loss": 0.1066, "step": 1765 }, { "epoch": 0.7908200391827596, "grad_norm": 1.23445463180542, "learning_rate": 3.1163434903047094e-05, "loss": 0.2315, "step": 1766 }, { "epoch": 0.7912678421494542, "grad_norm": 0.9666409492492676, "learning_rate": 3.1151892890120037e-05, "loss": 0.0874, "step": 1767 }, { "epoch": 0.7917156451161489, "grad_norm": 1.6275707483291626, "learning_rate": 3.1140350877192986e-05, "loss": 0.3015, "step": 1768 }, { "epoch": 0.7921634480828436, "grad_norm": 1.0952938795089722, "learning_rate": 3.112880886426593e-05, "loss": 0.1406, "step": 1769 }, { "epoch": 0.7926112510495382, "grad_norm": 1.5493882894515991, "learning_rate": 3.111726685133888e-05, "loss": 0.2373, "step": 1770 }, { "epoch": 0.7926112510495382, "eval_loss": 0.18200978636741638, "eval_runtime": 1742.6654, "eval_samples_per_second": 2.563, "eval_steps_per_second": 2.563, "step": 1770 }, { "epoch": 0.7930590540162329, "grad_norm": 1.2741529941558838, "learning_rate": 3.110572483841182e-05, "loss": 0.2427, "step": 1771 }, { "epoch": 0.7935068569829276, "grad_norm": 0.6875128149986267, "learning_rate": 3.109418282548476e-05, "loss": 0.0656, "step": 1772 }, { "epoch": 0.7939546599496222, "grad_norm": 1.2775335311889648, "learning_rate": 3.108264081255771e-05, "loss": 0.2231, "step": 1773 }, { "epoch": 0.7944024629163168, "grad_norm": 1.0050452947616577, "learning_rate": 3.1071098799630655e-05, "loss": 0.125, "step": 1774 }, { "epoch": 0.7948502658830114, "grad_norm": 1.2733467817306519, "learning_rate": 3.1059556786703604e-05, "loss": 0.1813, "step": 1775 }, { "epoch": 0.7952980688497061, "grad_norm": 1.0973080396652222, "learning_rate": 3.104801477377655e-05, "loss": 0.1617, "step": 1776 }, { "epoch": 0.7957458718164008, "grad_norm": 1.3167555332183838, "learning_rate": 3.1036472760849496e-05, "loss": 0.1869, "step": 1777 }, { "epoch": 0.7961936747830954, "grad_norm": 1.404090404510498, "learning_rate": 3.102493074792244e-05, "loss": 0.1659, "step": 1778 }, { "epoch": 0.7966414777497901, "grad_norm": 1.5618826150894165, "learning_rate": 3.101338873499539e-05, "loss": 0.2905, "step": 1779 }, { "epoch": 0.7970892807164848, "grad_norm": 1.1656345129013062, "learning_rate": 3.100184672206833e-05, "loss": 0.1532, "step": 1780 }, { "epoch": 0.7970892807164848, "eval_loss": 0.18052628636360168, "eval_runtime": 1741.697, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 1780 }, { "epoch": 0.7975370836831794, "grad_norm": 0.970252275466919, "learning_rate": 3.0990304709141274e-05, "loss": 0.1948, "step": 1781 }, { "epoch": 0.7979848866498741, "grad_norm": 1.1388535499572754, "learning_rate": 3.0978762696214216e-05, "loss": 0.1178, "step": 1782 }, { "epoch": 0.7984326896165687, "grad_norm": 1.0984439849853516, "learning_rate": 3.0967220683287166e-05, "loss": 0.1222, "step": 1783 }, { "epoch": 0.7988804925832633, "grad_norm": 1.2920303344726562, "learning_rate": 3.095567867036011e-05, "loss": 0.2673, "step": 1784 }, { "epoch": 0.799328295549958, "grad_norm": 0.9241845011711121, "learning_rate": 3.094413665743306e-05, "loss": 0.1297, "step": 1785 }, { "epoch": 0.7997760985166527, "grad_norm": 1.1384260654449463, "learning_rate": 3.0932594644506e-05, "loss": 0.1618, "step": 1786 }, { "epoch": 0.8002239014833473, "grad_norm": 1.1058518886566162, "learning_rate": 3.092105263157895e-05, "loss": 0.1894, "step": 1787 }, { "epoch": 0.800671704450042, "grad_norm": 1.4290608167648315, "learning_rate": 3.090951061865189e-05, "loss": 0.2922, "step": 1788 }, { "epoch": 0.8011195074167367, "grad_norm": 1.589939832687378, "learning_rate": 3.089796860572484e-05, "loss": 0.2531, "step": 1789 }, { "epoch": 0.8015673103834313, "grad_norm": 1.2515995502471924, "learning_rate": 3.0886426592797784e-05, "loss": 0.1397, "step": 1790 }, { "epoch": 0.8015673103834313, "eval_loss": 0.17911021411418915, "eval_runtime": 1743.1592, "eval_samples_per_second": 2.563, "eval_steps_per_second": 2.563, "step": 1790 }, { "epoch": 0.802015113350126, "grad_norm": 1.4639934301376343, "learning_rate": 3.0874884579870734e-05, "loss": 0.155, "step": 1791 }, { "epoch": 0.8024629163168207, "grad_norm": 1.5946593284606934, "learning_rate": 3.0863342566943676e-05, "loss": 0.335, "step": 1792 }, { "epoch": 0.8029107192835152, "grad_norm": 0.9911496639251709, "learning_rate": 3.0851800554016626e-05, "loss": 0.1262, "step": 1793 }, { "epoch": 0.8033585222502099, "grad_norm": 0.9423514604568481, "learning_rate": 3.084025854108957e-05, "loss": 0.1645, "step": 1794 }, { "epoch": 0.8038063252169045, "grad_norm": 1.6770713329315186, "learning_rate": 3.082871652816252e-05, "loss": 0.2112, "step": 1795 }, { "epoch": 0.8042541281835992, "grad_norm": 1.091598629951477, "learning_rate": 3.081717451523546e-05, "loss": 0.1308, "step": 1796 }, { "epoch": 0.8047019311502939, "grad_norm": 1.0683177709579468, "learning_rate": 3.08056325023084e-05, "loss": 0.2161, "step": 1797 }, { "epoch": 0.8051497341169885, "grad_norm": 1.2820827960968018, "learning_rate": 3.079409048938135e-05, "loss": 0.1894, "step": 1798 }, { "epoch": 0.8055975370836832, "grad_norm": 1.0889778137207031, "learning_rate": 3.0782548476454295e-05, "loss": 0.1841, "step": 1799 }, { "epoch": 0.8060453400503779, "grad_norm": 0.9384577870368958, "learning_rate": 3.0771006463527244e-05, "loss": 0.1522, "step": 1800 }, { "epoch": 0.8060453400503779, "eval_loss": 0.1766439974308014, "eval_runtime": 1744.9642, "eval_samples_per_second": 2.56, "eval_steps_per_second": 2.56, "step": 1800 }, { "epoch": 0.8064931430170725, "grad_norm": 1.1987580060958862, "learning_rate": 3.075946445060019e-05, "loss": 0.1167, "step": 1801 }, { "epoch": 0.8069409459837671, "grad_norm": 1.2142102718353271, "learning_rate": 3.074792243767313e-05, "loss": 0.1917, "step": 1802 }, { "epoch": 0.8073887489504618, "grad_norm": 1.063592553138733, "learning_rate": 3.073638042474607e-05, "loss": 0.196, "step": 1803 }, { "epoch": 0.8078365519171564, "grad_norm": 0.9051306843757629, "learning_rate": 3.072483841181902e-05, "loss": 0.104, "step": 1804 }, { "epoch": 0.8082843548838511, "grad_norm": 1.501448392868042, "learning_rate": 3.0713296398891964e-05, "loss": 0.1321, "step": 1805 }, { "epoch": 0.8087321578505458, "grad_norm": 1.3337589502334595, "learning_rate": 3.0701754385964913e-05, "loss": 0.1714, "step": 1806 }, { "epoch": 0.8091799608172404, "grad_norm": 1.1337471008300781, "learning_rate": 3.0690212373037856e-05, "loss": 0.1375, "step": 1807 }, { "epoch": 0.8096277637839351, "grad_norm": 0.8848599195480347, "learning_rate": 3.0678670360110805e-05, "loss": 0.0759, "step": 1808 }, { "epoch": 0.8100755667506297, "grad_norm": 1.6946122646331787, "learning_rate": 3.066712834718375e-05, "loss": 0.1689, "step": 1809 }, { "epoch": 0.8105233697173244, "grad_norm": 1.4061602354049683, "learning_rate": 3.06555863342567e-05, "loss": 0.2621, "step": 1810 }, { "epoch": 0.8105233697173244, "eval_loss": 0.17439012229442596, "eval_runtime": 1743.6105, "eval_samples_per_second": 2.562, "eval_steps_per_second": 2.562, "step": 1810 }, { "epoch": 0.8109711726840191, "grad_norm": 1.1560529470443726, "learning_rate": 3.064404432132964e-05, "loss": 0.1473, "step": 1811 }, { "epoch": 0.8114189756507136, "grad_norm": 1.1303647756576538, "learning_rate": 3.063250230840259e-05, "loss": 0.1474, "step": 1812 }, { "epoch": 0.8118667786174083, "grad_norm": 1.3138867616653442, "learning_rate": 3.062096029547553e-05, "loss": 0.1713, "step": 1813 }, { "epoch": 0.812314581584103, "grad_norm": 1.5874935388565063, "learning_rate": 3.060941828254848e-05, "loss": 0.2144, "step": 1814 }, { "epoch": 0.8127623845507976, "grad_norm": 1.0073288679122925, "learning_rate": 3.0597876269621424e-05, "loss": 0.1259, "step": 1815 }, { "epoch": 0.8132101875174923, "grad_norm": 1.4054961204528809, "learning_rate": 3.058633425669437e-05, "loss": 0.1752, "step": 1816 }, { "epoch": 0.813657990484187, "grad_norm": 1.0050817728042603, "learning_rate": 3.0574792243767316e-05, "loss": 0.1, "step": 1817 }, { "epoch": 0.8141057934508816, "grad_norm": 0.8871022462844849, "learning_rate": 3.0563250230840265e-05, "loss": 0.0854, "step": 1818 }, { "epoch": 0.8145535964175763, "grad_norm": 0.8566880822181702, "learning_rate": 3.055170821791321e-05, "loss": 0.0894, "step": 1819 }, { "epoch": 0.815001399384271, "grad_norm": 1.0436166524887085, "learning_rate": 3.054016620498615e-05, "loss": 0.1256, "step": 1820 }, { "epoch": 0.815001399384271, "eval_loss": 0.17154671251773834, "eval_runtime": 1740.5344, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 1820 }, { "epoch": 0.8154492023509656, "grad_norm": 1.0768319368362427, "learning_rate": 3.052862419205909e-05, "loss": 0.1093, "step": 1821 }, { "epoch": 0.8158970053176602, "grad_norm": 0.9550846219062805, "learning_rate": 3.051708217913204e-05, "loss": 0.1221, "step": 1822 }, { "epoch": 0.8163448082843549, "grad_norm": 1.3339128494262695, "learning_rate": 3.050554016620499e-05, "loss": 0.3042, "step": 1823 }, { "epoch": 0.8167926112510495, "grad_norm": 1.2556220293045044, "learning_rate": 3.049399815327793e-05, "loss": 0.1797, "step": 1824 }, { "epoch": 0.8172404142177442, "grad_norm": 1.1306003332138062, "learning_rate": 3.048245614035088e-05, "loss": 0.1557, "step": 1825 }, { "epoch": 0.8176882171844388, "grad_norm": 1.104524850845337, "learning_rate": 3.0470914127423823e-05, "loss": 0.1996, "step": 1826 }, { "epoch": 0.8181360201511335, "grad_norm": 1.4180043935775757, "learning_rate": 3.045937211449677e-05, "loss": 0.1997, "step": 1827 }, { "epoch": 0.8185838231178282, "grad_norm": 1.6681851148605347, "learning_rate": 3.0447830101569712e-05, "loss": 0.1433, "step": 1828 }, { "epoch": 0.8190316260845228, "grad_norm": 1.2225019931793213, "learning_rate": 3.043628808864266e-05, "loss": 0.1352, "step": 1829 }, { "epoch": 0.8194794290512175, "grad_norm": 1.310770034790039, "learning_rate": 3.0424746075715604e-05, "loss": 0.135, "step": 1830 }, { "epoch": 0.8194794290512175, "eval_loss": 0.16900593042373657, "eval_runtime": 1740.5996, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 1830 }, { "epoch": 0.8199272320179121, "grad_norm": 1.2285445928573608, "learning_rate": 3.0413204062788553e-05, "loss": 0.1948, "step": 1831 }, { "epoch": 0.8203750349846067, "grad_norm": 1.2398518323898315, "learning_rate": 3.0401662049861496e-05, "loss": 0.2515, "step": 1832 }, { "epoch": 0.8208228379513014, "grad_norm": 1.177215814590454, "learning_rate": 3.0390120036934445e-05, "loss": 0.1068, "step": 1833 }, { "epoch": 0.8212706409179961, "grad_norm": 1.2884360551834106, "learning_rate": 3.0378578024007388e-05, "loss": 0.2032, "step": 1834 }, { "epoch": 0.8217184438846907, "grad_norm": 1.3158836364746094, "learning_rate": 3.0367036011080334e-05, "loss": 0.1355, "step": 1835 }, { "epoch": 0.8221662468513854, "grad_norm": 1.5116773843765259, "learning_rate": 3.035549399815328e-05, "loss": 0.2563, "step": 1836 }, { "epoch": 0.8226140498180801, "grad_norm": 0.8042264580726624, "learning_rate": 3.0343951985226226e-05, "loss": 0.0907, "step": 1837 }, { "epoch": 0.8230618527847747, "grad_norm": 0.9426969289779663, "learning_rate": 3.0332409972299168e-05, "loss": 0.1182, "step": 1838 }, { "epoch": 0.8235096557514694, "grad_norm": 1.0019307136535645, "learning_rate": 3.0320867959372118e-05, "loss": 0.1143, "step": 1839 }, { "epoch": 0.8239574587181641, "grad_norm": 1.2967493534088135, "learning_rate": 3.030932594644506e-05, "loss": 0.1422, "step": 1840 }, { "epoch": 0.8239574587181641, "eval_loss": 0.16731417179107666, "eval_runtime": 1740.7881, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 1840 }, { "epoch": 0.8244052616848586, "grad_norm": 1.4186187982559204, "learning_rate": 3.029778393351801e-05, "loss": 0.214, "step": 1841 }, { "epoch": 0.8248530646515533, "grad_norm": 0.8762224316596985, "learning_rate": 3.0286241920590952e-05, "loss": 0.1384, "step": 1842 }, { "epoch": 0.825300867618248, "grad_norm": 1.3150262832641602, "learning_rate": 3.02746999076639e-05, "loss": 0.1528, "step": 1843 }, { "epoch": 0.8257486705849426, "grad_norm": 1.600975513458252, "learning_rate": 3.0263157894736844e-05, "loss": 0.1479, "step": 1844 }, { "epoch": 0.8261964735516373, "grad_norm": 1.3228427171707153, "learning_rate": 3.025161588180979e-05, "loss": 0.128, "step": 1845 }, { "epoch": 0.8266442765183319, "grad_norm": 1.5124461650848389, "learning_rate": 3.0240073868882733e-05, "loss": 0.1793, "step": 1846 }, { "epoch": 0.8270920794850266, "grad_norm": 1.3643213510513306, "learning_rate": 3.022853185595568e-05, "loss": 0.2281, "step": 1847 }, { "epoch": 0.8275398824517213, "grad_norm": 1.2589600086212158, "learning_rate": 3.0216989843028625e-05, "loss": 0.2255, "step": 1848 }, { "epoch": 0.8279876854184159, "grad_norm": 0.9553655385971069, "learning_rate": 3.0205447830101567e-05, "loss": 0.1416, "step": 1849 }, { "epoch": 0.8284354883851105, "grad_norm": 1.096134901046753, "learning_rate": 3.0193905817174517e-05, "loss": 0.1515, "step": 1850 }, { "epoch": 0.8284354883851105, "eval_loss": 0.1658756136894226, "eval_runtime": 1741.1344, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 1850 }, { "epoch": 0.8288832913518052, "grad_norm": 1.5498076677322388, "learning_rate": 3.018236380424746e-05, "loss": 0.1799, "step": 1851 }, { "epoch": 0.8293310943184998, "grad_norm": 1.2013968229293823, "learning_rate": 3.017082179132041e-05, "loss": 0.1325, "step": 1852 }, { "epoch": 0.8297788972851945, "grad_norm": 1.2413219213485718, "learning_rate": 3.015927977839335e-05, "loss": 0.1424, "step": 1853 }, { "epoch": 0.8302267002518892, "grad_norm": 1.4597523212432861, "learning_rate": 3.01477377654663e-05, "loss": 0.1868, "step": 1854 }, { "epoch": 0.8306745032185838, "grad_norm": 0.7172173261642456, "learning_rate": 3.0136195752539243e-05, "loss": 0.0705, "step": 1855 }, { "epoch": 0.8311223061852785, "grad_norm": 1.2041913270950317, "learning_rate": 3.012465373961219e-05, "loss": 0.2077, "step": 1856 }, { "epoch": 0.8315701091519732, "grad_norm": 1.3840017318725586, "learning_rate": 3.0113111726685132e-05, "loss": 0.2763, "step": 1857 }, { "epoch": 0.8320179121186678, "grad_norm": 1.2809327840805054, "learning_rate": 3.010156971375808e-05, "loss": 0.2566, "step": 1858 }, { "epoch": 0.8324657150853625, "grad_norm": 0.8672228455543518, "learning_rate": 3.0090027700831024e-05, "loss": 0.1191, "step": 1859 }, { "epoch": 0.832913518052057, "grad_norm": 1.0470631122589111, "learning_rate": 3.0078485687903973e-05, "loss": 0.0841, "step": 1860 }, { "epoch": 0.832913518052057, "eval_loss": 0.16294632852077484, "eval_runtime": 1740.0658, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 1860 }, { "epoch": 0.8333613210187517, "grad_norm": 1.225232720375061, "learning_rate": 3.0066943674976916e-05, "loss": 0.1085, "step": 1861 }, { "epoch": 0.8338091239854464, "grad_norm": 1.1180237531661987, "learning_rate": 3.0055401662049865e-05, "loss": 0.3731, "step": 1862 }, { "epoch": 0.834256926952141, "grad_norm": 1.4320118427276611, "learning_rate": 3.0043859649122808e-05, "loss": 0.2216, "step": 1863 }, { "epoch": 0.8347047299188357, "grad_norm": 0.8964130282402039, "learning_rate": 3.0032317636195757e-05, "loss": 0.1404, "step": 1864 }, { "epoch": 0.8351525328855304, "grad_norm": 1.3593807220458984, "learning_rate": 3.00207756232687e-05, "loss": 0.2552, "step": 1865 }, { "epoch": 0.835600335852225, "grad_norm": 1.2691015005111694, "learning_rate": 3.0009233610341646e-05, "loss": 0.151, "step": 1866 }, { "epoch": 0.8360481388189197, "grad_norm": 1.2501640319824219, "learning_rate": 2.999769159741459e-05, "loss": 0.2293, "step": 1867 }, { "epoch": 0.8364959417856144, "grad_norm": 0.9922981262207031, "learning_rate": 2.9986149584487538e-05, "loss": 0.0955, "step": 1868 }, { "epoch": 0.836943744752309, "grad_norm": 1.6564064025878906, "learning_rate": 2.997460757156048e-05, "loss": 0.2517, "step": 1869 }, { "epoch": 0.8373915477190036, "grad_norm": 1.8210774660110474, "learning_rate": 2.996306555863343e-05, "loss": 0.2969, "step": 1870 }, { "epoch": 0.8373915477190036, "eval_loss": 0.1613447070121765, "eval_runtime": 1743.648, "eval_samples_per_second": 2.562, "eval_steps_per_second": 2.562, "step": 1870 }, { "epoch": 0.8378393506856983, "grad_norm": 1.4837088584899902, "learning_rate": 2.9951523545706373e-05, "loss": 0.1553, "step": 1871 }, { "epoch": 0.8382871536523929, "grad_norm": 0.8095635175704956, "learning_rate": 2.9939981532779315e-05, "loss": 0.0953, "step": 1872 }, { "epoch": 0.8387349566190876, "grad_norm": 0.9664444327354431, "learning_rate": 2.9928439519852265e-05, "loss": 0.1132, "step": 1873 }, { "epoch": 0.8391827595857823, "grad_norm": 1.040560245513916, "learning_rate": 2.9916897506925207e-05, "loss": 0.1133, "step": 1874 }, { "epoch": 0.8396305625524769, "grad_norm": 1.034613013267517, "learning_rate": 2.9905355493998157e-05, "loss": 0.0934, "step": 1875 }, { "epoch": 0.8400783655191716, "grad_norm": 1.4855910539627075, "learning_rate": 2.98938134810711e-05, "loss": 0.1679, "step": 1876 }, { "epoch": 0.8405261684858663, "grad_norm": 1.272095799446106, "learning_rate": 2.9882271468144045e-05, "loss": 0.2133, "step": 1877 }, { "epoch": 0.8409739714525609, "grad_norm": 0.9249767661094666, "learning_rate": 2.9870729455216988e-05, "loss": 0.1258, "step": 1878 }, { "epoch": 0.8414217744192555, "grad_norm": 1.2314205169677734, "learning_rate": 2.9859187442289937e-05, "loss": 0.1198, "step": 1879 }, { "epoch": 0.8418695773859501, "grad_norm": 1.135740876197815, "learning_rate": 2.984764542936288e-05, "loss": 0.1507, "step": 1880 }, { "epoch": 0.8418695773859501, "eval_loss": 0.1596778929233551, "eval_runtime": 1744.6636, "eval_samples_per_second": 2.56, "eval_steps_per_second": 2.56, "step": 1880 }, { "epoch": 0.8423173803526448, "grad_norm": 0.9603174328804016, "learning_rate": 2.983610341643583e-05, "loss": 0.1832, "step": 1881 }, { "epoch": 0.8427651833193395, "grad_norm": 0.9922119975090027, "learning_rate": 2.9824561403508772e-05, "loss": 0.1203, "step": 1882 }, { "epoch": 0.8432129862860341, "grad_norm": 1.2410778999328613, "learning_rate": 2.981301939058172e-05, "loss": 0.2341, "step": 1883 }, { "epoch": 0.8436607892527288, "grad_norm": 1.2529115676879883, "learning_rate": 2.9801477377654664e-05, "loss": 0.1256, "step": 1884 }, { "epoch": 0.8441085922194235, "grad_norm": 1.0369658470153809, "learning_rate": 2.978993536472761e-05, "loss": 0.2077, "step": 1885 }, { "epoch": 0.8445563951861181, "grad_norm": 0.8883606791496277, "learning_rate": 2.9778393351800556e-05, "loss": 0.0912, "step": 1886 }, { "epoch": 0.8450041981528128, "grad_norm": 1.1145747900009155, "learning_rate": 2.9766851338873502e-05, "loss": 0.169, "step": 1887 }, { "epoch": 0.8454520011195075, "grad_norm": 1.1187107563018799, "learning_rate": 2.9755309325946444e-05, "loss": 0.1202, "step": 1888 }, { "epoch": 0.845899804086202, "grad_norm": 1.2027275562286377, "learning_rate": 2.9743767313019394e-05, "loss": 0.1931, "step": 1889 }, { "epoch": 0.8463476070528967, "grad_norm": 1.1199167966842651, "learning_rate": 2.9732225300092336e-05, "loss": 0.1389, "step": 1890 }, { "epoch": 0.8463476070528967, "eval_loss": 0.157403826713562, "eval_runtime": 1740.7291, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 1890 }, { "epoch": 0.8467954100195914, "grad_norm": 1.1820210218429565, "learning_rate": 2.9720683287165286e-05, "loss": 0.1506, "step": 1891 }, { "epoch": 0.847243212986286, "grad_norm": 1.314258337020874, "learning_rate": 2.970914127423823e-05, "loss": 0.18, "step": 1892 }, { "epoch": 0.8476910159529807, "grad_norm": 1.2523696422576904, "learning_rate": 2.9697599261311178e-05, "loss": 0.1401, "step": 1893 }, { "epoch": 0.8481388189196754, "grad_norm": 1.6710721254348755, "learning_rate": 2.968605724838412e-05, "loss": 0.1575, "step": 1894 }, { "epoch": 0.84858662188637, "grad_norm": 1.1848293542861938, "learning_rate": 2.9674515235457066e-05, "loss": 0.1888, "step": 1895 }, { "epoch": 0.8490344248530647, "grad_norm": 1.042680263519287, "learning_rate": 2.966297322253001e-05, "loss": 0.1032, "step": 1896 }, { "epoch": 0.8494822278197594, "grad_norm": 1.2438651323318481, "learning_rate": 2.9651431209602958e-05, "loss": 0.1433, "step": 1897 }, { "epoch": 0.8499300307864539, "grad_norm": 1.0681265592575073, "learning_rate": 2.96398891966759e-05, "loss": 0.1787, "step": 1898 }, { "epoch": 0.8503778337531486, "grad_norm": 1.2646775245666504, "learning_rate": 2.9628347183748844e-05, "loss": 0.1186, "step": 1899 }, { "epoch": 0.8508256367198432, "grad_norm": 1.3411589860916138, "learning_rate": 2.9616805170821793e-05, "loss": 0.25, "step": 1900 }, { "epoch": 0.8508256367198432, "eval_loss": 0.1552475243806839, "eval_runtime": 1740.6806, "eval_samples_per_second": 2.566, "eval_steps_per_second": 2.566, "step": 1900 }, { "epoch": 0.8512734396865379, "grad_norm": 1.544445276260376, "learning_rate": 2.9605263157894735e-05, "loss": 0.2102, "step": 1901 }, { "epoch": 0.8517212426532326, "grad_norm": 1.0015203952789307, "learning_rate": 2.9593721144967685e-05, "loss": 0.0845, "step": 1902 }, { "epoch": 0.8521690456199272, "grad_norm": 0.8234665393829346, "learning_rate": 2.9582179132040627e-05, "loss": 0.0787, "step": 1903 }, { "epoch": 0.8526168485866219, "grad_norm": 1.0346554517745972, "learning_rate": 2.9570637119113577e-05, "loss": 0.0953, "step": 1904 }, { "epoch": 0.8530646515533166, "grad_norm": 1.5251343250274658, "learning_rate": 2.955909510618652e-05, "loss": 0.1945, "step": 1905 }, { "epoch": 0.8535124545200112, "grad_norm": 1.2917299270629883, "learning_rate": 2.9547553093259465e-05, "loss": 0.191, "step": 1906 }, { "epoch": 0.8539602574867059, "grad_norm": 1.1073006391525269, "learning_rate": 2.9536011080332408e-05, "loss": 0.1684, "step": 1907 }, { "epoch": 0.8544080604534005, "grad_norm": 1.1586658954620361, "learning_rate": 2.9524469067405357e-05, "loss": 0.1412, "step": 1908 }, { "epoch": 0.8548558634200951, "grad_norm": 0.9181356430053711, "learning_rate": 2.95129270544783e-05, "loss": 0.1547, "step": 1909 }, { "epoch": 0.8553036663867898, "grad_norm": 1.1670259237289429, "learning_rate": 2.950138504155125e-05, "loss": 0.2027, "step": 1910 }, { "epoch": 0.8553036663867898, "eval_loss": 0.15608276426792145, "eval_runtime": 1740.3593, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 1910 }, { "epoch": 0.8557514693534845, "grad_norm": 0.8703057765960693, "learning_rate": 2.9489843028624192e-05, "loss": 0.1132, "step": 1911 }, { "epoch": 0.8561992723201791, "grad_norm": 1.286847472190857, "learning_rate": 2.947830101569714e-05, "loss": 0.2887, "step": 1912 }, { "epoch": 0.8566470752868738, "grad_norm": 1.367441177368164, "learning_rate": 2.9466759002770084e-05, "loss": 0.1678, "step": 1913 }, { "epoch": 0.8570948782535684, "grad_norm": 1.5393425226211548, "learning_rate": 2.9455216989843033e-05, "loss": 0.3088, "step": 1914 }, { "epoch": 0.8575426812202631, "grad_norm": 1.5887715816497803, "learning_rate": 2.9443674976915976e-05, "loss": 0.1537, "step": 1915 }, { "epoch": 0.8579904841869578, "grad_norm": 1.3844943046569824, "learning_rate": 2.9432132963988922e-05, "loss": 0.2481, "step": 1916 }, { "epoch": 0.8584382871536524, "grad_norm": 1.306236743927002, "learning_rate": 2.9420590951061865e-05, "loss": 0.1246, "step": 1917 }, { "epoch": 0.858886090120347, "grad_norm": 1.9353609085083008, "learning_rate": 2.9409048938134814e-05, "loss": 0.2403, "step": 1918 }, { "epoch": 0.8593338930870417, "grad_norm": 1.0708073377609253, "learning_rate": 2.9397506925207757e-05, "loss": 0.1256, "step": 1919 }, { "epoch": 0.8597816960537363, "grad_norm": 1.6582993268966675, "learning_rate": 2.9385964912280706e-05, "loss": 0.1708, "step": 1920 }, { "epoch": 0.8597816960537363, "eval_loss": 0.15275830030441284, "eval_runtime": 1741.7262, "eval_samples_per_second": 2.565, "eval_steps_per_second": 2.565, "step": 1920 }, { "epoch": 0.860229499020431, "grad_norm": 1.1031070947647095, "learning_rate": 2.937442289935365e-05, "loss": 0.1313, "step": 1921 }, { "epoch": 0.8606773019871257, "grad_norm": 1.6089885234832764, "learning_rate": 2.9362880886426598e-05, "loss": 0.1299, "step": 1922 }, { "epoch": 0.8611251049538203, "grad_norm": 1.2939296960830688, "learning_rate": 2.935133887349954e-05, "loss": 0.1808, "step": 1923 }, { "epoch": 0.861572907920515, "grad_norm": 0.9729406237602234, "learning_rate": 2.9339796860572483e-05, "loss": 0.1002, "step": 1924 }, { "epoch": 0.8620207108872097, "grad_norm": 1.3479326963424683, "learning_rate": 2.9328254847645433e-05, "loss": 0.1815, "step": 1925 }, { "epoch": 0.8624685138539043, "grad_norm": 1.0860453844070435, "learning_rate": 2.9316712834718375e-05, "loss": 0.2335, "step": 1926 }, { "epoch": 0.8629163168205989, "grad_norm": 1.1711890697479248, "learning_rate": 2.930517082179132e-05, "loss": 0.1488, "step": 1927 }, { "epoch": 0.8633641197872935, "grad_norm": 0.9251280426979065, "learning_rate": 2.9293628808864264e-05, "loss": 0.119, "step": 1928 }, { "epoch": 0.8638119227539882, "grad_norm": 1.112289309501648, "learning_rate": 2.9282086795937213e-05, "loss": 0.137, "step": 1929 }, { "epoch": 0.8642597257206829, "grad_norm": 1.7165166139602661, "learning_rate": 2.9270544783010156e-05, "loss": 0.1479, "step": 1930 }, { "epoch": 0.8642597257206829, "eval_loss": 0.14976796507835388, "eval_runtime": 1741.861, "eval_samples_per_second": 2.564, "eval_steps_per_second": 2.564, "step": 1930 }, { "epoch": 0.8647075286873775, "grad_norm": 0.929421067237854, "learning_rate": 2.9259002770083105e-05, "loss": 0.1143, "step": 1931 }, { "epoch": 0.8651553316540722, "grad_norm": 0.8255317211151123, "learning_rate": 2.9247460757156048e-05, "loss": 0.0891, "step": 1932 }, { "epoch": 0.8656031346207669, "grad_norm": 0.8592564463615417, "learning_rate": 2.9235918744228997e-05, "loss": 0.0861, "step": 1933 }, { "epoch": 0.8660509375874615, "grad_norm": 1.107157588005066, "learning_rate": 2.922437673130194e-05, "loss": 0.135, "step": 1934 }, { "epoch": 0.8664987405541562, "grad_norm": 1.6733407974243164, "learning_rate": 2.9212834718374886e-05, "loss": 0.1944, "step": 1935 }, { "epoch": 0.8669465435208509, "grad_norm": 1.3344576358795166, "learning_rate": 2.9201292705447832e-05, "loss": 0.1332, "step": 1936 }, { "epoch": 0.8673943464875454, "grad_norm": 1.0710251331329346, "learning_rate": 2.9189750692520778e-05, "loss": 0.2112, "step": 1937 }, { "epoch": 0.8678421494542401, "grad_norm": 1.2376161813735962, "learning_rate": 2.917820867959372e-05, "loss": 0.1467, "step": 1938 }, { "epoch": 0.8682899524209348, "grad_norm": 1.2241551876068115, "learning_rate": 2.916666666666667e-05, "loss": 0.1139, "step": 1939 }, { "epoch": 0.8687377553876294, "grad_norm": 0.8482950329780579, "learning_rate": 2.9155124653739612e-05, "loss": 0.0913, "step": 1940 }, { "epoch": 0.8687377553876294, "eval_loss": 0.1484833061695099, "eval_runtime": 1744.4985, "eval_samples_per_second": 2.561, "eval_steps_per_second": 2.561, "step": 1940 }, { "epoch": 0.8691855583543241, "grad_norm": 0.715686559677124, "learning_rate": 2.9143582640812562e-05, "loss": 0.0635, "step": 1941 }, { "epoch": 0.8696333613210188, "grad_norm": 1.4325870275497437, "learning_rate": 2.9132040627885504e-05, "loss": 0.2724, "step": 1942 }, { "epoch": 0.8700811642877134, "grad_norm": 1.6713802814483643, "learning_rate": 2.9120498614958454e-05, "loss": 0.2227, "step": 1943 }, { "epoch": 0.8705289672544081, "grad_norm": 1.6410531997680664, "learning_rate": 2.9108956602031396e-05, "loss": 0.3713, "step": 1944 }, { "epoch": 0.8709767702211028, "grad_norm": 1.0280667543411255, "learning_rate": 2.9097414589104342e-05, "loss": 0.1245, "step": 1945 }, { "epoch": 0.8714245731877974, "grad_norm": 1.0153021812438965, "learning_rate": 2.9085872576177285e-05, "loss": 0.1004, "step": 1946 }, { "epoch": 0.871872376154492, "grad_norm": 0.8973650336265564, "learning_rate": 2.9074330563250234e-05, "loss": 0.1089, "step": 1947 }, { "epoch": 0.8723201791211866, "grad_norm": 0.9956257939338684, "learning_rate": 2.9062788550323177e-05, "loss": 0.0945, "step": 1948 }, { "epoch": 0.8727679820878813, "grad_norm": 0.9029696583747864, "learning_rate": 2.905124653739612e-05, "loss": 0.1923, "step": 1949 }, { "epoch": 0.873215785054576, "grad_norm": 1.4042998552322388, "learning_rate": 2.903970452446907e-05, "loss": 0.0725, "step": 1950 }, { "epoch": 0.873215785054576, "eval_loss": 0.14708450436592102, "eval_runtime": 1740.0099, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 1950 }, { "epoch": 0.8736635880212706, "grad_norm": 1.163150668144226, "learning_rate": 2.902816251154201e-05, "loss": 0.0904, "step": 1951 }, { "epoch": 0.8741113909879653, "grad_norm": 0.999988317489624, "learning_rate": 2.901662049861496e-05, "loss": 0.1097, "step": 1952 }, { "epoch": 0.87455919395466, "grad_norm": 1.109708547592163, "learning_rate": 2.9005078485687904e-05, "loss": 0.0968, "step": 1953 }, { "epoch": 0.8750069969213546, "grad_norm": 1.0820552110671997, "learning_rate": 2.8993536472760853e-05, "loss": 0.1765, "step": 1954 }, { "epoch": 0.8754547998880493, "grad_norm": 1.5995961427688599, "learning_rate": 2.8981994459833796e-05, "loss": 0.1388, "step": 1955 }, { "epoch": 0.8759026028547439, "grad_norm": 1.3009573221206665, "learning_rate": 2.897045244690674e-05, "loss": 0.132, "step": 1956 }, { "epoch": 0.8763504058214385, "grad_norm": 1.079586386680603, "learning_rate": 2.8958910433979684e-05, "loss": 0.1795, "step": 1957 }, { "epoch": 0.8767982087881332, "grad_norm": 1.421583890914917, "learning_rate": 2.8947368421052634e-05, "loss": 0.1738, "step": 1958 }, { "epoch": 0.8772460117548279, "grad_norm": 1.9353923797607422, "learning_rate": 2.8935826408125576e-05, "loss": 0.5605, "step": 1959 }, { "epoch": 0.8776938147215225, "grad_norm": 1.5885474681854248, "learning_rate": 2.8924284395198525e-05, "loss": 0.2418, "step": 1960 }, { "epoch": 0.8776938147215225, "eval_loss": 0.14678245782852173, "eval_runtime": 1739.877, "eval_samples_per_second": 2.567, "eval_steps_per_second": 2.567, "step": 1960 }, { "epoch": 0.8781416176882172, "grad_norm": 1.3782999515533447, "learning_rate": 2.8912742382271468e-05, "loss": 0.1732, "step": 1961 }, { "epoch": 0.8785894206549119, "grad_norm": 1.1999086141586304, "learning_rate": 2.8901200369344417e-05, "loss": 0.1317, "step": 1962 }, { "epoch": 0.8790372236216065, "grad_norm": 1.050298810005188, "learning_rate": 2.888965835641736e-05, "loss": 0.1355, "step": 1963 }, { "epoch": 0.8794850265883012, "grad_norm": 1.6739578247070312, "learning_rate": 2.887811634349031e-05, "loss": 0.2034, "step": 1964 }, { "epoch": 0.8799328295549959, "grad_norm": 1.3336613178253174, "learning_rate": 2.8866574330563252e-05, "loss": 0.1467, "step": 1965 }, { "epoch": 0.8803806325216904, "grad_norm": 1.1149473190307617, "learning_rate": 2.8855032317636198e-05, "loss": 0.1364, "step": 1966 }, { "epoch": 0.8808284354883851, "grad_norm": 0.8522685766220093, "learning_rate": 2.884349030470914e-05, "loss": 0.095, "step": 1967 }, { "epoch": 0.8812762384550797, "grad_norm": 1.3482296466827393, "learning_rate": 2.883194829178209e-05, "loss": 0.2734, "step": 1968 }, { "epoch": 0.8817240414217744, "grad_norm": 0.894426703453064, "learning_rate": 2.8820406278855033e-05, "loss": 0.0841, "step": 1969 }, { "epoch": 0.8821718443884691, "grad_norm": 1.0606603622436523, "learning_rate": 2.8808864265927982e-05, "loss": 0.1532, "step": 1970 }, { "epoch": 0.8821718443884691, "eval_loss": 0.14509204030036926, "eval_runtime": 1739.6153, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 1970 }, { "epoch": 0.8826196473551637, "grad_norm": 1.30607271194458, "learning_rate": 2.8797322253000925e-05, "loss": 0.212, "step": 1971 }, { "epoch": 0.8830674503218584, "grad_norm": 0.9961697459220886, "learning_rate": 2.8785780240073874e-05, "loss": 0.143, "step": 1972 }, { "epoch": 0.8835152532885531, "grad_norm": 0.9268302321434021, "learning_rate": 2.8774238227146817e-05, "loss": 0.1197, "step": 1973 }, { "epoch": 0.8839630562552477, "grad_norm": 1.2642148733139038, "learning_rate": 2.876269621421976e-05, "loss": 0.1186, "step": 1974 }, { "epoch": 0.8844108592219423, "grad_norm": 1.2498232126235962, "learning_rate": 2.875115420129271e-05, "loss": 0.095, "step": 1975 }, { "epoch": 0.884858662188637, "grad_norm": 1.8330223560333252, "learning_rate": 2.873961218836565e-05, "loss": 0.2395, "step": 1976 }, { "epoch": 0.8853064651553316, "grad_norm": 1.2042216062545776, "learning_rate": 2.8728070175438597e-05, "loss": 0.1489, "step": 1977 }, { "epoch": 0.8857542681220263, "grad_norm": 0.9576921463012695, "learning_rate": 2.871652816251154e-05, "loss": 0.0874, "step": 1978 }, { "epoch": 0.886202071088721, "grad_norm": 1.0180977582931519, "learning_rate": 2.870498614958449e-05, "loss": 0.1121, "step": 1979 }, { "epoch": 0.8866498740554156, "grad_norm": 1.520078182220459, "learning_rate": 2.8693444136657432e-05, "loss": 0.2487, "step": 1980 }, { "epoch": 0.8866498740554156, "eval_loss": 0.14374347031116486, "eval_runtime": 1739.5902, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 1980 }, { "epoch": 0.8870976770221103, "grad_norm": 0.7219629883766174, "learning_rate": 2.868190212373038e-05, "loss": 0.0582, "step": 1981 }, { "epoch": 0.887545479988805, "grad_norm": 0.9952398538589478, "learning_rate": 2.8670360110803324e-05, "loss": 0.1541, "step": 1982 }, { "epoch": 0.8879932829554996, "grad_norm": 1.2556064128875732, "learning_rate": 2.8658818097876273e-05, "loss": 0.1759, "step": 1983 }, { "epoch": 0.8884410859221943, "grad_norm": 1.1847312450408936, "learning_rate": 2.8647276084949216e-05, "loss": 0.1542, "step": 1984 }, { "epoch": 0.8888888888888888, "grad_norm": 0.9823407530784607, "learning_rate": 2.8635734072022162e-05, "loss": 0.107, "step": 1985 }, { "epoch": 0.8893366918555835, "grad_norm": 1.1333894729614258, "learning_rate": 2.8624192059095108e-05, "loss": 0.2441, "step": 1986 }, { "epoch": 0.8897844948222782, "grad_norm": 1.157876968383789, "learning_rate": 2.8612650046168054e-05, "loss": 0.163, "step": 1987 }, { "epoch": 0.8902322977889728, "grad_norm": 0.8168591260910034, "learning_rate": 2.8601108033240996e-05, "loss": 0.07, "step": 1988 }, { "epoch": 0.8906801007556675, "grad_norm": 0.9307788014411926, "learning_rate": 2.8589566020313946e-05, "loss": 0.1237, "step": 1989 }, { "epoch": 0.8911279037223622, "grad_norm": 1.2166438102722168, "learning_rate": 2.857802400738689e-05, "loss": 0.1151, "step": 1990 }, { "epoch": 0.8911279037223622, "eval_loss": 0.14140208065509796, "eval_runtime": 1739.3049, "eval_samples_per_second": 2.568, "eval_steps_per_second": 2.568, "step": 1990 }, { "epoch": 0.8915757066890568, "grad_norm": 1.0516357421875, "learning_rate": 2.8566481994459838e-05, "loss": 0.1531, "step": 1991 }, { "epoch": 0.8920235096557515, "grad_norm": 1.1389539241790771, "learning_rate": 2.855493998153278e-05, "loss": 0.0984, "step": 1992 }, { "epoch": 0.8924713126224462, "grad_norm": 1.6578068733215332, "learning_rate": 2.854339796860573e-05, "loss": 0.1922, "step": 1993 }, { "epoch": 0.8929191155891408, "grad_norm": 0.9576694965362549, "learning_rate": 2.8531855955678672e-05, "loss": 0.0826, "step": 1994 }, { "epoch": 0.8933669185558354, "grad_norm": 1.0604113340377808, "learning_rate": 2.852031394275162e-05, "loss": 0.0933, "step": 1995 }, { "epoch": 0.89381472152253, "grad_norm": 0.9682891368865967, "learning_rate": 2.850877192982456e-05, "loss": 0.1203, "step": 1996 }, { "epoch": 0.8942625244892247, "grad_norm": 1.1396582126617432, "learning_rate": 2.849722991689751e-05, "loss": 0.0986, "step": 1997 }, { "epoch": 0.8947103274559194, "grad_norm": 0.9337264895439148, "learning_rate": 2.8485687903970453e-05, "loss": 0.1658, "step": 1998 }, { "epoch": 0.895158130422614, "grad_norm": 1.4096896648406982, "learning_rate": 2.8474145891043396e-05, "loss": 0.1466, "step": 1999 }, { "epoch": 0.8956059333893087, "grad_norm": 1.0767900943756104, "learning_rate": 2.8462603878116345e-05, "loss": 0.1029, "step": 2000 }, { "epoch": 0.8956059333893087, "eval_loss": 0.14072825014591217, "eval_runtime": 1738.5687, "eval_samples_per_second": 2.569, "eval_steps_per_second": 2.569, "step": 2000 }, { "epoch": 0.8960537363560034, "grad_norm": 1.3448169231414795, "learning_rate": 2.8451061865189288e-05, "loss": 0.1451, "step": 2001 }, { "epoch": 0.896501539322698, "grad_norm": 1.0144966840744019, "learning_rate": 2.8439519852262237e-05, "loss": 0.0834, "step": 2002 }, { "epoch": 0.8969493422893927, "grad_norm": 1.3243262767791748, "learning_rate": 2.842797783933518e-05, "loss": 0.2478, "step": 2003 }, { "epoch": 0.8973971452560873, "grad_norm": 1.26291024684906, "learning_rate": 2.841643582640813e-05, "loss": 0.1247, "step": 2004 }, { "epoch": 0.8978449482227819, "grad_norm": 1.1598271131515503, "learning_rate": 2.840489381348107e-05, "loss": 0.1975, "step": 2005 }, { "epoch": 0.8982927511894766, "grad_norm": 0.98848557472229, "learning_rate": 2.8393351800554018e-05, "loss": 0.0948, "step": 2006 }, { "epoch": 0.8987405541561713, "grad_norm": 1.0264403820037842, "learning_rate": 2.838180978762696e-05, "loss": 0.0812, "step": 2007 }, { "epoch": 0.8991883571228659, "grad_norm": 0.7366992831230164, "learning_rate": 2.837026777469991e-05, "loss": 0.0588, "step": 2008 }, { "epoch": 0.8996361600895606, "grad_norm": 0.8434129953384399, "learning_rate": 2.8358725761772852e-05, "loss": 0.0947, "step": 2009 }, { "epoch": 0.9000839630562553, "grad_norm": 0.9951444268226624, "learning_rate": 2.83471837488458e-05, "loss": 0.09, "step": 2010 }, { "epoch": 0.9000839630562553, "eval_loss": 0.13967619836330414, "eval_runtime": 1736.9399, "eval_samples_per_second": 2.572, "eval_steps_per_second": 2.572, "step": 2010 }, { "epoch": 0.9005317660229499, "grad_norm": 1.09131920337677, "learning_rate": 2.8335641735918744e-05, "loss": 0.102, "step": 2011 }, { "epoch": 0.9009795689896446, "grad_norm": 1.2195484638214111, "learning_rate": 2.8324099722991694e-05, "loss": 0.1977, "step": 2012 }, { "epoch": 0.9014273719563393, "grad_norm": 1.3848285675048828, "learning_rate": 2.8312557710064636e-05, "loss": 0.2635, "step": 2013 }, { "epoch": 0.9018751749230338, "grad_norm": 0.9678056240081787, "learning_rate": 2.8301015697137586e-05, "loss": 0.0769, "step": 2014 }, { "epoch": 0.9023229778897285, "grad_norm": 1.0768543481826782, "learning_rate": 2.8289473684210528e-05, "loss": 0.0976, "step": 2015 }, { "epoch": 0.9027707808564231, "grad_norm": 0.7608491778373718, "learning_rate": 2.8277931671283474e-05, "loss": 0.0695, "step": 2016 }, { "epoch": 0.9032185838231178, "grad_norm": 1.0845251083374023, "learning_rate": 2.8266389658356417e-05, "loss": 0.084, "step": 2017 }, { "epoch": 0.9036663867898125, "grad_norm": 1.199765920639038, "learning_rate": 2.8254847645429366e-05, "loss": 0.1161, "step": 2018 }, { "epoch": 0.9041141897565071, "grad_norm": 1.084950566291809, "learning_rate": 2.824330563250231e-05, "loss": 0.1145, "step": 2019 }, { "epoch": 0.9045619927232018, "grad_norm": 0.9587868452072144, "learning_rate": 2.8231763619575258e-05, "loss": 0.1391, "step": 2020 }, { "epoch": 0.9045619927232018, "eval_loss": 0.13916510343551636, "eval_runtime": 1737.3861, "eval_samples_per_second": 2.571, "eval_steps_per_second": 2.571, "step": 2020 }, { "epoch": 0.9050097956898965, "grad_norm": 1.1295528411865234, "learning_rate": 2.82202216066482e-05, "loss": 0.1125, "step": 2021 }, { "epoch": 0.9054575986565911, "grad_norm": 1.32404625415802, "learning_rate": 2.820867959372115e-05, "loss": 0.1223, "step": 2022 }, { "epoch": 0.9059054016232857, "grad_norm": 1.3158620595932007, "learning_rate": 2.8197137580794093e-05, "loss": 0.2674, "step": 2023 }, { "epoch": 0.9063532045899804, "grad_norm": 1.20033860206604, "learning_rate": 2.8185595567867035e-05, "loss": 0.1966, "step": 2024 }, { "epoch": 0.906801007556675, "grad_norm": 0.9327771663665771, "learning_rate": 2.8174053554939985e-05, "loss": 0.117, "step": 2025 }, { "epoch": 0.9072488105233697, "grad_norm": 1.43967866897583, "learning_rate": 2.8162511542012927e-05, "loss": 0.1195, "step": 2026 }, { "epoch": 0.9076966134900644, "grad_norm": 1.1177648305892944, "learning_rate": 2.8150969529085873e-05, "loss": 0.1073, "step": 2027 }, { "epoch": 0.908144416456759, "grad_norm": 0.8828498125076294, "learning_rate": 2.8139427516158816e-05, "loss": 0.1223, "step": 2028 }, { "epoch": 0.9085922194234537, "grad_norm": 1.082963466644287, "learning_rate": 2.8127885503231765e-05, "loss": 0.166, "step": 2029 }, { "epoch": 0.9090400223901484, "grad_norm": 0.9051773548126221, "learning_rate": 2.8116343490304708e-05, "loss": 0.0679, "step": 2030 }, { "epoch": 0.9090400223901484, "eval_loss": 0.13642385601997375, "eval_runtime": 1736.3259, "eval_samples_per_second": 2.573, "eval_steps_per_second": 2.573, "step": 2030 }, { "epoch": 0.909487825356843, "grad_norm": 1.1784230470657349, "learning_rate": 2.8104801477377657e-05, "loss": 0.1645, "step": 2031 }, { "epoch": 0.9099356283235377, "grad_norm": 1.8553470373153687, "learning_rate": 2.80932594644506e-05, "loss": 0.1982, "step": 2032 }, { "epoch": 0.9103834312902322, "grad_norm": 0.9508320093154907, "learning_rate": 2.808171745152355e-05, "loss": 0.0892, "step": 2033 }, { "epoch": 0.9108312342569269, "grad_norm": 1.0444815158843994, "learning_rate": 2.8070175438596492e-05, "loss": 0.1011, "step": 2034 }, { "epoch": 0.9112790372236216, "grad_norm": 1.3620988130569458, "learning_rate": 2.8058633425669438e-05, "loss": 0.2077, "step": 2035 }, { "epoch": 0.9117268401903162, "grad_norm": 1.7878204584121704, "learning_rate": 2.8047091412742384e-05, "loss": 0.1701, "step": 2036 }, { "epoch": 0.9121746431570109, "grad_norm": 0.9353978037834167, "learning_rate": 2.803554939981533e-05, "loss": 0.0899, "step": 2037 }, { "epoch": 0.9126224461237056, "grad_norm": 1.4356045722961426, "learning_rate": 2.8024007386888272e-05, "loss": 0.239, "step": 2038 }, { "epoch": 0.9130702490904002, "grad_norm": 1.3796677589416504, "learning_rate": 2.8012465373961222e-05, "loss": 0.1811, "step": 2039 }, { "epoch": 0.9135180520570949, "grad_norm": 1.1362677812576294, "learning_rate": 2.8000923361034164e-05, "loss": 0.1263, "step": 2040 }, { "epoch": 0.9135180520570949, "eval_loss": 0.13533499836921692, "eval_runtime": 1732.9341, "eval_samples_per_second": 2.578, "eval_steps_per_second": 2.578, "step": 2040 }, { "epoch": 0.9139658550237896, "grad_norm": 1.3013898134231567, "learning_rate": 2.7989381348107114e-05, "loss": 0.1018, "step": 2041 }, { "epoch": 0.9144136579904842, "grad_norm": 1.7660280466079712, "learning_rate": 2.7977839335180056e-05, "loss": 0.2326, "step": 2042 }, { "epoch": 0.9148614609571788, "grad_norm": 1.3348053693771362, "learning_rate": 2.7966297322253006e-05, "loss": 0.2302, "step": 2043 }, { "epoch": 0.9153092639238735, "grad_norm": 1.8742432594299316, "learning_rate": 2.795475530932595e-05, "loss": 0.1871, "step": 2044 }, { "epoch": 0.9157570668905681, "grad_norm": 0.6203439831733704, "learning_rate": 2.7943213296398894e-05, "loss": 0.0832, "step": 2045 }, { "epoch": 0.9162048698572628, "grad_norm": 1.1034528017044067, "learning_rate": 2.7931671283471837e-05, "loss": 0.1393, "step": 2046 }, { "epoch": 0.9166526728239575, "grad_norm": 1.2916886806488037, "learning_rate": 2.7920129270544786e-05, "loss": 0.1032, "step": 2047 }, { "epoch": 0.9171004757906521, "grad_norm": 1.0839390754699707, "learning_rate": 2.790858725761773e-05, "loss": 0.1538, "step": 2048 }, { "epoch": 0.9175482787573468, "grad_norm": 1.3720133304595947, "learning_rate": 2.789704524469067e-05, "loss": 0.206, "step": 2049 }, { "epoch": 0.9179960817240415, "grad_norm": 1.0583562850952148, "learning_rate": 2.788550323176362e-05, "loss": 0.0734, "step": 2050 }, { "epoch": 0.9179960817240415, "eval_loss": 0.13328154385089874, "eval_runtime": 1735.7016, "eval_samples_per_second": 2.574, "eval_steps_per_second": 2.574, "step": 2050 }, { "epoch": 0.9184438846907361, "grad_norm": 1.2125797271728516, "learning_rate": 2.7873961218836564e-05, "loss": 0.0837, "step": 2051 }, { "epoch": 0.9188916876574307, "grad_norm": 1.289035439491272, "learning_rate": 2.7862419205909513e-05, "loss": 0.1494, "step": 2052 }, { "epoch": 0.9193394906241253, "grad_norm": 1.5854977369308472, "learning_rate": 2.7850877192982456e-05, "loss": 0.2638, "step": 2053 }, { "epoch": 0.91978729359082, "grad_norm": 1.4358898401260376, "learning_rate": 2.7839335180055405e-05, "loss": 0.2973, "step": 2054 }, { "epoch": 0.9202350965575147, "grad_norm": 0.9232112169265747, "learning_rate": 2.7827793167128348e-05, "loss": 0.0715, "step": 2055 }, { "epoch": 0.9206828995242093, "grad_norm": 1.1108778715133667, "learning_rate": 2.7816251154201294e-05, "loss": 0.1234, "step": 2056 }, { "epoch": 0.921130702490904, "grad_norm": 1.1772187948226929, "learning_rate": 2.7804709141274236e-05, "loss": 0.1565, "step": 2057 }, { "epoch": 0.9215785054575987, "grad_norm": 1.3795710802078247, "learning_rate": 2.7793167128347186e-05, "loss": 0.161, "step": 2058 }, { "epoch": 0.9220263084242933, "grad_norm": 1.0228571891784668, "learning_rate": 2.7781625115420128e-05, "loss": 0.1555, "step": 2059 }, { "epoch": 0.922474111390988, "grad_norm": 0.939876139163971, "learning_rate": 2.7770083102493078e-05, "loss": 0.1374, "step": 2060 }, { "epoch": 0.922474111390988, "eval_loss": 0.13193786144256592, "eval_runtime": 1733.2853, "eval_samples_per_second": 2.577, "eval_steps_per_second": 2.577, "step": 2060 }, { "epoch": 0.9229219143576827, "grad_norm": 1.0626474618911743, "learning_rate": 2.775854108956602e-05, "loss": 0.0856, "step": 2061 }, { "epoch": 0.9233697173243772, "grad_norm": 1.2270374298095703, "learning_rate": 2.774699907663897e-05, "loss": 0.1045, "step": 2062 }, { "epoch": 0.9238175202910719, "grad_norm": 1.1409982442855835, "learning_rate": 2.7735457063711912e-05, "loss": 0.161, "step": 2063 }, { "epoch": 0.9242653232577666, "grad_norm": 1.3213545083999634, "learning_rate": 2.772391505078486e-05, "loss": 0.2275, "step": 2064 }, { "epoch": 0.9247131262244612, "grad_norm": 0.8315839767456055, "learning_rate": 2.7712373037857804e-05, "loss": 0.1097, "step": 2065 }, { "epoch": 0.9251609291911559, "grad_norm": 1.038310170173645, "learning_rate": 2.770083102493075e-05, "loss": 0.1428, "step": 2066 }, { "epoch": 0.9256087321578506, "grad_norm": 1.3389686346054077, "learning_rate": 2.7689289012003693e-05, "loss": 0.1982, "step": 2067 }, { "epoch": 0.9260565351245452, "grad_norm": 1.168298363685608, "learning_rate": 2.7677746999076642e-05, "loss": 0.1804, "step": 2068 }, { "epoch": 0.9265043380912399, "grad_norm": 0.9898531436920166, "learning_rate": 2.7666204986149585e-05, "loss": 0.0915, "step": 2069 }, { "epoch": 0.9269521410579346, "grad_norm": 0.9529014825820923, "learning_rate": 2.7654662973222534e-05, "loss": 0.0745, "step": 2070 }, { "epoch": 0.9269521410579346, "eval_loss": 0.12939994037151337, "eval_runtime": 1728.0102, "eval_samples_per_second": 2.585, "eval_steps_per_second": 2.585, "step": 2070 }, { "epoch": 0.9273999440246291, "grad_norm": 1.0990973711013794, "learning_rate": 2.7643120960295477e-05, "loss": 0.1563, "step": 2071 }, { "epoch": 0.9278477469913238, "grad_norm": 1.4120893478393555, "learning_rate": 2.7631578947368426e-05, "loss": 0.1955, "step": 2072 }, { "epoch": 0.9282955499580184, "grad_norm": 1.8996490240097046, "learning_rate": 2.762003693444137e-05, "loss": 0.1389, "step": 2073 }, { "epoch": 0.9287433529247131, "grad_norm": 1.2464399337768555, "learning_rate": 2.760849492151431e-05, "loss": 0.12, "step": 2074 }, { "epoch": 0.9291911558914078, "grad_norm": 1.1876400709152222, "learning_rate": 2.759695290858726e-05, "loss": 0.0893, "step": 2075 }, { "epoch": 0.9296389588581024, "grad_norm": 1.5894993543624878, "learning_rate": 2.7585410895660203e-05, "loss": 0.1151, "step": 2076 }, { "epoch": 0.9300867618247971, "grad_norm": 1.2473312616348267, "learning_rate": 2.757386888273315e-05, "loss": 0.0991, "step": 2077 }, { "epoch": 0.9305345647914918, "grad_norm": 0.9199130535125732, "learning_rate": 2.7562326869806092e-05, "loss": 0.0805, "step": 2078 }, { "epoch": 0.9309823677581864, "grad_norm": 1.4169944524765015, "learning_rate": 2.755078485687904e-05, "loss": 0.1683, "step": 2079 }, { "epoch": 0.9314301707248811, "grad_norm": 1.1549464464187622, "learning_rate": 2.7539242843951984e-05, "loss": 0.1183, "step": 2080 }, { "epoch": 0.9314301707248811, "eval_loss": 0.12832778692245483, "eval_runtime": 1728.5516, "eval_samples_per_second": 2.584, "eval_steps_per_second": 2.584, "step": 2080 }, { "epoch": 0.9318779736915757, "grad_norm": 1.8567004203796387, "learning_rate": 2.7527700831024933e-05, "loss": 0.2219, "step": 2081 }, { "epoch": 0.9323257766582703, "grad_norm": 0.8522270321846008, "learning_rate": 2.7516158818097876e-05, "loss": 0.0656, "step": 2082 }, { "epoch": 0.932773579624965, "grad_norm": 0.920242190361023, "learning_rate": 2.7504616805170825e-05, "loss": 0.0794, "step": 2083 }, { "epoch": 0.9332213825916597, "grad_norm": 1.5621031522750854, "learning_rate": 2.7493074792243768e-05, "loss": 0.1228, "step": 2084 }, { "epoch": 0.9336691855583543, "grad_norm": 1.3902122974395752, "learning_rate": 2.7481532779316714e-05, "loss": 0.1225, "step": 2085 }, { "epoch": 0.934116988525049, "grad_norm": 0.8981521725654602, "learning_rate": 2.746999076638966e-05, "loss": 0.1141, "step": 2086 }, { "epoch": 0.9345647914917437, "grad_norm": 1.0861537456512451, "learning_rate": 2.7458448753462606e-05, "loss": 0.0923, "step": 2087 }, { "epoch": 0.9350125944584383, "grad_norm": 0.641179084777832, "learning_rate": 2.744690674053555e-05, "loss": 0.0678, "step": 2088 }, { "epoch": 0.935460397425133, "grad_norm": 1.2074898481369019, "learning_rate": 2.7435364727608498e-05, "loss": 0.1375, "step": 2089 }, { "epoch": 0.9359082003918276, "grad_norm": 0.8622816801071167, "learning_rate": 2.742382271468144e-05, "loss": 0.1091, "step": 2090 }, { "epoch": 0.9359082003918276, "eval_loss": 0.12745097279548645, "eval_runtime": 1733.2884, "eval_samples_per_second": 2.577, "eval_steps_per_second": 2.577, "step": 2090 }, { "epoch": 0.9363560033585222, "grad_norm": 1.2994965314865112, "learning_rate": 2.741228070175439e-05, "loss": 0.1355, "step": 2091 }, { "epoch": 0.9368038063252169, "grad_norm": 0.9833651185035706, "learning_rate": 2.7400738688827332e-05, "loss": 0.0922, "step": 2092 }, { "epoch": 0.9372516092919115, "grad_norm": 0.9704849123954773, "learning_rate": 2.7389196675900282e-05, "loss": 0.0962, "step": 2093 }, { "epoch": 0.9376994122586062, "grad_norm": 0.9272742867469788, "learning_rate": 2.7377654662973224e-05, "loss": 0.0572, "step": 2094 }, { "epoch": 0.9381472152253009, "grad_norm": 0.8162627816200256, "learning_rate": 2.736611265004617e-05, "loss": 0.0705, "step": 2095 }, { "epoch": 0.9385950181919955, "grad_norm": 1.1083530187606812, "learning_rate": 2.7354570637119113e-05, "loss": 0.1832, "step": 2096 }, { "epoch": 0.9390428211586902, "grad_norm": 0.935386061668396, "learning_rate": 2.7343028624192062e-05, "loss": 0.1217, "step": 2097 }, { "epoch": 0.9394906241253849, "grad_norm": 1.1456583738327026, "learning_rate": 2.7331486611265005e-05, "loss": 0.0749, "step": 2098 }, { "epoch": 0.9399384270920795, "grad_norm": 1.2562615871429443, "learning_rate": 2.7319944598337948e-05, "loss": 0.1562, "step": 2099 }, { "epoch": 0.9403862300587741, "grad_norm": 1.1288965940475464, "learning_rate": 2.7308402585410897e-05, "loss": 0.0863, "step": 2100 }, { "epoch": 0.9403862300587741, "eval_loss": 0.1265988051891327, "eval_runtime": 1736.0228, "eval_samples_per_second": 2.573, "eval_steps_per_second": 2.573, "step": 2100 }, { "epoch": 0.9408340330254688, "grad_norm": 1.0694856643676758, "learning_rate": 2.729686057248384e-05, "loss": 0.1046, "step": 2101 }, { "epoch": 0.9412818359921634, "grad_norm": 0.8078300356864929, "learning_rate": 2.728531855955679e-05, "loss": 0.067, "step": 2102 }, { "epoch": 0.9417296389588581, "grad_norm": 1.4082826375961304, "learning_rate": 2.727377654662973e-05, "loss": 0.2408, "step": 2103 }, { "epoch": 0.9421774419255527, "grad_norm": 1.0101279020309448, "learning_rate": 2.726223453370268e-05, "loss": 0.1091, "step": 2104 }, { "epoch": 0.9426252448922474, "grad_norm": 0.7255677580833435, "learning_rate": 2.7250692520775624e-05, "loss": 0.0572, "step": 2105 }, { "epoch": 0.9430730478589421, "grad_norm": 1.0651483535766602, "learning_rate": 2.723915050784857e-05, "loss": 0.0956, "step": 2106 }, { "epoch": 0.9435208508256367, "grad_norm": 1.2711114883422852, "learning_rate": 2.7227608494921512e-05, "loss": 0.1856, "step": 2107 }, { "epoch": 0.9439686537923314, "grad_norm": 1.3673771619796753, "learning_rate": 2.721606648199446e-05, "loss": 0.1094, "step": 2108 }, { "epoch": 0.9444164567590261, "grad_norm": 0.7726939916610718, "learning_rate": 2.7204524469067404e-05, "loss": 0.0995, "step": 2109 }, { "epoch": 0.9448642597257206, "grad_norm": 0.930659294128418, "learning_rate": 2.7192982456140354e-05, "loss": 0.1202, "step": 2110 }, { "epoch": 0.9448642597257206, "eval_loss": 0.1258155256509781, "eval_runtime": 1731.0481, "eval_samples_per_second": 2.581, "eval_steps_per_second": 2.581, "step": 2110 }, { "epoch": 0.9453120626924153, "grad_norm": 0.9811476469039917, "learning_rate": 2.7181440443213296e-05, "loss": 0.124, "step": 2111 }, { "epoch": 0.94575986565911, "grad_norm": 1.239196538925171, "learning_rate": 2.7169898430286246e-05, "loss": 0.1605, "step": 2112 }, { "epoch": 0.9462076686258046, "grad_norm": 1.453579306602478, "learning_rate": 2.7158356417359188e-05, "loss": 0.1452, "step": 2113 }, { "epoch": 0.9466554715924993, "grad_norm": 0.9727533459663391, "learning_rate": 2.7146814404432138e-05, "loss": 0.0827, "step": 2114 }, { "epoch": 0.947103274559194, "grad_norm": 0.8124465346336365, "learning_rate": 2.713527239150508e-05, "loss": 0.0863, "step": 2115 }, { "epoch": 0.9475510775258886, "grad_norm": 0.8164709806442261, "learning_rate": 2.7123730378578026e-05, "loss": 0.0735, "step": 2116 }, { "epoch": 0.9479988804925833, "grad_norm": 1.3132597208023071, "learning_rate": 2.711218836565097e-05, "loss": 0.1077, "step": 2117 }, { "epoch": 0.948446683459278, "grad_norm": 1.138953685760498, "learning_rate": 2.7100646352723918e-05, "loss": 0.1137, "step": 2118 }, { "epoch": 0.9488944864259726, "grad_norm": 1.3828215599060059, "learning_rate": 2.708910433979686e-05, "loss": 0.0948, "step": 2119 }, { "epoch": 0.9493422893926672, "grad_norm": 0.6844764947891235, "learning_rate": 2.707756232686981e-05, "loss": 0.0961, "step": 2120 }, { "epoch": 0.9493422893926672, "eval_loss": 0.1241694986820221, "eval_runtime": 1734.9872, "eval_samples_per_second": 2.575, "eval_steps_per_second": 2.575, "step": 2120 }, { "epoch": 0.9497900923593618, "grad_norm": 0.9134883284568787, "learning_rate": 2.7066020313942753e-05, "loss": 0.0868, "step": 2121 }, { "epoch": 0.9502378953260565, "grad_norm": 1.5742448568344116, "learning_rate": 2.7054478301015702e-05, "loss": 0.2265, "step": 2122 }, { "epoch": 0.9506856982927512, "grad_norm": 1.5208790302276611, "learning_rate": 2.7042936288088645e-05, "loss": 0.2737, "step": 2123 }, { "epoch": 0.9511335012594458, "grad_norm": 1.644930124282837, "learning_rate": 2.7031394275161587e-05, "loss": 0.1309, "step": 2124 }, { "epoch": 0.9515813042261405, "grad_norm": 0.7968838810920715, "learning_rate": 2.7019852262234537e-05, "loss": 0.0825, "step": 2125 }, { "epoch": 0.9520291071928352, "grad_norm": 1.451269507408142, "learning_rate": 2.700831024930748e-05, "loss": 0.1362, "step": 2126 }, { "epoch": 0.9524769101595298, "grad_norm": 1.1176960468292236, "learning_rate": 2.6996768236380425e-05, "loss": 0.1107, "step": 2127 }, { "epoch": 0.9529247131262245, "grad_norm": 0.9091848731040955, "learning_rate": 2.6985226223453368e-05, "loss": 0.1113, "step": 2128 }, { "epoch": 0.9533725160929191, "grad_norm": 1.2914046049118042, "learning_rate": 2.6973684210526317e-05, "loss": 0.1142, "step": 2129 }, { "epoch": 0.9538203190596137, "grad_norm": 0.965366005897522, "learning_rate": 2.696214219759926e-05, "loss": 0.1009, "step": 2130 }, { "epoch": 0.9538203190596137, "eval_loss": 0.1229698657989502, "eval_runtime": 1728.6065, "eval_samples_per_second": 2.584, "eval_steps_per_second": 2.584, "step": 2130 }, { "epoch": 0.9542681220263084, "grad_norm": 1.3068053722381592, "learning_rate": 2.695060018467221e-05, "loss": 0.1627, "step": 2131 }, { "epoch": 0.9547159249930031, "grad_norm": 1.0245745182037354, "learning_rate": 2.6939058171745152e-05, "loss": 0.0736, "step": 2132 }, { "epoch": 0.9551637279596977, "grad_norm": 0.8566816449165344, "learning_rate": 2.69275161588181e-05, "loss": 0.1103, "step": 2133 }, { "epoch": 0.9556115309263924, "grad_norm": 0.952111005783081, "learning_rate": 2.6915974145891044e-05, "loss": 0.09, "step": 2134 }, { "epoch": 0.9560593338930871, "grad_norm": 1.0342718362808228, "learning_rate": 2.690443213296399e-05, "loss": 0.0958, "step": 2135 }, { "epoch": 0.9565071368597817, "grad_norm": 1.3856199979782104, "learning_rate": 2.6892890120036936e-05, "loss": 0.1149, "step": 2136 }, { "epoch": 0.9569549398264764, "grad_norm": 1.2831979990005493, "learning_rate": 2.6881348107109882e-05, "loss": 0.1712, "step": 2137 }, { "epoch": 0.9574027427931711, "grad_norm": 1.151147484779358, "learning_rate": 2.6869806094182825e-05, "loss": 0.106, "step": 2138 }, { "epoch": 0.9578505457598656, "grad_norm": 1.2741520404815674, "learning_rate": 2.6858264081255774e-05, "loss": 0.1151, "step": 2139 }, { "epoch": 0.9582983487265603, "grad_norm": 0.9963933229446411, "learning_rate": 2.6846722068328717e-05, "loss": 0.1024, "step": 2140 }, { "epoch": 0.9582983487265603, "eval_loss": 0.12240637838840485, "eval_runtime": 1732.9445, "eval_samples_per_second": 2.578, "eval_steps_per_second": 2.578, "step": 2140 }, { "epoch": 0.9587461516932549, "grad_norm": 1.1386899948120117, "learning_rate": 2.6835180055401666e-05, "loss": 0.0772, "step": 2141 }, { "epoch": 0.9591939546599496, "grad_norm": 0.8593908548355103, "learning_rate": 2.682363804247461e-05, "loss": 0.0883, "step": 2142 }, { "epoch": 0.9596417576266443, "grad_norm": 1.1071313619613647, "learning_rate": 2.6812096029547558e-05, "loss": 0.125, "step": 2143 }, { "epoch": 0.9600895605933389, "grad_norm": 0.6254713535308838, "learning_rate": 2.68005540166205e-05, "loss": 0.0546, "step": 2144 }, { "epoch": 0.9605373635600336, "grad_norm": 0.7606832981109619, "learning_rate": 2.6789012003693446e-05, "loss": 0.0801, "step": 2145 }, { "epoch": 0.9609851665267283, "grad_norm": 1.3213703632354736, "learning_rate": 2.677746999076639e-05, "loss": 0.1557, "step": 2146 }, { "epoch": 0.9614329694934229, "grad_norm": 1.2782256603240967, "learning_rate": 2.676592797783934e-05, "loss": 0.1, "step": 2147 }, { "epoch": 0.9618807724601175, "grad_norm": 0.9655989408493042, "learning_rate": 2.675438596491228e-05, "loss": 0.1161, "step": 2148 }, { "epoch": 0.9623285754268122, "grad_norm": 0.8622291088104248, "learning_rate": 2.6742843951985224e-05, "loss": 0.0622, "step": 2149 }, { "epoch": 0.9627763783935068, "grad_norm": 1.0358092784881592, "learning_rate": 2.6731301939058173e-05, "loss": 0.1415, "step": 2150 }, { "epoch": 0.9627763783935068, "eval_loss": 0.12110047042369843, "eval_runtime": 1729.86, "eval_samples_per_second": 2.582, "eval_steps_per_second": 2.582, "step": 2150 }, { "epoch": 0.9632241813602015, "grad_norm": 1.3379656076431274, "learning_rate": 2.6719759926131116e-05, "loss": 0.1732, "step": 2151 }, { "epoch": 0.9636719843268962, "grad_norm": 1.2671988010406494, "learning_rate": 2.6708217913204065e-05, "loss": 0.1829, "step": 2152 }, { "epoch": 0.9641197872935908, "grad_norm": 1.749251365661621, "learning_rate": 2.6696675900277008e-05, "loss": 0.2139, "step": 2153 }, { "epoch": 0.9645675902602855, "grad_norm": 0.9710353016853333, "learning_rate": 2.6685133887349957e-05, "loss": 0.0672, "step": 2154 }, { "epoch": 0.9650153932269802, "grad_norm": 1.379391074180603, "learning_rate": 2.66735918744229e-05, "loss": 0.2084, "step": 2155 }, { "epoch": 0.9654631961936748, "grad_norm": 0.9271833896636963, "learning_rate": 2.6662049861495846e-05, "loss": 0.0758, "step": 2156 }, { "epoch": 0.9659109991603695, "grad_norm": 1.5926554203033447, "learning_rate": 2.6650507848568788e-05, "loss": 0.347, "step": 2157 }, { "epoch": 0.966358802127064, "grad_norm": 0.7312039732933044, "learning_rate": 2.6638965835641738e-05, "loss": 0.0647, "step": 2158 }, { "epoch": 0.9668066050937587, "grad_norm": 0.9427241683006287, "learning_rate": 2.662742382271468e-05, "loss": 0.0663, "step": 2159 }, { "epoch": 0.9672544080604534, "grad_norm": 1.3281080722808838, "learning_rate": 2.661588180978763e-05, "loss": 0.161, "step": 2160 }, { "epoch": 0.9672544080604534, "eval_loss": 0.11987407505512238, "eval_runtime": 1729.868, "eval_samples_per_second": 2.582, "eval_steps_per_second": 2.582, "step": 2160 }, { "epoch": 0.967702211027148, "grad_norm": 1.2127536535263062, "learning_rate": 2.6604339796860572e-05, "loss": 0.1544, "step": 2161 }, { "epoch": 0.9681500139938427, "grad_norm": 1.1174790859222412, "learning_rate": 2.659279778393352e-05, "loss": 0.0893, "step": 2162 }, { "epoch": 0.9685978169605374, "grad_norm": 1.026856780052185, "learning_rate": 2.6581255771006464e-05, "loss": 0.096, "step": 2163 }, { "epoch": 0.969045619927232, "grad_norm": 1.323331356048584, "learning_rate": 2.6569713758079414e-05, "loss": 0.2058, "step": 2164 }, { "epoch": 0.9694934228939267, "grad_norm": 1.1892203092575073, "learning_rate": 2.6558171745152356e-05, "loss": 0.0926, "step": 2165 }, { "epoch": 0.9699412258606214, "grad_norm": 0.855096161365509, "learning_rate": 2.6546629732225302e-05, "loss": 0.0613, "step": 2166 }, { "epoch": 0.970389028827316, "grad_norm": 1.0742472410202026, "learning_rate": 2.6535087719298245e-05, "loss": 0.084, "step": 2167 }, { "epoch": 0.9708368317940106, "grad_norm": 0.9125453233718872, "learning_rate": 2.6523545706371194e-05, "loss": 0.0604, "step": 2168 }, { "epoch": 0.9712846347607053, "grad_norm": 1.4418150186538696, "learning_rate": 2.6512003693444137e-05, "loss": 0.1148, "step": 2169 }, { "epoch": 0.9717324377273999, "grad_norm": 0.7091181874275208, "learning_rate": 2.6500461680517086e-05, "loss": 0.0828, "step": 2170 }, { "epoch": 0.9717324377273999, "eval_loss": 0.11846964806318283, "eval_runtime": 1731.7718, "eval_samples_per_second": 2.579, "eval_steps_per_second": 2.579, "step": 2170 }, { "epoch": 0.9721802406940946, "grad_norm": 0.848183274269104, "learning_rate": 2.648891966759003e-05, "loss": 0.0814, "step": 2171 }, { "epoch": 0.9726280436607893, "grad_norm": 1.014132022857666, "learning_rate": 2.6477377654662978e-05, "loss": 0.0882, "step": 2172 }, { "epoch": 0.9730758466274839, "grad_norm": 0.807933509349823, "learning_rate": 2.646583564173592e-05, "loss": 0.0792, "step": 2173 }, { "epoch": 0.9735236495941786, "grad_norm": 1.0491405725479126, "learning_rate": 2.6454293628808867e-05, "loss": 0.0683, "step": 2174 }, { "epoch": 0.9739714525608733, "grad_norm": 1.0016448497772217, "learning_rate": 2.6442751615881813e-05, "loss": 0.0677, "step": 2175 }, { "epoch": 0.9744192555275679, "grad_norm": 1.008288025856018, "learning_rate": 2.6431209602954755e-05, "loss": 0.0913, "step": 2176 }, { "epoch": 0.9748670584942625, "grad_norm": 0.911280632019043, "learning_rate": 2.64196675900277e-05, "loss": 0.0986, "step": 2177 }, { "epoch": 0.9753148614609571, "grad_norm": 1.6649196147918701, "learning_rate": 2.6408125577100644e-05, "loss": 0.1501, "step": 2178 }, { "epoch": 0.9757626644276518, "grad_norm": 1.039286494255066, "learning_rate": 2.6396583564173593e-05, "loss": 0.0721, "step": 2179 }, { "epoch": 0.9762104673943465, "grad_norm": 1.164796233177185, "learning_rate": 2.6385041551246536e-05, "loss": 0.1483, "step": 2180 }, { "epoch": 0.9762104673943465, "eval_loss": 0.11873579770326614, "eval_runtime": 1728.0248, "eval_samples_per_second": 2.585, "eval_steps_per_second": 2.585, "step": 2180 }, { "epoch": 0.9766582703610411, "grad_norm": 1.7558592557907104, "learning_rate": 2.6373499538319485e-05, "loss": 0.2055, "step": 2181 }, { "epoch": 0.9771060733277358, "grad_norm": 1.5141433477401733, "learning_rate": 2.6361957525392428e-05, "loss": 0.1019, "step": 2182 }, { "epoch": 0.9775538762944305, "grad_norm": 1.5504546165466309, "learning_rate": 2.6350415512465377e-05, "loss": 0.1048, "step": 2183 }, { "epoch": 0.9780016792611251, "grad_norm": 0.9713227152824402, "learning_rate": 2.633887349953832e-05, "loss": 0.0762, "step": 2184 }, { "epoch": 0.9784494822278198, "grad_norm": 1.3416686058044434, "learning_rate": 2.6327331486611266e-05, "loss": 0.1602, "step": 2185 }, { "epoch": 0.9788972851945145, "grad_norm": 0.9782314896583557, "learning_rate": 2.6315789473684212e-05, "loss": 0.1174, "step": 2186 }, { "epoch": 0.979345088161209, "grad_norm": 0.8174190521240234, "learning_rate": 2.6304247460757158e-05, "loss": 0.0682, "step": 2187 }, { "epoch": 0.9797928911279037, "grad_norm": 1.2250410318374634, "learning_rate": 2.62927054478301e-05, "loss": 0.0933, "step": 2188 }, { "epoch": 0.9802406940945984, "grad_norm": 1.0255568027496338, "learning_rate": 2.628116343490305e-05, "loss": 0.0666, "step": 2189 }, { "epoch": 0.980688497061293, "grad_norm": 1.3774869441986084, "learning_rate": 2.6269621421975993e-05, "loss": 0.1049, "step": 2190 }, { "epoch": 0.980688497061293, "eval_loss": 0.11641395092010498, "eval_runtime": 1733.033, "eval_samples_per_second": 2.578, "eval_steps_per_second": 2.578, "step": 2190 }, { "epoch": 0.9811363000279877, "grad_norm": 0.9319576025009155, "learning_rate": 2.6258079409048942e-05, "loss": 0.1294, "step": 2191 }, { "epoch": 0.9815841029946824, "grad_norm": 0.9967508316040039, "learning_rate": 2.6246537396121885e-05, "loss": 0.1042, "step": 2192 }, { "epoch": 0.982031905961377, "grad_norm": 1.1014225482940674, "learning_rate": 2.6234995383194834e-05, "loss": 0.0992, "step": 2193 }, { "epoch": 0.9824797089280717, "grad_norm": 1.0176576375961304, "learning_rate": 2.6223453370267777e-05, "loss": 0.0895, "step": 2194 }, { "epoch": 0.9829275118947663, "grad_norm": 1.006609559059143, "learning_rate": 2.6211911357340723e-05, "loss": 0.1612, "step": 2195 }, { "epoch": 0.9833753148614609, "grad_norm": 0.9863540530204773, "learning_rate": 2.6200369344413665e-05, "loss": 0.0949, "step": 2196 }, { "epoch": 0.9838231178281556, "grad_norm": 0.6519593596458435, "learning_rate": 2.6188827331486615e-05, "loss": 0.2872, "step": 2197 }, { "epoch": 0.9842709207948502, "grad_norm": 0.6476958394050598, "learning_rate": 2.6177285318559557e-05, "loss": 0.0532, "step": 2198 }, { "epoch": 0.9847187237615449, "grad_norm": 1.404766321182251, "learning_rate": 2.6165743305632507e-05, "loss": 0.1264, "step": 2199 }, { "epoch": 0.9851665267282396, "grad_norm": 0.9817789196968079, "learning_rate": 2.615420129270545e-05, "loss": 0.1291, "step": 2200 }, { "epoch": 0.9851665267282396, "eval_loss": 0.1149076521396637, "eval_runtime": 1727.9785, "eval_samples_per_second": 2.585, "eval_steps_per_second": 2.585, "step": 2200 } ], "logging_steps": 1, "max_steps": 4466, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.476867796544799e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }