{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5998188640186312, "eval_steps": 2318, "global_step": 13908, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009919351360676242, "grad_norm": 0.91796875, "learning_rate": 0.001, "loss": 9.0124, "step": 23 }, { "epoch": 0.0019838702721352484, "grad_norm": 0.1376953125, "learning_rate": 0.0009990070799516491, "loss": 4.9678, "step": 46 }, { "epoch": 0.0029758054082028723, "grad_norm": 0.10400390625, "learning_rate": 0.0009980141599032983, "loss": 4.8546, "step": 69 }, { "epoch": 0.003967740544270497, "grad_norm": 0.2109375, "learning_rate": 0.0009970212398549474, "loss": 4.7284, "step": 92 }, { "epoch": 0.00495967568033812, "grad_norm": 0.201171875, "learning_rate": 0.0009960283198065965, "loss": 4.5063, "step": 115 }, { "epoch": 0.005951610816405745, "grad_norm": 0.302734375, "learning_rate": 0.0009950353997582456, "loss": 4.2107, "step": 138 }, { "epoch": 0.006943545952473369, "grad_norm": 0.2314453125, "learning_rate": 0.0009940424797098947, "loss": 4.0923, "step": 161 }, { "epoch": 0.007935481088540993, "grad_norm": 0.310546875, "learning_rate": 0.0009930495596615439, "loss": 3.7695, "step": 184 }, { "epoch": 0.008927416224608617, "grad_norm": 0.30078125, "learning_rate": 0.000992056639613193, "loss": 3.6712, "step": 207 }, { "epoch": 0.00991935136067624, "grad_norm": 0.322265625, "learning_rate": 0.0009910637195648419, "loss": 3.5194, "step": 230 }, { "epoch": 0.010911286496743866, "grad_norm": 0.314453125, "learning_rate": 0.0009900707995164912, "loss": 3.4792, "step": 253 }, { "epoch": 0.01190322163281149, "grad_norm": 0.296875, "learning_rate": 0.0009890778794681401, "loss": 3.3372, "step": 276 }, { "epoch": 0.012895156768879113, "grad_norm": 0.310546875, "learning_rate": 0.0009880849594197895, "loss": 3.2479, "step": 299 }, { "epoch": 0.013887091904946738, "grad_norm": 0.34765625, "learning_rate": 0.0009870920393714384, "loss": 3.2265, "step": 322 }, { "epoch": 0.014879027041014362, "grad_norm": 0.3515625, "learning_rate": 0.0009860991193230877, "loss": 3.1277, "step": 345 }, { "epoch": 0.015870962177081987, "grad_norm": 0.2578125, "learning_rate": 0.0009851061992747366, "loss": 3.0156, "step": 368 }, { "epoch": 0.01686289731314961, "grad_norm": 0.2578125, "learning_rate": 0.0009841132792263857, "loss": 2.9304, "step": 391 }, { "epoch": 0.017854832449217234, "grad_norm": 0.486328125, "learning_rate": 0.0009831203591780348, "loss": 2.9364, "step": 414 }, { "epoch": 0.01884676758528486, "grad_norm": 0.37890625, "learning_rate": 0.000982127439129684, "loss": 2.8848, "step": 437 }, { "epoch": 0.01983870272135248, "grad_norm": 0.28125, "learning_rate": 0.000981134519081333, "loss": 2.8368, "step": 460 }, { "epoch": 0.020830637857420106, "grad_norm": 0.310546875, "learning_rate": 0.0009801415990329822, "loss": 2.8024, "step": 483 }, { "epoch": 0.02182257299348773, "grad_norm": 0.498046875, "learning_rate": 0.0009791486789846313, "loss": 2.7152, "step": 506 }, { "epoch": 0.022814508129555353, "grad_norm": 0.357421875, "learning_rate": 0.0009781557589362804, "loss": 2.6818, "step": 529 }, { "epoch": 0.02380644326562298, "grad_norm": 0.353515625, "learning_rate": 0.0009771628388879296, "loss": 2.6625, "step": 552 }, { "epoch": 0.024798378401690604, "grad_norm": 0.390625, "learning_rate": 0.0009761699188395787, "loss": 2.5967, "step": 575 }, { "epoch": 0.025790313537758226, "grad_norm": 0.427734375, "learning_rate": 0.0009751769987912278, "loss": 2.541, "step": 598 }, { "epoch": 0.02678224867382585, "grad_norm": 0.314453125, "learning_rate": 0.0009741840787428768, "loss": 2.5455, "step": 621 }, { "epoch": 0.027774183809893476, "grad_norm": 0.302734375, "learning_rate": 0.000973191158694526, "loss": 2.5566, "step": 644 }, { "epoch": 0.028766118945961098, "grad_norm": 0.357421875, "learning_rate": 0.000972198238646175, "loss": 2.4318, "step": 667 }, { "epoch": 0.029758054082028723, "grad_norm": 0.3125, "learning_rate": 0.0009712053185978243, "loss": 2.4449, "step": 690 }, { "epoch": 0.03074998921809635, "grad_norm": 0.302734375, "learning_rate": 0.0009702123985494733, "loss": 2.4232, "step": 713 }, { "epoch": 0.031741924354163974, "grad_norm": 0.53125, "learning_rate": 0.0009692194785011224, "loss": 2.3352, "step": 736 }, { "epoch": 0.032733859490231595, "grad_norm": 0.30859375, "learning_rate": 0.0009682265584527715, "loss": 2.332, "step": 759 }, { "epoch": 0.03372579462629922, "grad_norm": 0.42578125, "learning_rate": 0.0009672336384044207, "loss": 2.3185, "step": 782 }, { "epoch": 0.034717729762366846, "grad_norm": 0.314453125, "learning_rate": 0.0009662407183560698, "loss": 2.2945, "step": 805 }, { "epoch": 0.03570966489843447, "grad_norm": 0.3203125, "learning_rate": 0.0009652477983077189, "loss": 2.2821, "step": 828 }, { "epoch": 0.03670160003450209, "grad_norm": 0.265625, "learning_rate": 0.000964254878259368, "loss": 2.2184, "step": 851 }, { "epoch": 0.03769353517056972, "grad_norm": 0.40234375, "learning_rate": 0.0009632619582110171, "loss": 2.2383, "step": 874 }, { "epoch": 0.03868547030663734, "grad_norm": 0.30859375, "learning_rate": 0.0009622690381626662, "loss": 2.1755, "step": 897 }, { "epoch": 0.03967740544270496, "grad_norm": 0.3203125, "learning_rate": 0.0009612761181143154, "loss": 2.1569, "step": 920 }, { "epoch": 0.04066934057877259, "grad_norm": 0.314453125, "learning_rate": 0.0009602831980659644, "loss": 2.1272, "step": 943 }, { "epoch": 0.04166127571484021, "grad_norm": 0.54296875, "learning_rate": 0.0009592902780176136, "loss": 2.1358, "step": 966 }, { "epoch": 0.042653210850907834, "grad_norm": 0.279296875, "learning_rate": 0.0009582973579692626, "loss": 2.1388, "step": 989 }, { "epoch": 0.04364514598697546, "grad_norm": 0.376953125, "learning_rate": 0.0009573044379209118, "loss": 2.0937, "step": 1012 }, { "epoch": 0.044637081123043085, "grad_norm": 0.4140625, "learning_rate": 0.0009563115178725609, "loss": 2.0818, "step": 1035 }, { "epoch": 0.045629016259110707, "grad_norm": 0.384765625, "learning_rate": 0.0009553185978242101, "loss": 2.0259, "step": 1058 }, { "epoch": 0.046620951395178335, "grad_norm": 0.3359375, "learning_rate": 0.0009543256777758591, "loss": 2.0098, "step": 1081 }, { "epoch": 0.04761288653124596, "grad_norm": 0.30078125, "learning_rate": 0.0009533327577275082, "loss": 2.0276, "step": 1104 }, { "epoch": 0.04860482166731358, "grad_norm": 0.3046875, "learning_rate": 0.0009523398376791573, "loss": 2.0285, "step": 1127 }, { "epoch": 0.04959675680338121, "grad_norm": 0.306640625, "learning_rate": 0.0009513469176308065, "loss": 1.9488, "step": 1150 }, { "epoch": 0.05058869193944883, "grad_norm": 0.32421875, "learning_rate": 0.0009503539975824556, "loss": 1.9922, "step": 1173 }, { "epoch": 0.05158062707551645, "grad_norm": 0.267578125, "learning_rate": 0.0009493610775341046, "loss": 1.9759, "step": 1196 }, { "epoch": 0.05257256221158408, "grad_norm": 0.41015625, "learning_rate": 0.0009483681574857538, "loss": 1.93, "step": 1219 }, { "epoch": 0.0535644973476517, "grad_norm": 0.36328125, "learning_rate": 0.0009473752374374028, "loss": 1.9264, "step": 1242 }, { "epoch": 0.05455643248371932, "grad_norm": 0.25390625, "learning_rate": 0.0009463823173890521, "loss": 1.8782, "step": 1265 }, { "epoch": 0.05554836761978695, "grad_norm": 0.51171875, "learning_rate": 0.0009453893973407011, "loss": 1.9079, "step": 1288 }, { "epoch": 0.056540302755854574, "grad_norm": 0.27734375, "learning_rate": 0.0009443964772923502, "loss": 1.9279, "step": 1311 }, { "epoch": 0.057532237891922196, "grad_norm": 0.291015625, "learning_rate": 0.0009434035572439993, "loss": 1.8501, "step": 1334 }, { "epoch": 0.058524173027989825, "grad_norm": 0.484375, "learning_rate": 0.0009424106371956484, "loss": 1.8732, "step": 1357 }, { "epoch": 0.059516108164057446, "grad_norm": 0.27734375, "learning_rate": 0.0009414177171472975, "loss": 1.8397, "step": 1380 }, { "epoch": 0.06050804330012507, "grad_norm": 0.361328125, "learning_rate": 0.0009404247970989467, "loss": 1.8288, "step": 1403 }, { "epoch": 0.0614999784361927, "grad_norm": 0.298828125, "learning_rate": 0.0009394318770505958, "loss": 1.8162, "step": 1426 }, { "epoch": 0.06249191357226032, "grad_norm": 0.27734375, "learning_rate": 0.0009384389570022449, "loss": 1.8204, "step": 1449 }, { "epoch": 0.06348384870832795, "grad_norm": 0.248046875, "learning_rate": 0.0009374460369538939, "loss": 1.8123, "step": 1472 }, { "epoch": 0.06447578384439556, "grad_norm": 0.275390625, "learning_rate": 0.0009364531169055431, "loss": 1.8225, "step": 1495 }, { "epoch": 0.06546771898046319, "grad_norm": 0.349609375, "learning_rate": 0.0009354601968571922, "loss": 1.7787, "step": 1518 }, { "epoch": 0.06645965411653082, "grad_norm": 0.3671875, "learning_rate": 0.0009344672768088414, "loss": 1.7703, "step": 1541 }, { "epoch": 0.06745158925259843, "grad_norm": 0.2578125, "learning_rate": 0.0009334743567604904, "loss": 1.7955, "step": 1564 }, { "epoch": 0.06844352438866606, "grad_norm": 0.392578125, "learning_rate": 0.0009324814367121396, "loss": 1.7675, "step": 1587 }, { "epoch": 0.06943545952473369, "grad_norm": 0.31640625, "learning_rate": 0.0009314885166637886, "loss": 1.7548, "step": 1610 }, { "epoch": 0.0704273946608013, "grad_norm": 0.255859375, "learning_rate": 0.0009304955966154378, "loss": 1.7549, "step": 1633 }, { "epoch": 0.07141932979686894, "grad_norm": 0.2578125, "learning_rate": 0.0009295026765670869, "loss": 1.7057, "step": 1656 }, { "epoch": 0.07241126493293656, "grad_norm": 0.3125, "learning_rate": 0.000928509756518736, "loss": 1.7128, "step": 1679 }, { "epoch": 0.07340320006900418, "grad_norm": 0.267578125, "learning_rate": 0.0009275168364703851, "loss": 1.7606, "step": 1702 }, { "epoch": 0.07439513520507181, "grad_norm": 0.271484375, "learning_rate": 0.0009265239164220342, "loss": 1.6989, "step": 1725 }, { "epoch": 0.07538707034113944, "grad_norm": 0.302734375, "learning_rate": 0.0009255309963736834, "loss": 1.7052, "step": 1748 }, { "epoch": 0.07637900547720705, "grad_norm": 0.28515625, "learning_rate": 0.0009245380763253324, "loss": 1.7021, "step": 1771 }, { "epoch": 0.07737094061327468, "grad_norm": 0.2314453125, "learning_rate": 0.0009235451562769816, "loss": 1.7096, "step": 1794 }, { "epoch": 0.07836287574934231, "grad_norm": 0.29296875, "learning_rate": 0.0009225522362286306, "loss": 1.6537, "step": 1817 }, { "epoch": 0.07935481088540992, "grad_norm": 0.26953125, "learning_rate": 0.0009215593161802797, "loss": 1.6359, "step": 1840 }, { "epoch": 0.08034674602147755, "grad_norm": 0.298828125, "learning_rate": 0.0009205663961319288, "loss": 1.6107, "step": 1863 }, { "epoch": 0.08133868115754518, "grad_norm": 0.271484375, "learning_rate": 0.000919573476083578, "loss": 1.6137, "step": 1886 }, { "epoch": 0.0823306162936128, "grad_norm": 0.3359375, "learning_rate": 0.0009185805560352271, "loss": 1.6428, "step": 1909 }, { "epoch": 0.08332255142968042, "grad_norm": 0.421875, "learning_rate": 0.0009175876359868762, "loss": 1.6146, "step": 1932 }, { "epoch": 0.08431448656574805, "grad_norm": 0.283203125, "learning_rate": 0.0009165947159385253, "loss": 1.5996, "step": 1955 }, { "epoch": 0.08530642170181567, "grad_norm": 0.3203125, "learning_rate": 0.0009156017958901744, "loss": 1.6153, "step": 1978 }, { "epoch": 0.0862983568378833, "grad_norm": 0.283203125, "learning_rate": 0.0009146088758418235, "loss": 1.6178, "step": 2001 }, { "epoch": 0.08729029197395093, "grad_norm": 0.3671875, "learning_rate": 0.0009136159557934727, "loss": 1.5988, "step": 2024 }, { "epoch": 0.08828222711001854, "grad_norm": 0.279296875, "learning_rate": 0.0009126230357451217, "loss": 1.578, "step": 2047 }, { "epoch": 0.08927416224608617, "grad_norm": 0.30859375, "learning_rate": 0.0009116301156967709, "loss": 1.5755, "step": 2070 }, { "epoch": 0.0902660973821538, "grad_norm": 0.24609375, "learning_rate": 0.0009106371956484199, "loss": 1.5772, "step": 2093 }, { "epoch": 0.09125803251822141, "grad_norm": 0.2734375, "learning_rate": 0.0009096442756000692, "loss": 1.5405, "step": 2116 }, { "epoch": 0.09224996765428904, "grad_norm": 0.333984375, "learning_rate": 0.0009086513555517182, "loss": 1.5458, "step": 2139 }, { "epoch": 0.09324190279035667, "grad_norm": 0.36328125, "learning_rate": 0.0009076584355033673, "loss": 1.5783, "step": 2162 }, { "epoch": 0.09423383792642429, "grad_norm": 0.314453125, "learning_rate": 0.0009066655154550164, "loss": 1.5237, "step": 2185 }, { "epoch": 0.09522577306249191, "grad_norm": 0.302734375, "learning_rate": 0.0009056725954066655, "loss": 1.5361, "step": 2208 }, { "epoch": 0.09621770819855954, "grad_norm": 0.28125, "learning_rate": 0.0009046796753583147, "loss": 1.5493, "step": 2231 }, { "epoch": 0.09720964333462716, "grad_norm": 0.369140625, "learning_rate": 0.0009036867553099638, "loss": 1.5243, "step": 2254 }, { "epoch": 0.09820157847069479, "grad_norm": 0.41796875, "learning_rate": 0.0009026938352616129, "loss": 1.5061, "step": 2277 }, { "epoch": 0.09919351360676242, "grad_norm": 0.3203125, "learning_rate": 0.000901700915213262, "loss": 1.5141, "step": 2300 }, { "epoch": 0.09996981066977186, "eval_runtime": 163.8773, "eval_samples_per_second": 610.212, "eval_steps_per_second": 7.628, "step": 2318 }, { "epoch": 0.10018544874283003, "grad_norm": 0.27734375, "learning_rate": 0.0009007079951649111, "loss": 1.5069, "step": 2323 }, { "epoch": 0.10117738387889766, "grad_norm": 0.310546875, "learning_rate": 0.0008997150751165601, "loss": 1.513, "step": 2346 }, { "epoch": 0.10216931901496529, "grad_norm": 0.33203125, "learning_rate": 0.0008987221550682093, "loss": 1.5015, "step": 2369 }, { "epoch": 0.1031612541510329, "grad_norm": 0.3359375, "learning_rate": 0.0008977292350198584, "loss": 1.4961, "step": 2392 }, { "epoch": 0.10415318928710053, "grad_norm": 0.271484375, "learning_rate": 0.0008967363149715075, "loss": 1.4802, "step": 2415 }, { "epoch": 0.10514512442316816, "grad_norm": 0.369140625, "learning_rate": 0.0008957433949231566, "loss": 1.4725, "step": 2438 }, { "epoch": 0.10613705955923577, "grad_norm": 0.23828125, "learning_rate": 0.0008947504748748057, "loss": 1.4698, "step": 2461 }, { "epoch": 0.1071289946953034, "grad_norm": 0.357421875, "learning_rate": 0.0008937575548264549, "loss": 1.4954, "step": 2484 }, { "epoch": 0.10812092983137103, "grad_norm": 0.43359375, "learning_rate": 0.000892764634778104, "loss": 1.4221, "step": 2507 }, { "epoch": 0.10911286496743865, "grad_norm": 0.376953125, "learning_rate": 0.000891771714729753, "loss": 1.4364, "step": 2530 }, { "epoch": 0.11010480010350628, "grad_norm": 0.29296875, "learning_rate": 0.0008907787946814022, "loss": 1.4479, "step": 2553 }, { "epoch": 0.1110967352395739, "grad_norm": 0.341796875, "learning_rate": 0.0008897858746330512, "loss": 1.4662, "step": 2576 }, { "epoch": 0.11208867037564152, "grad_norm": 0.31640625, "learning_rate": 0.0008887929545847005, "loss": 1.4463, "step": 2599 }, { "epoch": 0.11308060551170915, "grad_norm": 0.322265625, "learning_rate": 0.0008878000345363495, "loss": 1.4342, "step": 2622 }, { "epoch": 0.11407254064777678, "grad_norm": 0.283203125, "learning_rate": 0.0008868071144879987, "loss": 1.4457, "step": 2645 }, { "epoch": 0.11506447578384439, "grad_norm": 0.34765625, "learning_rate": 0.0008858141944396477, "loss": 1.4413, "step": 2668 }, { "epoch": 0.11605641091991202, "grad_norm": 0.35546875, "learning_rate": 0.000884821274391297, "loss": 1.4272, "step": 2691 }, { "epoch": 0.11704834605597965, "grad_norm": 0.31640625, "learning_rate": 0.000883828354342946, "loss": 1.4108, "step": 2714 }, { "epoch": 0.11804028119204726, "grad_norm": 0.3359375, "learning_rate": 0.0008828354342945951, "loss": 1.4226, "step": 2737 }, { "epoch": 0.11903221632811489, "grad_norm": 0.287109375, "learning_rate": 0.0008818425142462442, "loss": 1.4096, "step": 2760 }, { "epoch": 0.12002415146418252, "grad_norm": 0.3125, "learning_rate": 0.0008808495941978933, "loss": 1.3829, "step": 2783 }, { "epoch": 0.12101608660025014, "grad_norm": 0.396484375, "learning_rate": 0.0008798566741495424, "loss": 1.3841, "step": 2806 }, { "epoch": 0.12200802173631776, "grad_norm": 0.27734375, "learning_rate": 0.0008788637541011916, "loss": 1.4068, "step": 2829 }, { "epoch": 0.1229999568723854, "grad_norm": 0.380859375, "learning_rate": 0.0008778708340528407, "loss": 1.3872, "step": 2852 }, { "epoch": 0.12399189200845301, "grad_norm": 0.2890625, "learning_rate": 0.0008768779140044898, "loss": 1.3999, "step": 2875 }, { "epoch": 0.12498382714452064, "grad_norm": 0.3359375, "learning_rate": 0.0008758849939561388, "loss": 1.3889, "step": 2898 }, { "epoch": 0.12597576228058827, "grad_norm": 0.271484375, "learning_rate": 0.0008748920739077879, "loss": 1.3763, "step": 2921 }, { "epoch": 0.1269676974166559, "grad_norm": 0.328125, "learning_rate": 0.000873899153859437, "loss": 1.39, "step": 2944 }, { "epoch": 0.1279596325527235, "grad_norm": 0.3203125, "learning_rate": 0.0008729062338110862, "loss": 1.3858, "step": 2967 }, { "epoch": 0.12895156768879112, "grad_norm": 0.2734375, "learning_rate": 0.0008719133137627353, "loss": 1.3674, "step": 2990 }, { "epoch": 0.12994350282485875, "grad_norm": 0.30078125, "learning_rate": 0.0008709203937143844, "loss": 1.3458, "step": 3013 }, { "epoch": 0.13093543796092638, "grad_norm": 0.349609375, "learning_rate": 0.0008699274736660335, "loss": 1.3549, "step": 3036 }, { "epoch": 0.131927373096994, "grad_norm": 0.2890625, "learning_rate": 0.0008689345536176825, "loss": 1.3683, "step": 3059 }, { "epoch": 0.13291930823306164, "grad_norm": 0.298828125, "learning_rate": 0.0008679416335693318, "loss": 1.3625, "step": 3082 }, { "epoch": 0.13391124336912924, "grad_norm": 0.318359375, "learning_rate": 0.0008669487135209808, "loss": 1.3462, "step": 3105 }, { "epoch": 0.13490317850519687, "grad_norm": 0.34765625, "learning_rate": 0.00086595579347263, "loss": 1.345, "step": 3128 }, { "epoch": 0.1358951136412645, "grad_norm": 0.259765625, "learning_rate": 0.000864962873424279, "loss": 1.3552, "step": 3151 }, { "epoch": 0.13688704877733213, "grad_norm": 0.240234375, "learning_rate": 0.0008639699533759282, "loss": 1.3356, "step": 3174 }, { "epoch": 0.13787898391339976, "grad_norm": 0.255859375, "learning_rate": 0.0008629770333275773, "loss": 1.3373, "step": 3197 }, { "epoch": 0.13887091904946738, "grad_norm": 0.35546875, "learning_rate": 0.0008619841132792265, "loss": 1.3514, "step": 3220 }, { "epoch": 0.13986285418553498, "grad_norm": 0.310546875, "learning_rate": 0.0008609911932308755, "loss": 1.3291, "step": 3243 }, { "epoch": 0.1408547893216026, "grad_norm": 0.36328125, "learning_rate": 0.0008599982731825246, "loss": 1.3233, "step": 3266 }, { "epoch": 0.14184672445767024, "grad_norm": 0.38671875, "learning_rate": 0.0008590053531341737, "loss": 1.3174, "step": 3289 }, { "epoch": 0.14283865959373787, "grad_norm": 0.25, "learning_rate": 0.0008580124330858229, "loss": 1.3304, "step": 3312 }, { "epoch": 0.1438305947298055, "grad_norm": 0.3125, "learning_rate": 0.000857019513037472, "loss": 1.3287, "step": 3335 }, { "epoch": 0.14482252986587313, "grad_norm": 0.34375, "learning_rate": 0.0008560265929891211, "loss": 1.3201, "step": 3358 }, { "epoch": 0.14581446500194073, "grad_norm": 0.330078125, "learning_rate": 0.0008550336729407702, "loss": 1.3027, "step": 3381 }, { "epoch": 0.14680640013800836, "grad_norm": 0.34765625, "learning_rate": 0.0008540407528924193, "loss": 1.3128, "step": 3404 }, { "epoch": 0.147798335274076, "grad_norm": 0.3828125, "learning_rate": 0.0008530478328440683, "loss": 1.3182, "step": 3427 }, { "epoch": 0.14879027041014362, "grad_norm": 0.287109375, "learning_rate": 0.0008520549127957176, "loss": 1.2906, "step": 3450 }, { "epoch": 0.14978220554621124, "grad_norm": 0.271484375, "learning_rate": 0.0008510619927473666, "loss": 1.2837, "step": 3473 }, { "epoch": 0.15077414068227887, "grad_norm": 0.365234375, "learning_rate": 0.0008500690726990157, "loss": 1.3114, "step": 3496 }, { "epoch": 0.15176607581834647, "grad_norm": 0.365234375, "learning_rate": 0.0008490761526506648, "loss": 1.3047, "step": 3519 }, { "epoch": 0.1527580109544141, "grad_norm": 0.322265625, "learning_rate": 0.000848083232602314, "loss": 1.2654, "step": 3542 }, { "epoch": 0.15374994609048173, "grad_norm": 0.302734375, "learning_rate": 0.0008470903125539631, "loss": 1.2675, "step": 3565 }, { "epoch": 0.15474188122654936, "grad_norm": 0.3671875, "learning_rate": 0.0008460973925056121, "loss": 1.2671, "step": 3588 }, { "epoch": 0.155733816362617, "grad_norm": 0.26953125, "learning_rate": 0.0008451044724572613, "loss": 1.2719, "step": 3611 }, { "epoch": 0.15672575149868462, "grad_norm": 0.29296875, "learning_rate": 0.0008441115524089103, "loss": 1.2755, "step": 3634 }, { "epoch": 0.15771768663475222, "grad_norm": 0.318359375, "learning_rate": 0.0008431186323605595, "loss": 1.2876, "step": 3657 }, { "epoch": 0.15870962177081985, "grad_norm": 0.35546875, "learning_rate": 0.0008421257123122086, "loss": 1.2928, "step": 3680 }, { "epoch": 0.15970155690688748, "grad_norm": 0.3046875, "learning_rate": 0.0008411327922638578, "loss": 1.2855, "step": 3703 }, { "epoch": 0.1606934920429551, "grad_norm": 0.341796875, "learning_rate": 0.0008401398722155068, "loss": 1.2558, "step": 3726 }, { "epoch": 0.16168542717902273, "grad_norm": 0.34375, "learning_rate": 0.000839146952167156, "loss": 1.2561, "step": 3749 }, { "epoch": 0.16267736231509036, "grad_norm": 0.361328125, "learning_rate": 0.000838154032118805, "loss": 1.2851, "step": 3772 }, { "epoch": 0.16366929745115796, "grad_norm": 0.3359375, "learning_rate": 0.0008371611120704542, "loss": 1.2682, "step": 3795 }, { "epoch": 0.1646612325872256, "grad_norm": 0.33203125, "learning_rate": 0.0008361681920221033, "loss": 1.2709, "step": 3818 }, { "epoch": 0.16565316772329322, "grad_norm": 0.302734375, "learning_rate": 0.0008351752719737524, "loss": 1.2459, "step": 3841 }, { "epoch": 0.16664510285936085, "grad_norm": 0.3203125, "learning_rate": 0.0008341823519254015, "loss": 1.2134, "step": 3864 }, { "epoch": 0.16763703799542848, "grad_norm": 0.365234375, "learning_rate": 0.0008331894318770506, "loss": 1.2445, "step": 3887 }, { "epoch": 0.1686289731314961, "grad_norm": 0.306640625, "learning_rate": 0.0008321965118286998, "loss": 1.2522, "step": 3910 }, { "epoch": 0.1696209082675637, "grad_norm": 0.314453125, "learning_rate": 0.0008312035917803489, "loss": 1.2296, "step": 3933 }, { "epoch": 0.17061284340363134, "grad_norm": 0.33203125, "learning_rate": 0.0008302106717319979, "loss": 1.252, "step": 3956 }, { "epoch": 0.17160477853969897, "grad_norm": 0.29296875, "learning_rate": 0.0008292177516836471, "loss": 1.2501, "step": 3979 }, { "epoch": 0.1725967136757666, "grad_norm": 0.337890625, "learning_rate": 0.0008282248316352961, "loss": 1.2328, "step": 4002 }, { "epoch": 0.17358864881183422, "grad_norm": 0.306640625, "learning_rate": 0.0008272319115869454, "loss": 1.2182, "step": 4025 }, { "epoch": 0.17458058394790185, "grad_norm": 0.400390625, "learning_rate": 0.0008262389915385944, "loss": 1.2515, "step": 4048 }, { "epoch": 0.17557251908396945, "grad_norm": 0.32421875, "learning_rate": 0.0008252460714902435, "loss": 1.2076, "step": 4071 }, { "epoch": 0.17656445422003708, "grad_norm": 0.322265625, "learning_rate": 0.0008242531514418926, "loss": 1.2219, "step": 4094 }, { "epoch": 0.1775563893561047, "grad_norm": 0.2734375, "learning_rate": 0.0008232602313935416, "loss": 1.2307, "step": 4117 }, { "epoch": 0.17854832449217234, "grad_norm": 0.41015625, "learning_rate": 0.0008222673113451908, "loss": 1.2368, "step": 4140 }, { "epoch": 0.17954025962823997, "grad_norm": 0.322265625, "learning_rate": 0.0008212743912968399, "loss": 1.2332, "step": 4163 }, { "epoch": 0.1805321947643076, "grad_norm": 0.2734375, "learning_rate": 0.0008202814712484891, "loss": 1.2086, "step": 4186 }, { "epoch": 0.1815241299003752, "grad_norm": 0.4375, "learning_rate": 0.0008192885512001381, "loss": 1.2204, "step": 4209 }, { "epoch": 0.18251606503644283, "grad_norm": 0.365234375, "learning_rate": 0.0008182956311517873, "loss": 1.2379, "step": 4232 }, { "epoch": 0.18350800017251045, "grad_norm": 0.294921875, "learning_rate": 0.0008173027111034363, "loss": 1.2387, "step": 4255 }, { "epoch": 0.18449993530857808, "grad_norm": 0.28125, "learning_rate": 0.0008163097910550856, "loss": 1.2008, "step": 4278 }, { "epoch": 0.1854918704446457, "grad_norm": 0.28515625, "learning_rate": 0.0008153168710067346, "loss": 1.2191, "step": 4301 }, { "epoch": 0.18648380558071334, "grad_norm": 0.30859375, "learning_rate": 0.0008143239509583837, "loss": 1.2153, "step": 4324 }, { "epoch": 0.18747574071678094, "grad_norm": 0.404296875, "learning_rate": 0.0008133310309100328, "loss": 1.2171, "step": 4347 }, { "epoch": 0.18846767585284857, "grad_norm": 0.298828125, "learning_rate": 0.0008123381108616819, "loss": 1.2154, "step": 4370 }, { "epoch": 0.1894596109889162, "grad_norm": 0.353515625, "learning_rate": 0.0008113451908133311, "loss": 1.2055, "step": 4393 }, { "epoch": 0.19045154612498383, "grad_norm": 0.337890625, "learning_rate": 0.0008103522707649802, "loss": 1.2215, "step": 4416 }, { "epoch": 0.19144348126105146, "grad_norm": 0.3203125, "learning_rate": 0.0008093593507166293, "loss": 1.2035, "step": 4439 }, { "epoch": 0.19243541639711909, "grad_norm": 0.326171875, "learning_rate": 0.0008083664306682784, "loss": 1.183, "step": 4462 }, { "epoch": 0.1934273515331867, "grad_norm": 0.34765625, "learning_rate": 0.0008073735106199274, "loss": 1.1892, "step": 4485 }, { "epoch": 0.19441928666925432, "grad_norm": 0.337890625, "learning_rate": 0.0008063805905715767, "loss": 1.2006, "step": 4508 }, { "epoch": 0.19541122180532194, "grad_norm": 0.41796875, "learning_rate": 0.0008053876705232257, "loss": 1.2081, "step": 4531 }, { "epoch": 0.19640315694138957, "grad_norm": 0.357421875, "learning_rate": 0.0008043947504748749, "loss": 1.1878, "step": 4554 }, { "epoch": 0.1973950920774572, "grad_norm": 0.3125, "learning_rate": 0.0008034018304265239, "loss": 1.1701, "step": 4577 }, { "epoch": 0.19838702721352483, "grad_norm": 0.337890625, "learning_rate": 0.0008024089103781731, "loss": 1.179, "step": 4600 }, { "epoch": 0.19937896234959243, "grad_norm": 0.361328125, "learning_rate": 0.0008014159903298221, "loss": 1.1794, "step": 4623 }, { "epoch": 0.1999396213395437, "eval_runtime": 164.1241, "eval_samples_per_second": 609.295, "eval_steps_per_second": 7.616, "step": 4636 }, { "epoch": 0.20037089748566006, "grad_norm": 0.357421875, "learning_rate": 0.0008004230702814712, "loss": 1.181, "step": 4646 }, { "epoch": 0.2013628326217277, "grad_norm": 0.369140625, "learning_rate": 0.0007994301502331204, "loss": 1.1706, "step": 4669 }, { "epoch": 0.20235476775779532, "grad_norm": 0.30078125, "learning_rate": 0.0007984372301847694, "loss": 1.2005, "step": 4692 }, { "epoch": 0.20334670289386295, "grad_norm": 0.34375, "learning_rate": 0.0007974443101364186, "loss": 1.1784, "step": 4715 }, { "epoch": 0.20433863802993057, "grad_norm": 0.310546875, "learning_rate": 0.0007964513900880676, "loss": 1.1774, "step": 4738 }, { "epoch": 0.20533057316599818, "grad_norm": 0.26953125, "learning_rate": 0.0007954584700397169, "loss": 1.1589, "step": 4761 }, { "epoch": 0.2063225083020658, "grad_norm": 0.345703125, "learning_rate": 0.0007944655499913659, "loss": 1.1821, "step": 4784 }, { "epoch": 0.20731444343813343, "grad_norm": 0.287109375, "learning_rate": 0.0007934726299430151, "loss": 1.1496, "step": 4807 }, { "epoch": 0.20830637857420106, "grad_norm": 0.3671875, "learning_rate": 0.0007924797098946641, "loss": 1.1666, "step": 4830 }, { "epoch": 0.2092983137102687, "grad_norm": 0.3046875, "learning_rate": 0.0007914867898463132, "loss": 1.1546, "step": 4853 }, { "epoch": 0.21029024884633632, "grad_norm": 0.361328125, "learning_rate": 0.0007904938697979624, "loss": 1.2045, "step": 4876 }, { "epoch": 0.21128218398240392, "grad_norm": 0.56640625, "learning_rate": 0.0007895009497496115, "loss": 1.1757, "step": 4899 }, { "epoch": 0.21227411911847155, "grad_norm": 0.36328125, "learning_rate": 0.0007885080297012606, "loss": 1.1894, "step": 4922 }, { "epoch": 0.21326605425453918, "grad_norm": 0.3046875, "learning_rate": 0.0007875151096529097, "loss": 1.174, "step": 4945 }, { "epoch": 0.2142579893906068, "grad_norm": 0.322265625, "learning_rate": 0.0007865221896045588, "loss": 1.1423, "step": 4968 }, { "epoch": 0.21524992452667444, "grad_norm": 0.421875, "learning_rate": 0.000785529269556208, "loss": 1.1532, "step": 4991 }, { "epoch": 0.21624185966274206, "grad_norm": 0.3203125, "learning_rate": 0.000784536349507857, "loss": 1.1688, "step": 5014 }, { "epoch": 0.21723379479880967, "grad_norm": 0.2734375, "learning_rate": 0.0007835434294595062, "loss": 1.1446, "step": 5037 }, { "epoch": 0.2182257299348773, "grad_norm": 0.365234375, "learning_rate": 0.0007825505094111552, "loss": 1.1792, "step": 5060 }, { "epoch": 0.21921766507094492, "grad_norm": 0.33203125, "learning_rate": 0.0007815575893628044, "loss": 1.1516, "step": 5083 }, { "epoch": 0.22020960020701255, "grad_norm": 0.296875, "learning_rate": 0.0007805646693144534, "loss": 1.1712, "step": 5106 }, { "epoch": 0.22120153534308018, "grad_norm": 0.341796875, "learning_rate": 0.0007795717492661027, "loss": 1.1562, "step": 5129 }, { "epoch": 0.2221934704791478, "grad_norm": 0.470703125, "learning_rate": 0.0007785788292177517, "loss": 1.1641, "step": 5152 }, { "epoch": 0.2231854056152154, "grad_norm": 0.4765625, "learning_rate": 0.0007775859091694009, "loss": 1.1649, "step": 5175 }, { "epoch": 0.22417734075128304, "grad_norm": 0.333984375, "learning_rate": 0.0007765929891210499, "loss": 1.1684, "step": 5198 }, { "epoch": 0.22516927588735067, "grad_norm": 0.33984375, "learning_rate": 0.0007756000690726989, "loss": 1.1659, "step": 5221 }, { "epoch": 0.2261612110234183, "grad_norm": 0.419921875, "learning_rate": 0.0007746071490243482, "loss": 1.1683, "step": 5244 }, { "epoch": 0.22715314615948592, "grad_norm": 0.33203125, "learning_rate": 0.0007736142289759972, "loss": 1.1536, "step": 5267 }, { "epoch": 0.22814508129555355, "grad_norm": 0.3203125, "learning_rate": 0.0007726213089276464, "loss": 1.1451, "step": 5290 }, { "epoch": 0.22913701643162115, "grad_norm": 0.35546875, "learning_rate": 0.0007716283888792954, "loss": 1.1453, "step": 5313 }, { "epoch": 0.23012895156768878, "grad_norm": 0.298828125, "learning_rate": 0.0007706354688309446, "loss": 1.1585, "step": 5336 }, { "epoch": 0.2311208867037564, "grad_norm": 0.3828125, "learning_rate": 0.0007696425487825937, "loss": 1.1585, "step": 5359 }, { "epoch": 0.23211282183982404, "grad_norm": 0.298828125, "learning_rate": 0.0007686496287342428, "loss": 1.1396, "step": 5382 }, { "epoch": 0.23310475697589167, "grad_norm": 0.35546875, "learning_rate": 0.0007676567086858919, "loss": 1.1527, "step": 5405 }, { "epoch": 0.2340966921119593, "grad_norm": 0.310546875, "learning_rate": 0.000766663788637541, "loss": 1.1157, "step": 5428 }, { "epoch": 0.2350886272480269, "grad_norm": 0.31640625, "learning_rate": 0.0007656708685891901, "loss": 1.1232, "step": 5451 }, { "epoch": 0.23608056238409453, "grad_norm": 0.3359375, "learning_rate": 0.0007646779485408393, "loss": 1.1411, "step": 5474 }, { "epoch": 0.23707249752016216, "grad_norm": 0.314453125, "learning_rate": 0.0007636850284924884, "loss": 1.1329, "step": 5497 }, { "epoch": 0.23806443265622979, "grad_norm": 0.314453125, "learning_rate": 0.0007626921084441375, "loss": 1.1262, "step": 5520 }, { "epoch": 0.23905636779229741, "grad_norm": 0.3359375, "learning_rate": 0.0007616991883957865, "loss": 1.1346, "step": 5543 }, { "epoch": 0.24004830292836504, "grad_norm": 0.306640625, "learning_rate": 0.0007607062683474357, "loss": 1.139, "step": 5566 }, { "epoch": 0.24104023806443264, "grad_norm": 0.396484375, "learning_rate": 0.0007597133482990847, "loss": 1.1318, "step": 5589 }, { "epoch": 0.24203217320050027, "grad_norm": 0.35546875, "learning_rate": 0.000758720428250734, "loss": 1.1178, "step": 5612 }, { "epoch": 0.2430241083365679, "grad_norm": 0.357421875, "learning_rate": 0.000757727508202383, "loss": 1.1312, "step": 5635 }, { "epoch": 0.24401604347263553, "grad_norm": 0.373046875, "learning_rate": 0.0007567345881540322, "loss": 1.1263, "step": 5658 }, { "epoch": 0.24500797860870316, "grad_norm": 0.306640625, "learning_rate": 0.0007557416681056812, "loss": 1.1303, "step": 5681 }, { "epoch": 0.2459999137447708, "grad_norm": 0.412109375, "learning_rate": 0.0007547487480573305, "loss": 1.1365, "step": 5704 }, { "epoch": 0.2469918488808384, "grad_norm": 0.375, "learning_rate": 0.0007537558280089795, "loss": 1.1239, "step": 5727 }, { "epoch": 0.24798378401690602, "grad_norm": 0.33984375, "learning_rate": 0.0007527629079606286, "loss": 1.1212, "step": 5750 }, { "epoch": 0.24897571915297365, "grad_norm": 0.396484375, "learning_rate": 0.0007517699879122777, "loss": 1.1128, "step": 5773 }, { "epoch": 0.24996765428904127, "grad_norm": 0.345703125, "learning_rate": 0.0007507770678639267, "loss": 1.1153, "step": 5796 }, { "epoch": 0.2509595894251089, "grad_norm": 0.337890625, "learning_rate": 0.0007497841478155759, "loss": 1.1191, "step": 5819 }, { "epoch": 0.25195152456117653, "grad_norm": 0.357421875, "learning_rate": 0.000748791227767225, "loss": 1.1115, "step": 5842 }, { "epoch": 0.25294345969724413, "grad_norm": 0.484375, "learning_rate": 0.0007477983077188742, "loss": 1.1377, "step": 5865 }, { "epoch": 0.2539353948333118, "grad_norm": 0.3671875, "learning_rate": 0.0007468053876705232, "loss": 1.1345, "step": 5888 }, { "epoch": 0.2549273299693794, "grad_norm": 0.3359375, "learning_rate": 0.0007458124676221723, "loss": 1.1226, "step": 5911 }, { "epoch": 0.255919265105447, "grad_norm": 0.361328125, "learning_rate": 0.0007448195475738214, "loss": 1.1163, "step": 5934 }, { "epoch": 0.25691120024151465, "grad_norm": 0.361328125, "learning_rate": 0.0007438266275254706, "loss": 1.1255, "step": 5957 }, { "epoch": 0.25790313537758225, "grad_norm": 0.322265625, "learning_rate": 0.0007428337074771197, "loss": 1.1049, "step": 5980 }, { "epoch": 0.2588950705136499, "grad_norm": 0.427734375, "learning_rate": 0.0007418407874287688, "loss": 1.1294, "step": 6003 }, { "epoch": 0.2598870056497175, "grad_norm": 0.34765625, "learning_rate": 0.0007408478673804179, "loss": 1.1167, "step": 6026 }, { "epoch": 0.26087894078578516, "grad_norm": 0.388671875, "learning_rate": 0.000739854947332067, "loss": 1.1041, "step": 6049 }, { "epoch": 0.26187087592185276, "grad_norm": 0.302734375, "learning_rate": 0.000738862027283716, "loss": 1.1146, "step": 6072 }, { "epoch": 0.26286281105792036, "grad_norm": 0.341796875, "learning_rate": 0.0007378691072353653, "loss": 1.104, "step": 6095 }, { "epoch": 0.263854746193988, "grad_norm": 0.416015625, "learning_rate": 0.0007368761871870143, "loss": 1.1091, "step": 6118 }, { "epoch": 0.2648466813300556, "grad_norm": 0.326171875, "learning_rate": 0.0007358832671386635, "loss": 1.1295, "step": 6141 }, { "epoch": 0.2658386164661233, "grad_norm": 0.341796875, "learning_rate": 0.0007348903470903125, "loss": 1.0913, "step": 6164 }, { "epoch": 0.2668305516021909, "grad_norm": 0.32421875, "learning_rate": 0.0007338974270419618, "loss": 1.0806, "step": 6187 }, { "epoch": 0.2678224867382585, "grad_norm": 0.380859375, "learning_rate": 0.0007329045069936108, "loss": 1.099, "step": 6210 }, { "epoch": 0.26881442187432614, "grad_norm": 0.453125, "learning_rate": 0.00073191158694526, "loss": 1.0982, "step": 6233 }, { "epoch": 0.26980635701039374, "grad_norm": 0.3515625, "learning_rate": 0.000730918666896909, "loss": 1.093, "step": 6256 }, { "epoch": 0.2707982921464614, "grad_norm": 0.3359375, "learning_rate": 0.0007299257468485581, "loss": 1.0773, "step": 6279 }, { "epoch": 0.271790227282529, "grad_norm": 0.33984375, "learning_rate": 0.0007289328268002072, "loss": 1.101, "step": 6302 }, { "epoch": 0.27278216241859665, "grad_norm": 0.328125, "learning_rate": 0.0007279399067518564, "loss": 1.1084, "step": 6325 }, { "epoch": 0.27377409755466425, "grad_norm": 0.359375, "learning_rate": 0.0007269469867035055, "loss": 1.0869, "step": 6348 }, { "epoch": 0.27476603269073185, "grad_norm": 0.3203125, "learning_rate": 0.0007259540666551545, "loss": 1.088, "step": 6371 }, { "epoch": 0.2757579678267995, "grad_norm": 0.3984375, "learning_rate": 0.0007249611466068037, "loss": 1.1002, "step": 6394 }, { "epoch": 0.2767499029628671, "grad_norm": 0.30859375, "learning_rate": 0.0007239682265584527, "loss": 1.083, "step": 6417 }, { "epoch": 0.27774183809893477, "grad_norm": 0.306640625, "learning_rate": 0.0007229753065101019, "loss": 1.0884, "step": 6440 }, { "epoch": 0.27873377323500237, "grad_norm": 0.41015625, "learning_rate": 0.000721982386461751, "loss": 1.092, "step": 6463 }, { "epoch": 0.27972570837106997, "grad_norm": 0.3359375, "learning_rate": 0.0007209894664134001, "loss": 1.0736, "step": 6486 }, { "epoch": 0.2807176435071376, "grad_norm": 0.353515625, "learning_rate": 0.0007199965463650492, "loss": 1.091, "step": 6509 }, { "epoch": 0.2817095786432052, "grad_norm": 0.34765625, "learning_rate": 0.0007190036263166983, "loss": 1.0893, "step": 6532 }, { "epoch": 0.2827015137792729, "grad_norm": 0.41015625, "learning_rate": 0.0007180107062683475, "loss": 1.0936, "step": 6555 }, { "epoch": 0.2836934489153405, "grad_norm": 0.423828125, "learning_rate": 0.0007170177862199966, "loss": 1.0955, "step": 6578 }, { "epoch": 0.28468538405140814, "grad_norm": 0.34765625, "learning_rate": 0.0007160248661716456, "loss": 1.1016, "step": 6601 }, { "epoch": 0.28567731918747574, "grad_norm": 0.34765625, "learning_rate": 0.0007150319461232948, "loss": 1.102, "step": 6624 }, { "epoch": 0.28666925432354334, "grad_norm": 0.404296875, "learning_rate": 0.0007140390260749438, "loss": 1.08, "step": 6647 }, { "epoch": 0.287661189459611, "grad_norm": 0.44140625, "learning_rate": 0.000713046106026593, "loss": 1.0785, "step": 6670 }, { "epoch": 0.2886531245956786, "grad_norm": 0.392578125, "learning_rate": 0.0007120531859782421, "loss": 1.0944, "step": 6693 }, { "epoch": 0.28964505973174626, "grad_norm": 0.34375, "learning_rate": 0.0007110602659298913, "loss": 1.0851, "step": 6716 }, { "epoch": 0.29063699486781386, "grad_norm": 0.40234375, "learning_rate": 0.0007100673458815403, "loss": 1.0942, "step": 6739 }, { "epoch": 0.29162893000388146, "grad_norm": 0.42578125, "learning_rate": 0.0007090744258331895, "loss": 1.0831, "step": 6762 }, { "epoch": 0.2926208651399491, "grad_norm": 0.408203125, "learning_rate": 0.0007080815057848385, "loss": 1.0724, "step": 6785 }, { "epoch": 0.2936128002760167, "grad_norm": 0.31640625, "learning_rate": 0.0007070885857364877, "loss": 1.0875, "step": 6808 }, { "epoch": 0.2946047354120844, "grad_norm": 0.466796875, "learning_rate": 0.0007060956656881368, "loss": 1.0896, "step": 6831 }, { "epoch": 0.295596670548152, "grad_norm": 0.3984375, "learning_rate": 0.0007051027456397859, "loss": 1.0702, "step": 6854 }, { "epoch": 0.29658860568421963, "grad_norm": 0.390625, "learning_rate": 0.000704109825591435, "loss": 1.0887, "step": 6877 }, { "epoch": 0.29758054082028723, "grad_norm": 0.341796875, "learning_rate": 0.0007031169055430841, "loss": 1.0708, "step": 6900 }, { "epoch": 0.29857247595635483, "grad_norm": 0.41796875, "learning_rate": 0.0007021239854947333, "loss": 1.0828, "step": 6923 }, { "epoch": 0.2995644110924225, "grad_norm": 0.33984375, "learning_rate": 0.0007011310654463823, "loss": 1.061, "step": 6946 }, { "epoch": 0.2999094320093156, "eval_runtime": 164.053, "eval_samples_per_second": 609.559, "eval_steps_per_second": 7.619, "step": 6954 }, { "epoch": 0.3005563462284901, "grad_norm": 0.345703125, "learning_rate": 0.0007001381453980314, "loss": 1.0634, "step": 6969 }, { "epoch": 0.30154828136455775, "grad_norm": 0.359375, "learning_rate": 0.0006991452253496805, "loss": 1.071, "step": 6992 }, { "epoch": 0.30254021650062535, "grad_norm": 0.3828125, "learning_rate": 0.0006981523053013296, "loss": 1.0822, "step": 7015 }, { "epoch": 0.30353215163669295, "grad_norm": 0.4140625, "learning_rate": 0.0006971593852529788, "loss": 1.0618, "step": 7038 }, { "epoch": 0.3045240867727606, "grad_norm": 0.322265625, "learning_rate": 0.0006961664652046279, "loss": 1.0722, "step": 7061 }, { "epoch": 0.3055160219088282, "grad_norm": 0.3359375, "learning_rate": 0.000695173545156277, "loss": 1.0651, "step": 7084 }, { "epoch": 0.30650795704489586, "grad_norm": 0.34765625, "learning_rate": 0.0006941806251079261, "loss": 1.0648, "step": 7107 }, { "epoch": 0.30749989218096346, "grad_norm": 0.359375, "learning_rate": 0.0006931877050595752, "loss": 1.0615, "step": 7130 }, { "epoch": 0.3084918273170311, "grad_norm": 0.349609375, "learning_rate": 0.0006921947850112244, "loss": 1.0673, "step": 7153 }, { "epoch": 0.3094837624530987, "grad_norm": 0.38671875, "learning_rate": 0.0006912018649628734, "loss": 1.0596, "step": 7176 }, { "epoch": 0.3104756975891663, "grad_norm": 0.337890625, "learning_rate": 0.0006902089449145226, "loss": 1.0665, "step": 7199 }, { "epoch": 0.311467632725234, "grad_norm": 0.388671875, "learning_rate": 0.0006892160248661716, "loss": 1.0784, "step": 7222 }, { "epoch": 0.3124595678613016, "grad_norm": 0.33203125, "learning_rate": 0.0006882231048178208, "loss": 1.0737, "step": 7245 }, { "epoch": 0.31345150299736924, "grad_norm": 0.326171875, "learning_rate": 0.0006872301847694698, "loss": 1.0647, "step": 7268 }, { "epoch": 0.31444343813343684, "grad_norm": 0.33984375, "learning_rate": 0.0006862372647211191, "loss": 1.0859, "step": 7291 }, { "epoch": 0.31543537326950444, "grad_norm": 0.40625, "learning_rate": 0.0006852443446727681, "loss": 1.0703, "step": 7314 }, { "epoch": 0.3164273084055721, "grad_norm": 0.37890625, "learning_rate": 0.0006842514246244172, "loss": 1.0581, "step": 7337 }, { "epoch": 0.3174192435416397, "grad_norm": 0.408203125, "learning_rate": 0.0006832585045760663, "loss": 1.0557, "step": 7360 }, { "epoch": 0.31841117867770735, "grad_norm": 0.3203125, "learning_rate": 0.0006822655845277154, "loss": 1.0853, "step": 7383 }, { "epoch": 0.31940311381377495, "grad_norm": 0.4296875, "learning_rate": 0.0006812726644793646, "loss": 1.0602, "step": 7406 }, { "epoch": 0.3203950489498426, "grad_norm": 0.41015625, "learning_rate": 0.0006802797444310137, "loss": 1.0611, "step": 7429 }, { "epoch": 0.3213869840859102, "grad_norm": 0.3203125, "learning_rate": 0.0006792868243826628, "loss": 1.0703, "step": 7452 }, { "epoch": 0.3223789192219778, "grad_norm": 0.44140625, "learning_rate": 0.0006782939043343119, "loss": 1.0357, "step": 7475 }, { "epoch": 0.32337085435804547, "grad_norm": 0.345703125, "learning_rate": 0.0006773009842859609, "loss": 1.0556, "step": 7498 }, { "epoch": 0.32436278949411307, "grad_norm": 0.37109375, "learning_rate": 0.00067630806423761, "loss": 1.0561, "step": 7521 }, { "epoch": 0.3253547246301807, "grad_norm": 0.341796875, "learning_rate": 0.0006753151441892592, "loss": 1.0693, "step": 7544 }, { "epoch": 0.3263466597662483, "grad_norm": 0.48046875, "learning_rate": 0.0006743222241409083, "loss": 1.0668, "step": 7567 }, { "epoch": 0.3273385949023159, "grad_norm": 0.33984375, "learning_rate": 0.0006733293040925574, "loss": 1.038, "step": 7590 }, { "epoch": 0.3283305300383836, "grad_norm": 0.34765625, "learning_rate": 0.0006723363840442065, "loss": 1.056, "step": 7613 }, { "epoch": 0.3293224651744512, "grad_norm": 0.376953125, "learning_rate": 0.0006713434639958557, "loss": 1.0444, "step": 7636 }, { "epoch": 0.33031440031051884, "grad_norm": 0.34375, "learning_rate": 0.0006703505439475048, "loss": 1.0534, "step": 7659 }, { "epoch": 0.33130633544658644, "grad_norm": 0.341796875, "learning_rate": 0.0006693576238991539, "loss": 1.0597, "step": 7682 }, { "epoch": 0.3322982705826541, "grad_norm": 0.41796875, "learning_rate": 0.0006683647038508029, "loss": 1.0305, "step": 7705 }, { "epoch": 0.3332902057187217, "grad_norm": 0.3515625, "learning_rate": 0.0006673717838024521, "loss": 1.0632, "step": 7728 }, { "epoch": 0.3342821408547893, "grad_norm": 0.341796875, "learning_rate": 0.0006663788637541011, "loss": 1.0462, "step": 7751 }, { "epoch": 0.33527407599085696, "grad_norm": 0.423828125, "learning_rate": 0.0006653859437057504, "loss": 1.0534, "step": 7774 }, { "epoch": 0.33626601112692456, "grad_norm": 0.37109375, "learning_rate": 0.0006643930236573994, "loss": 1.0468, "step": 7797 }, { "epoch": 0.3372579462629922, "grad_norm": 0.37890625, "learning_rate": 0.0006634001036090486, "loss": 1.0473, "step": 7820 }, { "epoch": 0.3382498813990598, "grad_norm": 0.349609375, "learning_rate": 0.0006624071835606976, "loss": 1.0402, "step": 7843 }, { "epoch": 0.3392418165351274, "grad_norm": 0.384765625, "learning_rate": 0.0006614142635123467, "loss": 1.0713, "step": 7866 }, { "epoch": 0.3402337516711951, "grad_norm": 0.388671875, "learning_rate": 0.0006604213434639959, "loss": 1.0549, "step": 7889 }, { "epoch": 0.3412256868072627, "grad_norm": 0.3359375, "learning_rate": 0.000659428423415645, "loss": 1.0527, "step": 7912 }, { "epoch": 0.34221762194333033, "grad_norm": 0.421875, "learning_rate": 0.0006584355033672941, "loss": 1.0599, "step": 7935 }, { "epoch": 0.34320955707939793, "grad_norm": 0.322265625, "learning_rate": 0.0006574425833189432, "loss": 1.0503, "step": 7958 }, { "epoch": 0.3442014922154656, "grad_norm": 0.388671875, "learning_rate": 0.0006564496632705923, "loss": 1.0465, "step": 7981 }, { "epoch": 0.3451934273515332, "grad_norm": 0.314453125, "learning_rate": 0.0006554567432222415, "loss": 1.0246, "step": 8004 }, { "epoch": 0.3461853624876008, "grad_norm": 0.36328125, "learning_rate": 0.0006544638231738905, "loss": 1.0463, "step": 8027 }, { "epoch": 0.34717729762366845, "grad_norm": 0.42578125, "learning_rate": 0.0006534709031255397, "loss": 1.0471, "step": 8050 }, { "epoch": 0.34816923275973605, "grad_norm": 0.470703125, "learning_rate": 0.0006524779830771887, "loss": 1.0414, "step": 8073 }, { "epoch": 0.3491611678958037, "grad_norm": 0.515625, "learning_rate": 0.0006514850630288378, "loss": 1.0431, "step": 8096 }, { "epoch": 0.3501531030318713, "grad_norm": 0.37109375, "learning_rate": 0.000650492142980487, "loss": 1.0382, "step": 8119 }, { "epoch": 0.3511450381679389, "grad_norm": 0.39453125, "learning_rate": 0.0006494992229321361, "loss": 1.0557, "step": 8142 }, { "epoch": 0.35213697330400656, "grad_norm": 0.376953125, "learning_rate": 0.0006485063028837852, "loss": 1.0414, "step": 8165 }, { "epoch": 0.35312890844007416, "grad_norm": 0.365234375, "learning_rate": 0.0006475133828354343, "loss": 1.0221, "step": 8188 }, { "epoch": 0.3541208435761418, "grad_norm": 0.384765625, "learning_rate": 0.0006465204627870834, "loss": 1.0207, "step": 8211 }, { "epoch": 0.3551127787122094, "grad_norm": 0.486328125, "learning_rate": 0.0006455275427387324, "loss": 1.0399, "step": 8234 }, { "epoch": 0.3561047138482771, "grad_norm": 0.43359375, "learning_rate": 0.0006445346226903817, "loss": 1.0256, "step": 8257 }, { "epoch": 0.3570966489843447, "grad_norm": 0.30859375, "learning_rate": 0.0006435417026420307, "loss": 1.0452, "step": 8280 }, { "epoch": 0.3580885841204123, "grad_norm": 0.412109375, "learning_rate": 0.0006425487825936799, "loss": 1.0388, "step": 8303 }, { "epoch": 0.35908051925647994, "grad_norm": 0.3828125, "learning_rate": 0.0006415558625453289, "loss": 1.0401, "step": 8326 }, { "epoch": 0.36007245439254754, "grad_norm": 0.474609375, "learning_rate": 0.0006405629424969782, "loss": 1.0345, "step": 8349 }, { "epoch": 0.3610643895286152, "grad_norm": 0.375, "learning_rate": 0.0006395700224486272, "loss": 1.0362, "step": 8372 }, { "epoch": 0.3620563246646828, "grad_norm": 0.3984375, "learning_rate": 0.0006385771024002763, "loss": 1.045, "step": 8395 }, { "epoch": 0.3630482598007504, "grad_norm": 0.357421875, "learning_rate": 0.0006375841823519254, "loss": 1.0417, "step": 8418 }, { "epoch": 0.36404019493681805, "grad_norm": 0.306640625, "learning_rate": 0.0006365912623035745, "loss": 1.0388, "step": 8441 }, { "epoch": 0.36503213007288565, "grad_norm": 0.34765625, "learning_rate": 0.0006355983422552236, "loss": 1.0303, "step": 8464 }, { "epoch": 0.3660240652089533, "grad_norm": 0.3359375, "learning_rate": 0.0006346054222068728, "loss": 1.017, "step": 8487 }, { "epoch": 0.3670160003450209, "grad_norm": 0.384765625, "learning_rate": 0.0006336125021585219, "loss": 1.0274, "step": 8510 }, { "epoch": 0.36800793548108857, "grad_norm": 0.357421875, "learning_rate": 0.000632619582110171, "loss": 1.0257, "step": 8533 }, { "epoch": 0.36899987061715617, "grad_norm": 0.33203125, "learning_rate": 0.0006316266620618201, "loss": 1.0389, "step": 8556 }, { "epoch": 0.36999180575322377, "grad_norm": 0.369140625, "learning_rate": 0.0006306337420134692, "loss": 1.0295, "step": 8579 }, { "epoch": 0.3709837408892914, "grad_norm": 0.4140625, "learning_rate": 0.0006296408219651183, "loss": 1.045, "step": 8602 }, { "epoch": 0.371975676025359, "grad_norm": 0.392578125, "learning_rate": 0.0006286479019167675, "loss": 1.0581, "step": 8625 }, { "epoch": 0.3729676111614267, "grad_norm": 0.44140625, "learning_rate": 0.0006276549818684165, "loss": 1.0521, "step": 8648 }, { "epoch": 0.3739595462974943, "grad_norm": 0.36328125, "learning_rate": 0.0006266620618200657, "loss": 1.0355, "step": 8671 }, { "epoch": 0.3749514814335619, "grad_norm": 0.349609375, "learning_rate": 0.0006256691417717147, "loss": 1.0366, "step": 8694 }, { "epoch": 0.37594341656962954, "grad_norm": 0.349609375, "learning_rate": 0.0006246762217233639, "loss": 1.0148, "step": 8717 }, { "epoch": 0.37693535170569714, "grad_norm": 0.421875, "learning_rate": 0.000623683301675013, "loss": 1.0401, "step": 8740 }, { "epoch": 0.3779272868417648, "grad_norm": 0.3203125, "learning_rate": 0.000622690381626662, "loss": 1.0346, "step": 8763 }, { "epoch": 0.3789192219778324, "grad_norm": 0.34375, "learning_rate": 0.0006216974615783112, "loss": 1.0218, "step": 8786 }, { "epoch": 0.37991115711390006, "grad_norm": 0.427734375, "learning_rate": 0.0006207045415299602, "loss": 1.0405, "step": 8809 }, { "epoch": 0.38090309224996766, "grad_norm": 0.392578125, "learning_rate": 0.0006197116214816095, "loss": 1.0253, "step": 8832 }, { "epoch": 0.38189502738603526, "grad_norm": 0.345703125, "learning_rate": 0.0006187187014332585, "loss": 1.0409, "step": 8855 }, { "epoch": 0.3828869625221029, "grad_norm": 0.349609375, "learning_rate": 0.0006177257813849077, "loss": 1.0349, "step": 8878 }, { "epoch": 0.3838788976581705, "grad_norm": 0.49609375, "learning_rate": 0.0006167328613365567, "loss": 1.029, "step": 8901 }, { "epoch": 0.38487083279423817, "grad_norm": 0.447265625, "learning_rate": 0.0006157399412882058, "loss": 1.0338, "step": 8924 }, { "epoch": 0.3858627679303058, "grad_norm": 0.416015625, "learning_rate": 0.0006147470212398549, "loss": 1.0335, "step": 8947 }, { "epoch": 0.3868547030663734, "grad_norm": 0.40234375, "learning_rate": 0.0006137541011915041, "loss": 1.0512, "step": 8970 }, { "epoch": 0.38784663820244103, "grad_norm": 0.388671875, "learning_rate": 0.0006127611811431532, "loss": 1.0187, "step": 8993 }, { "epoch": 0.38883857333850863, "grad_norm": 0.298828125, "learning_rate": 0.0006117682610948023, "loss": 1.0321, "step": 9016 }, { "epoch": 0.3898305084745763, "grad_norm": 0.353515625, "learning_rate": 0.0006107753410464514, "loss": 1.0314, "step": 9039 }, { "epoch": 0.3908224436106439, "grad_norm": 0.3984375, "learning_rate": 0.0006097824209981005, "loss": 1.011, "step": 9062 }, { "epoch": 0.39181437874671154, "grad_norm": 0.37890625, "learning_rate": 0.0006087895009497497, "loss": 1.0127, "step": 9085 }, { "epoch": 0.39280631388277915, "grad_norm": 0.40625, "learning_rate": 0.0006077965809013988, "loss": 1.0045, "step": 9108 }, { "epoch": 0.39379824901884675, "grad_norm": 0.349609375, "learning_rate": 0.0006068036608530478, "loss": 0.9966, "step": 9131 }, { "epoch": 0.3947901841549144, "grad_norm": 0.318359375, "learning_rate": 0.000605810740804697, "loss": 1.0235, "step": 9154 }, { "epoch": 0.395782119290982, "grad_norm": 0.427734375, "learning_rate": 0.000604817820756346, "loss": 1.0448, "step": 9177 }, { "epoch": 0.39677405442704966, "grad_norm": 0.337890625, "learning_rate": 0.0006038249007079953, "loss": 1.0197, "step": 9200 }, { "epoch": 0.39776598956311726, "grad_norm": 0.447265625, "learning_rate": 0.0006028319806596443, "loss": 1.0093, "step": 9223 }, { "epoch": 0.39875792469918486, "grad_norm": 0.412109375, "learning_rate": 0.0006018390606112935, "loss": 1.014, "step": 9246 }, { "epoch": 0.3997498598352525, "grad_norm": 0.462890625, "learning_rate": 0.0006008461405629425, "loss": 1.0269, "step": 9269 }, { "epoch": 0.3998792426790874, "eval_runtime": 163.9225, "eval_samples_per_second": 610.045, "eval_steps_per_second": 7.626, "step": 9272 }, { "epoch": 0.4007417949713201, "grad_norm": 0.408203125, "learning_rate": 0.0005998532205145915, "loss": 1.0191, "step": 9292 }, { "epoch": 0.4017337301073878, "grad_norm": 0.46484375, "learning_rate": 0.0005988603004662408, "loss": 1.0074, "step": 9315 }, { "epoch": 0.4027256652434554, "grad_norm": 0.353515625, "learning_rate": 0.0005978673804178898, "loss": 1.0052, "step": 9338 }, { "epoch": 0.40371760037952303, "grad_norm": 0.361328125, "learning_rate": 0.000596874460369539, "loss": 1.0248, "step": 9361 }, { "epoch": 0.40470953551559063, "grad_norm": 0.376953125, "learning_rate": 0.000595881540321188, "loss": 1.0328, "step": 9384 }, { "epoch": 0.40570147065165824, "grad_norm": 0.4453125, "learning_rate": 0.0005948886202728372, "loss": 1.0254, "step": 9407 }, { "epoch": 0.4066934057877259, "grad_norm": 0.302734375, "learning_rate": 0.0005938957002244862, "loss": 1.0285, "step": 9430 }, { "epoch": 0.4076853409237935, "grad_norm": 0.439453125, "learning_rate": 0.0005929027801761354, "loss": 1.0077, "step": 9453 }, { "epoch": 0.40867727605986115, "grad_norm": 0.3515625, "learning_rate": 0.0005919098601277845, "loss": 1.0181, "step": 9476 }, { "epoch": 0.40966921119592875, "grad_norm": 0.33203125, "learning_rate": 0.0005909169400794336, "loss": 1.0427, "step": 9499 }, { "epoch": 0.41066114633199635, "grad_norm": 0.380859375, "learning_rate": 0.0005899240200310827, "loss": 1.0133, "step": 9522 }, { "epoch": 0.411653081468064, "grad_norm": 0.40234375, "learning_rate": 0.0005889310999827318, "loss": 1.0187, "step": 9545 }, { "epoch": 0.4126450166041316, "grad_norm": 0.44921875, "learning_rate": 0.000587938179934381, "loss": 1.0091, "step": 9568 }, { "epoch": 0.41363695174019927, "grad_norm": 0.37109375, "learning_rate": 0.0005869452598860301, "loss": 1.002, "step": 9591 }, { "epoch": 0.41462888687626687, "grad_norm": 0.47265625, "learning_rate": 0.0005859523398376792, "loss": 1.0321, "step": 9614 }, { "epoch": 0.4156208220123345, "grad_norm": 0.4453125, "learning_rate": 0.0005849594197893283, "loss": 1.0244, "step": 9637 }, { "epoch": 0.4166127571484021, "grad_norm": 0.4375, "learning_rate": 0.0005839664997409773, "loss": 1.0345, "step": 9660 }, { "epoch": 0.4176046922844697, "grad_norm": 0.357421875, "learning_rate": 0.0005829735796926266, "loss": 1.0157, "step": 9683 }, { "epoch": 0.4185966274205374, "grad_norm": 0.3515625, "learning_rate": 0.0005819806596442756, "loss": 0.9774, "step": 9706 }, { "epoch": 0.419588562556605, "grad_norm": 0.408203125, "learning_rate": 0.0005809877395959248, "loss": 1.0011, "step": 9729 }, { "epoch": 0.42058049769267264, "grad_norm": 0.41015625, "learning_rate": 0.0005799948195475738, "loss": 1.0148, "step": 9752 }, { "epoch": 0.42157243282874024, "grad_norm": 0.53515625, "learning_rate": 0.000579001899499223, "loss": 1.0058, "step": 9775 }, { "epoch": 0.42256436796480784, "grad_norm": 0.37890625, "learning_rate": 0.000578008979450872, "loss": 1.0241, "step": 9798 }, { "epoch": 0.4235563031008755, "grad_norm": 0.38671875, "learning_rate": 0.0005770160594025212, "loss": 1.0067, "step": 9821 }, { "epoch": 0.4245482382369431, "grad_norm": 0.3359375, "learning_rate": 0.0005760231393541703, "loss": 1.0105, "step": 9844 }, { "epoch": 0.42554017337301075, "grad_norm": 0.40625, "learning_rate": 0.0005750302193058193, "loss": 1.0226, "step": 9867 }, { "epoch": 0.42653210850907836, "grad_norm": 0.392578125, "learning_rate": 0.0005740372992574685, "loss": 1.0243, "step": 9890 }, { "epoch": 0.427524043645146, "grad_norm": 0.419921875, "learning_rate": 0.0005730443792091175, "loss": 1.0128, "step": 9913 }, { "epoch": 0.4285159787812136, "grad_norm": 0.373046875, "learning_rate": 0.0005720514591607668, "loss": 0.9993, "step": 9936 }, { "epoch": 0.4295079139172812, "grad_norm": 0.365234375, "learning_rate": 0.0005710585391124158, "loss": 1.012, "step": 9959 }, { "epoch": 0.43049984905334887, "grad_norm": 0.302734375, "learning_rate": 0.000570065619064065, "loss": 1.0178, "step": 9982 }, { "epoch": 0.43149178418941647, "grad_norm": 0.3671875, "learning_rate": 0.000569072699015714, "loss": 1.0192, "step": 10005 }, { "epoch": 0.43248371932548413, "grad_norm": 0.345703125, "learning_rate": 0.0005680797789673631, "loss": 0.9992, "step": 10028 }, { "epoch": 0.43347565446155173, "grad_norm": 0.4140625, "learning_rate": 0.0005670868589190123, "loss": 1.0033, "step": 10051 }, { "epoch": 0.43446758959761933, "grad_norm": 0.359375, "learning_rate": 0.0005660939388706614, "loss": 1.0153, "step": 10074 }, { "epoch": 0.435459524733687, "grad_norm": 0.36328125, "learning_rate": 0.0005651010188223105, "loss": 0.997, "step": 10097 }, { "epoch": 0.4364514598697546, "grad_norm": 0.3671875, "learning_rate": 0.0005641080987739596, "loss": 1.0147, "step": 10120 }, { "epoch": 0.43744339500582224, "grad_norm": 0.3515625, "learning_rate": 0.0005631151787256087, "loss": 1.0046, "step": 10143 }, { "epoch": 0.43843533014188985, "grad_norm": 0.37890625, "learning_rate": 0.0005621222586772579, "loss": 1.0008, "step": 10166 }, { "epoch": 0.4394272652779575, "grad_norm": 0.41796875, "learning_rate": 0.0005611293386289069, "loss": 1.0315, "step": 10189 }, { "epoch": 0.4404192004140251, "grad_norm": 0.365234375, "learning_rate": 0.0005601364185805561, "loss": 1.0164, "step": 10212 }, { "epoch": 0.4414111355500927, "grad_norm": 0.361328125, "learning_rate": 0.0005591434985322051, "loss": 1.0044, "step": 10235 }, { "epoch": 0.44240307068616036, "grad_norm": 0.439453125, "learning_rate": 0.0005581505784838543, "loss": 1.0236, "step": 10258 }, { "epoch": 0.44339500582222796, "grad_norm": 0.341796875, "learning_rate": 0.0005571576584355034, "loss": 1.0156, "step": 10281 }, { "epoch": 0.4443869409582956, "grad_norm": 0.373046875, "learning_rate": 0.0005561647383871526, "loss": 0.9916, "step": 10304 }, { "epoch": 0.4453788760943632, "grad_norm": 0.33203125, "learning_rate": 0.0005551718183388016, "loss": 0.9961, "step": 10327 }, { "epoch": 0.4463708112304308, "grad_norm": 0.392578125, "learning_rate": 0.0005541788982904507, "loss": 1.0021, "step": 10350 }, { "epoch": 0.4473627463664985, "grad_norm": 0.375, "learning_rate": 0.0005531859782420998, "loss": 1.0219, "step": 10373 }, { "epoch": 0.4483546815025661, "grad_norm": 0.4140625, "learning_rate": 0.000552193058193749, "loss": 0.9982, "step": 10396 }, { "epoch": 0.44934661663863373, "grad_norm": 0.392578125, "learning_rate": 0.0005512001381453981, "loss": 0.994, "step": 10419 }, { "epoch": 0.45033855177470133, "grad_norm": 0.34765625, "learning_rate": 0.0005502072180970471, "loss": 0.9899, "step": 10442 }, { "epoch": 0.451330486910769, "grad_norm": 0.3828125, "learning_rate": 0.0005492142980486963, "loss": 1.0096, "step": 10465 }, { "epoch": 0.4523224220468366, "grad_norm": 0.3984375, "learning_rate": 0.0005482213780003453, "loss": 0.9882, "step": 10488 }, { "epoch": 0.4533143571829042, "grad_norm": 0.390625, "learning_rate": 0.0005472284579519945, "loss": 0.999, "step": 10511 }, { "epoch": 0.45430629231897185, "grad_norm": 0.3984375, "learning_rate": 0.0005462355379036436, "loss": 1.0087, "step": 10534 }, { "epoch": 0.45529822745503945, "grad_norm": 0.388671875, "learning_rate": 0.0005452426178552927, "loss": 0.9985, "step": 10557 }, { "epoch": 0.4562901625911071, "grad_norm": 0.455078125, "learning_rate": 0.0005442496978069418, "loss": 1.0104, "step": 10580 }, { "epoch": 0.4572820977271747, "grad_norm": 0.61328125, "learning_rate": 0.0005432567777585909, "loss": 1.0056, "step": 10603 }, { "epoch": 0.4582740328632423, "grad_norm": 0.3359375, "learning_rate": 0.00054226385771024, "loss": 1.0115, "step": 10626 }, { "epoch": 0.45926596799930997, "grad_norm": 0.3515625, "learning_rate": 0.0005412709376618892, "loss": 1.0143, "step": 10649 }, { "epoch": 0.46025790313537757, "grad_norm": 0.388671875, "learning_rate": 0.0005402780176135383, "loss": 0.9916, "step": 10672 }, { "epoch": 0.4612498382714452, "grad_norm": 0.396484375, "learning_rate": 0.0005392850975651874, "loss": 0.9967, "step": 10695 }, { "epoch": 0.4622417734075128, "grad_norm": 0.41796875, "learning_rate": 0.0005382921775168364, "loss": 1.0009, "step": 10718 }, { "epoch": 0.4632337085435805, "grad_norm": 0.34765625, "learning_rate": 0.0005372992574684856, "loss": 0.9919, "step": 10741 }, { "epoch": 0.4642256436796481, "grad_norm": 0.3515625, "learning_rate": 0.0005363063374201347, "loss": 1.0128, "step": 10764 }, { "epoch": 0.4652175788157157, "grad_norm": 0.400390625, "learning_rate": 0.0005353134173717839, "loss": 0.9982, "step": 10787 }, { "epoch": 0.46620951395178334, "grad_norm": 0.3515625, "learning_rate": 0.0005343204973234329, "loss": 0.9998, "step": 10810 }, { "epoch": 0.46720144908785094, "grad_norm": 0.5390625, "learning_rate": 0.0005333275772750821, "loss": 1.0177, "step": 10833 }, { "epoch": 0.4681933842239186, "grad_norm": 0.37890625, "learning_rate": 0.0005323346572267311, "loss": 0.9899, "step": 10856 }, { "epoch": 0.4691853193599862, "grad_norm": 0.38671875, "learning_rate": 0.0005313417371783802, "loss": 1.0052, "step": 10879 }, { "epoch": 0.4701772544960538, "grad_norm": 0.36328125, "learning_rate": 0.0005303488171300294, "loss": 0.9741, "step": 10902 }, { "epoch": 0.47116918963212145, "grad_norm": 0.4453125, "learning_rate": 0.0005293558970816785, "loss": 1.0021, "step": 10925 }, { "epoch": 0.47216112476818906, "grad_norm": 0.322265625, "learning_rate": 0.0005283629770333276, "loss": 0.9896, "step": 10948 }, { "epoch": 0.4731530599042567, "grad_norm": 0.36328125, "learning_rate": 0.0005273700569849767, "loss": 1.0046, "step": 10971 }, { "epoch": 0.4741449950403243, "grad_norm": 0.345703125, "learning_rate": 0.0005263771369366258, "loss": 1.0004, "step": 10994 }, { "epoch": 0.47513693017639197, "grad_norm": 0.357421875, "learning_rate": 0.0005253842168882749, "loss": 1.0031, "step": 11017 }, { "epoch": 0.47612886531245957, "grad_norm": 0.359375, "learning_rate": 0.0005243912968399241, "loss": 1.007, "step": 11040 }, { "epoch": 0.47712080044852717, "grad_norm": 0.38671875, "learning_rate": 0.0005233983767915731, "loss": 1.0046, "step": 11063 }, { "epoch": 0.47811273558459483, "grad_norm": 0.341796875, "learning_rate": 0.0005224054567432222, "loss": 0.9956, "step": 11086 }, { "epoch": 0.47910467072066243, "grad_norm": 0.3515625, "learning_rate": 0.0005214125366948713, "loss": 1.01, "step": 11109 }, { "epoch": 0.4800966058567301, "grad_norm": 0.431640625, "learning_rate": 0.0005204196166465205, "loss": 1.0211, "step": 11132 }, { "epoch": 0.4810885409927977, "grad_norm": 0.375, "learning_rate": 0.0005194266965981696, "loss": 1.0039, "step": 11155 }, { "epoch": 0.4820804761288653, "grad_norm": 0.392578125, "learning_rate": 0.0005184337765498187, "loss": 0.9886, "step": 11178 }, { "epoch": 0.48307241126493294, "grad_norm": 0.515625, "learning_rate": 0.0005174408565014678, "loss": 0.9973, "step": 11201 }, { "epoch": 0.48406434640100054, "grad_norm": 0.396484375, "learning_rate": 0.0005164479364531169, "loss": 1.013, "step": 11224 }, { "epoch": 0.4850562815370682, "grad_norm": 0.451171875, "learning_rate": 0.000515455016404766, "loss": 0.9876, "step": 11247 }, { "epoch": 0.4860482166731358, "grad_norm": 0.375, "learning_rate": 0.0005144620963564152, "loss": 0.9984, "step": 11270 }, { "epoch": 0.48704015180920346, "grad_norm": 0.416015625, "learning_rate": 0.0005134691763080642, "loss": 0.9875, "step": 11293 }, { "epoch": 0.48803208694527106, "grad_norm": 0.369140625, "learning_rate": 0.0005124762562597134, "loss": 0.9954, "step": 11316 }, { "epoch": 0.48902402208133866, "grad_norm": 0.337890625, "learning_rate": 0.0005114833362113624, "loss": 0.9825, "step": 11339 }, { "epoch": 0.4900159572174063, "grad_norm": 0.37890625, "learning_rate": 0.0005104904161630117, "loss": 0.9983, "step": 11362 }, { "epoch": 0.4910078923534739, "grad_norm": 0.328125, "learning_rate": 0.0005094974961146607, "loss": 0.9818, "step": 11385 }, { "epoch": 0.4919998274895416, "grad_norm": 0.357421875, "learning_rate": 0.0005085045760663098, "loss": 0.9928, "step": 11408 }, { "epoch": 0.4929917626256092, "grad_norm": 0.4921875, "learning_rate": 0.0005075116560179589, "loss": 0.9771, "step": 11431 }, { "epoch": 0.4939836977616768, "grad_norm": 0.341796875, "learning_rate": 0.000506518735969608, "loss": 1.0059, "step": 11454 }, { "epoch": 0.49497563289774443, "grad_norm": 0.400390625, "learning_rate": 0.0005055258159212571, "loss": 1.0058, "step": 11477 }, { "epoch": 0.49596756803381203, "grad_norm": 0.328125, "learning_rate": 0.0005045328958729063, "loss": 0.9962, "step": 11500 }, { "epoch": 0.4969595031698797, "grad_norm": 0.326171875, "learning_rate": 0.0005035399758245554, "loss": 0.9828, "step": 11523 }, { "epoch": 0.4979514383059473, "grad_norm": 0.5078125, "learning_rate": 0.0005025470557762045, "loss": 0.9881, "step": 11546 }, { "epoch": 0.49894337344201495, "grad_norm": 0.43359375, "learning_rate": 0.0005015541357278536, "loss": 0.9863, "step": 11569 }, { "epoch": 0.49984905334885926, "eval_runtime": 163.9862, "eval_samples_per_second": 609.807, "eval_steps_per_second": 7.623, "step": 11590 }, { "epoch": 0.49993530857808255, "grad_norm": 0.353515625, "learning_rate": 0.0005005612156795026, "loss": 0.9764, "step": 11592 }, { "epoch": 0.5009272437141502, "grad_norm": 0.36328125, "learning_rate": 0.0004995682956311518, "loss": 0.9923, "step": 11615 }, { "epoch": 0.5019191788502178, "grad_norm": 0.39453125, "learning_rate": 0.0004985753755828009, "loss": 0.9738, "step": 11638 }, { "epoch": 0.5029111139862854, "grad_norm": 0.48828125, "learning_rate": 0.00049758245553445, "loss": 0.973, "step": 11661 }, { "epoch": 0.5039030491223531, "grad_norm": 0.384765625, "learning_rate": 0.0004965895354860991, "loss": 0.9741, "step": 11684 }, { "epoch": 0.5048949842584207, "grad_norm": 0.359375, "learning_rate": 0.0004955966154377482, "loss": 0.9842, "step": 11707 }, { "epoch": 0.5058869193944883, "grad_norm": 0.443359375, "learning_rate": 0.0004946036953893974, "loss": 0.9927, "step": 11730 }, { "epoch": 0.5068788545305559, "grad_norm": 0.44921875, "learning_rate": 0.0004936107753410465, "loss": 0.9921, "step": 11753 }, { "epoch": 0.5078707896666236, "grad_norm": 0.40625, "learning_rate": 0.0004926178552926956, "loss": 0.9827, "step": 11776 }, { "epoch": 0.5088627248026911, "grad_norm": 0.416015625, "learning_rate": 0.0004916249352443447, "loss": 0.9836, "step": 11799 }, { "epoch": 0.5098546599387588, "grad_norm": 0.36328125, "learning_rate": 0.0004906320151959938, "loss": 0.9783, "step": 11822 }, { "epoch": 0.5108465950748264, "grad_norm": 0.357421875, "learning_rate": 0.0004896390951476428, "loss": 1.0003, "step": 11845 }, { "epoch": 0.511838530210894, "grad_norm": 0.39453125, "learning_rate": 0.000488646175099292, "loss": 0.995, "step": 11868 }, { "epoch": 0.5128304653469616, "grad_norm": 0.376953125, "learning_rate": 0.0004876532550509411, "loss": 0.9952, "step": 11891 }, { "epoch": 0.5138224004830293, "grad_norm": 0.3828125, "learning_rate": 0.0004866603350025902, "loss": 0.9912, "step": 11914 }, { "epoch": 0.514814335619097, "grad_norm": 0.34375, "learning_rate": 0.00048566741495423933, "loss": 0.995, "step": 11937 }, { "epoch": 0.5158062707551645, "grad_norm": 0.408203125, "learning_rate": 0.00048467449490588845, "loss": 0.9856, "step": 11960 }, { "epoch": 0.5167982058912322, "grad_norm": 0.427734375, "learning_rate": 0.00048368157485753757, "loss": 0.9887, "step": 11983 }, { "epoch": 0.5177901410272998, "grad_norm": 0.376953125, "learning_rate": 0.0004826886548091867, "loss": 0.9815, "step": 12006 }, { "epoch": 0.5187820761633674, "grad_norm": 0.369140625, "learning_rate": 0.00048169573476083575, "loss": 0.9944, "step": 12029 }, { "epoch": 0.519774011299435, "grad_norm": 0.427734375, "learning_rate": 0.0004807028147124849, "loss": 0.9714, "step": 12052 }, { "epoch": 0.5207659464355027, "grad_norm": 0.326171875, "learning_rate": 0.000479709894664134, "loss": 0.9849, "step": 12075 }, { "epoch": 0.5217578815715703, "grad_norm": 0.427734375, "learning_rate": 0.0004787169746157831, "loss": 0.9861, "step": 12098 }, { "epoch": 0.5227498167076379, "grad_norm": 0.4765625, "learning_rate": 0.00047772405456743223, "loss": 1.0009, "step": 12121 }, { "epoch": 0.5237417518437055, "grad_norm": 0.345703125, "learning_rate": 0.00047673113451908135, "loss": 0.9892, "step": 12144 }, { "epoch": 0.5247336869797732, "grad_norm": 0.345703125, "learning_rate": 0.00047573821447073047, "loss": 0.9843, "step": 12167 }, { "epoch": 0.5257256221158407, "grad_norm": 0.40234375, "learning_rate": 0.0004747452944223796, "loss": 0.9767, "step": 12190 }, { "epoch": 0.5267175572519084, "grad_norm": 0.359375, "learning_rate": 0.00047375237437402866, "loss": 0.9599, "step": 12213 }, { "epoch": 0.527709492387976, "grad_norm": 0.388671875, "learning_rate": 0.0004727594543256778, "loss": 0.9797, "step": 12236 }, { "epoch": 0.5287014275240437, "grad_norm": 0.3359375, "learning_rate": 0.0004717665342773269, "loss": 0.9952, "step": 12259 }, { "epoch": 0.5296933626601112, "grad_norm": 0.359375, "learning_rate": 0.000470773614228976, "loss": 0.9851, "step": 12282 }, { "epoch": 0.5306852977961789, "grad_norm": 0.4140625, "learning_rate": 0.00046978069418062514, "loss": 0.9728, "step": 12305 }, { "epoch": 0.5316772329322466, "grad_norm": 0.376953125, "learning_rate": 0.00046878777413227426, "loss": 0.9813, "step": 12328 }, { "epoch": 0.5326691680683141, "grad_norm": 0.326171875, "learning_rate": 0.0004677948540839234, "loss": 0.9729, "step": 12351 }, { "epoch": 0.5336611032043818, "grad_norm": 0.33203125, "learning_rate": 0.0004668019340355725, "loss": 0.969, "step": 12374 }, { "epoch": 0.5346530383404494, "grad_norm": 0.43359375, "learning_rate": 0.00046580901398722156, "loss": 0.9786, "step": 12397 }, { "epoch": 0.535644973476517, "grad_norm": 0.388671875, "learning_rate": 0.00046481609393887063, "loss": 0.9773, "step": 12420 }, { "epoch": 0.5366369086125846, "grad_norm": 0.451171875, "learning_rate": 0.00046382317389051975, "loss": 0.9972, "step": 12443 }, { "epoch": 0.5376288437486523, "grad_norm": 0.408203125, "learning_rate": 0.00046283025384216887, "loss": 0.9893, "step": 12466 }, { "epoch": 0.5386207788847199, "grad_norm": 0.400390625, "learning_rate": 0.000461837333793818, "loss": 0.9747, "step": 12489 }, { "epoch": 0.5396127140207875, "grad_norm": 0.4921875, "learning_rate": 0.0004608444137454671, "loss": 0.9795, "step": 12512 }, { "epoch": 0.5406046491568551, "grad_norm": 0.37109375, "learning_rate": 0.00045985149369711623, "loss": 0.9608, "step": 12535 }, { "epoch": 0.5415965842929228, "grad_norm": 0.3515625, "learning_rate": 0.00045885857364876535, "loss": 0.966, "step": 12558 }, { "epoch": 0.5425885194289903, "grad_norm": 0.361328125, "learning_rate": 0.0004578656536004144, "loss": 0.9689, "step": 12581 }, { "epoch": 0.543580454565058, "grad_norm": 0.404296875, "learning_rate": 0.00045687273355206353, "loss": 0.9655, "step": 12604 }, { "epoch": 0.5445723897011256, "grad_norm": 0.37890625, "learning_rate": 0.00045587981350371265, "loss": 0.9693, "step": 12627 }, { "epoch": 0.5455643248371933, "grad_norm": 0.41015625, "learning_rate": 0.00045488689345536177, "loss": 0.9986, "step": 12650 }, { "epoch": 0.5465562599732608, "grad_norm": 0.345703125, "learning_rate": 0.0004538939734070109, "loss": 0.9715, "step": 12673 }, { "epoch": 0.5475481951093285, "grad_norm": 0.37890625, "learning_rate": 0.00045290105335866, "loss": 0.9781, "step": 12696 }, { "epoch": 0.5485401302453962, "grad_norm": 0.42578125, "learning_rate": 0.00045190813331030913, "loss": 1.0001, "step": 12719 }, { "epoch": 0.5495320653814637, "grad_norm": 0.43359375, "learning_rate": 0.0004509152132619582, "loss": 0.9811, "step": 12742 }, { "epoch": 0.5505240005175314, "grad_norm": 0.341796875, "learning_rate": 0.0004499222932136073, "loss": 0.9584, "step": 12765 }, { "epoch": 0.551515935653599, "grad_norm": 0.419921875, "learning_rate": 0.00044892937316525644, "loss": 0.977, "step": 12788 }, { "epoch": 0.5525078707896667, "grad_norm": 0.416015625, "learning_rate": 0.00044793645311690556, "loss": 0.9746, "step": 12811 }, { "epoch": 0.5534998059257342, "grad_norm": 0.390625, "learning_rate": 0.0004469435330685547, "loss": 0.9811, "step": 12834 }, { "epoch": 0.5544917410618019, "grad_norm": 0.35546875, "learning_rate": 0.0004459506130202038, "loss": 0.9523, "step": 12857 }, { "epoch": 0.5554836761978695, "grad_norm": 0.37890625, "learning_rate": 0.0004449576929718529, "loss": 0.9641, "step": 12880 }, { "epoch": 0.5564756113339371, "grad_norm": 0.36328125, "learning_rate": 0.00044396477292350204, "loss": 0.9845, "step": 12903 }, { "epoch": 0.5574675464700047, "grad_norm": 0.365234375, "learning_rate": 0.0004429718528751511, "loss": 0.9788, "step": 12926 }, { "epoch": 0.5584594816060724, "grad_norm": 0.390625, "learning_rate": 0.0004419789328268002, "loss": 0.9795, "step": 12949 }, { "epoch": 0.5594514167421399, "grad_norm": 0.37109375, "learning_rate": 0.00044098601277844934, "loss": 0.9716, "step": 12972 }, { "epoch": 0.5604433518782076, "grad_norm": 0.38671875, "learning_rate": 0.00043999309273009846, "loss": 0.9814, "step": 12995 }, { "epoch": 0.5614352870142753, "grad_norm": 0.34765625, "learning_rate": 0.00043900017268174753, "loss": 0.9724, "step": 13018 }, { "epoch": 0.5624272221503429, "grad_norm": 0.44921875, "learning_rate": 0.00043800725263339665, "loss": 0.9538, "step": 13041 }, { "epoch": 0.5634191572864105, "grad_norm": 0.3828125, "learning_rate": 0.00043701433258504577, "loss": 0.9744, "step": 13064 }, { "epoch": 0.5644110924224781, "grad_norm": 0.423828125, "learning_rate": 0.0004360214125366949, "loss": 0.9777, "step": 13087 }, { "epoch": 0.5654030275585458, "grad_norm": 0.365234375, "learning_rate": 0.00043502849248834395, "loss": 0.9688, "step": 13110 }, { "epoch": 0.5663949626946133, "grad_norm": 0.470703125, "learning_rate": 0.00043403557243999307, "loss": 0.988, "step": 13133 }, { "epoch": 0.567386897830681, "grad_norm": 0.341796875, "learning_rate": 0.0004330426523916422, "loss": 0.9678, "step": 13156 }, { "epoch": 0.5683788329667486, "grad_norm": 0.345703125, "learning_rate": 0.0004320497323432913, "loss": 0.9735, "step": 13179 }, { "epoch": 0.5693707681028163, "grad_norm": 0.416015625, "learning_rate": 0.00043105681229494043, "loss": 0.9612, "step": 13202 }, { "epoch": 0.5703627032388838, "grad_norm": 0.375, "learning_rate": 0.00043006389224658955, "loss": 0.9428, "step": 13225 }, { "epoch": 0.5713546383749515, "grad_norm": 0.4296875, "learning_rate": 0.00042907097219823867, "loss": 0.9654, "step": 13248 }, { "epoch": 0.5723465735110191, "grad_norm": 0.353515625, "learning_rate": 0.0004280780521498878, "loss": 0.9739, "step": 13271 }, { "epoch": 0.5733385086470867, "grad_norm": 0.380859375, "learning_rate": 0.00042708513210153686, "loss": 0.9755, "step": 13294 }, { "epoch": 0.5743304437831543, "grad_norm": 0.357421875, "learning_rate": 0.000426092212053186, "loss": 0.9784, "step": 13317 }, { "epoch": 0.575322378919222, "grad_norm": 0.3125, "learning_rate": 0.0004250992920048351, "loss": 0.9625, "step": 13340 }, { "epoch": 0.5763143140552897, "grad_norm": 0.345703125, "learning_rate": 0.0004241063719564842, "loss": 0.9521, "step": 13363 }, { "epoch": 0.5773062491913572, "grad_norm": 0.333984375, "learning_rate": 0.00042311345190813334, "loss": 0.984, "step": 13386 }, { "epoch": 0.5782981843274249, "grad_norm": 0.45703125, "learning_rate": 0.00042212053185978246, "loss": 0.9794, "step": 13409 }, { "epoch": 0.5792901194634925, "grad_norm": 0.396484375, "learning_rate": 0.0004211276118114316, "loss": 0.9705, "step": 13432 }, { "epoch": 0.5802820545995601, "grad_norm": 0.400390625, "learning_rate": 0.00042013469176308064, "loss": 0.97, "step": 13455 }, { "epoch": 0.5812739897356277, "grad_norm": 0.37890625, "learning_rate": 0.00041914177171472976, "loss": 0.968, "step": 13478 }, { "epoch": 0.5822659248716954, "grad_norm": 0.365234375, "learning_rate": 0.0004181488516663789, "loss": 0.9664, "step": 13501 }, { "epoch": 0.5832578600077629, "grad_norm": 0.361328125, "learning_rate": 0.000417155931618028, "loss": 0.9722, "step": 13524 }, { "epoch": 0.5842497951438306, "grad_norm": 0.369140625, "learning_rate": 0.0004161630115696771, "loss": 0.9695, "step": 13547 }, { "epoch": 0.5852417302798982, "grad_norm": 0.337890625, "learning_rate": 0.00041517009152132624, "loss": 0.9628, "step": 13570 }, { "epoch": 0.5862336654159659, "grad_norm": 0.330078125, "learning_rate": 0.0004141771714729753, "loss": 0.9515, "step": 13593 }, { "epoch": 0.5872256005520334, "grad_norm": 0.359375, "learning_rate": 0.0004131842514246244, "loss": 0.965, "step": 13616 }, { "epoch": 0.5882175356881011, "grad_norm": 0.392578125, "learning_rate": 0.0004121913313762735, "loss": 0.9598, "step": 13639 }, { "epoch": 0.5892094708241687, "grad_norm": 0.41796875, "learning_rate": 0.0004111984113279226, "loss": 0.9575, "step": 13662 }, { "epoch": 0.5902014059602363, "grad_norm": 0.5234375, "learning_rate": 0.00041020549127957173, "loss": 0.9933, "step": 13685 }, { "epoch": 0.591193341096304, "grad_norm": 0.423828125, "learning_rate": 0.00040921257123122085, "loss": 0.9621, "step": 13708 }, { "epoch": 0.5921852762323716, "grad_norm": 0.33203125, "learning_rate": 0.00040821965118286997, "loss": 0.964, "step": 13731 }, { "epoch": 0.5931772113684393, "grad_norm": 0.423828125, "learning_rate": 0.0004072267311345191, "loss": 0.9854, "step": 13754 }, { "epoch": 0.5941691465045068, "grad_norm": 0.3515625, "learning_rate": 0.0004062338110861682, "loss": 0.9883, "step": 13777 }, { "epoch": 0.5951610816405745, "grad_norm": 0.408203125, "learning_rate": 0.00040524089103781733, "loss": 0.9853, "step": 13800 }, { "epoch": 0.5961530167766421, "grad_norm": 0.408203125, "learning_rate": 0.0004042479709894664, "loss": 0.9557, "step": 13823 }, { "epoch": 0.5971449519127097, "grad_norm": 0.42578125, "learning_rate": 0.0004032550509411155, "loss": 0.9587, "step": 13846 }, { "epoch": 0.5981368870487773, "grad_norm": 0.44921875, "learning_rate": 0.00040226213089276464, "loss": 0.9771, "step": 13869 }, { "epoch": 0.599128822184845, "grad_norm": 0.431640625, "learning_rate": 0.00040126921084441376, "loss": 0.9661, "step": 13892 }, { "epoch": 0.5998188640186312, "eval_runtime": 163.7921, "eval_samples_per_second": 610.53, "eval_steps_per_second": 7.632, "step": 13908 } ], "logging_steps": 23, "max_steps": 23187, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2318, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0167159364234772e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }