{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5998188640186312,
"eval_steps": 2318,
"global_step": 13908,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009919351360676242,
"grad_norm": 0.91796875,
"learning_rate": 0.001,
"loss": 9.0124,
"step": 23
},
{
"epoch": 0.0019838702721352484,
"grad_norm": 0.1376953125,
"learning_rate": 0.0009990070799516491,
"loss": 4.9678,
"step": 46
},
{
"epoch": 0.0029758054082028723,
"grad_norm": 0.10400390625,
"learning_rate": 0.0009980141599032983,
"loss": 4.8546,
"step": 69
},
{
"epoch": 0.003967740544270497,
"grad_norm": 0.2109375,
"learning_rate": 0.0009970212398549474,
"loss": 4.7284,
"step": 92
},
{
"epoch": 0.00495967568033812,
"grad_norm": 0.201171875,
"learning_rate": 0.0009960283198065965,
"loss": 4.5063,
"step": 115
},
{
"epoch": 0.005951610816405745,
"grad_norm": 0.302734375,
"learning_rate": 0.0009950353997582456,
"loss": 4.2107,
"step": 138
},
{
"epoch": 0.006943545952473369,
"grad_norm": 0.2314453125,
"learning_rate": 0.0009940424797098947,
"loss": 4.0923,
"step": 161
},
{
"epoch": 0.007935481088540993,
"grad_norm": 0.310546875,
"learning_rate": 0.0009930495596615439,
"loss": 3.7695,
"step": 184
},
{
"epoch": 0.008927416224608617,
"grad_norm": 0.30078125,
"learning_rate": 0.000992056639613193,
"loss": 3.6712,
"step": 207
},
{
"epoch": 0.00991935136067624,
"grad_norm": 0.322265625,
"learning_rate": 0.0009910637195648419,
"loss": 3.5194,
"step": 230
},
{
"epoch": 0.010911286496743866,
"grad_norm": 0.314453125,
"learning_rate": 0.0009900707995164912,
"loss": 3.4792,
"step": 253
},
{
"epoch": 0.01190322163281149,
"grad_norm": 0.296875,
"learning_rate": 0.0009890778794681401,
"loss": 3.3372,
"step": 276
},
{
"epoch": 0.012895156768879113,
"grad_norm": 0.310546875,
"learning_rate": 0.0009880849594197895,
"loss": 3.2479,
"step": 299
},
{
"epoch": 0.013887091904946738,
"grad_norm": 0.34765625,
"learning_rate": 0.0009870920393714384,
"loss": 3.2265,
"step": 322
},
{
"epoch": 0.014879027041014362,
"grad_norm": 0.3515625,
"learning_rate": 0.0009860991193230877,
"loss": 3.1277,
"step": 345
},
{
"epoch": 0.015870962177081987,
"grad_norm": 0.2578125,
"learning_rate": 0.0009851061992747366,
"loss": 3.0156,
"step": 368
},
{
"epoch": 0.01686289731314961,
"grad_norm": 0.2578125,
"learning_rate": 0.0009841132792263857,
"loss": 2.9304,
"step": 391
},
{
"epoch": 0.017854832449217234,
"grad_norm": 0.486328125,
"learning_rate": 0.0009831203591780348,
"loss": 2.9364,
"step": 414
},
{
"epoch": 0.01884676758528486,
"grad_norm": 0.37890625,
"learning_rate": 0.000982127439129684,
"loss": 2.8848,
"step": 437
},
{
"epoch": 0.01983870272135248,
"grad_norm": 0.28125,
"learning_rate": 0.000981134519081333,
"loss": 2.8368,
"step": 460
},
{
"epoch": 0.020830637857420106,
"grad_norm": 0.310546875,
"learning_rate": 0.0009801415990329822,
"loss": 2.8024,
"step": 483
},
{
"epoch": 0.02182257299348773,
"grad_norm": 0.498046875,
"learning_rate": 0.0009791486789846313,
"loss": 2.7152,
"step": 506
},
{
"epoch": 0.022814508129555353,
"grad_norm": 0.357421875,
"learning_rate": 0.0009781557589362804,
"loss": 2.6818,
"step": 529
},
{
"epoch": 0.02380644326562298,
"grad_norm": 0.353515625,
"learning_rate": 0.0009771628388879296,
"loss": 2.6625,
"step": 552
},
{
"epoch": 0.024798378401690604,
"grad_norm": 0.390625,
"learning_rate": 0.0009761699188395787,
"loss": 2.5967,
"step": 575
},
{
"epoch": 0.025790313537758226,
"grad_norm": 0.427734375,
"learning_rate": 0.0009751769987912278,
"loss": 2.541,
"step": 598
},
{
"epoch": 0.02678224867382585,
"grad_norm": 0.314453125,
"learning_rate": 0.0009741840787428768,
"loss": 2.5455,
"step": 621
},
{
"epoch": 0.027774183809893476,
"grad_norm": 0.302734375,
"learning_rate": 0.000973191158694526,
"loss": 2.5566,
"step": 644
},
{
"epoch": 0.028766118945961098,
"grad_norm": 0.357421875,
"learning_rate": 0.000972198238646175,
"loss": 2.4318,
"step": 667
},
{
"epoch": 0.029758054082028723,
"grad_norm": 0.3125,
"learning_rate": 0.0009712053185978243,
"loss": 2.4449,
"step": 690
},
{
"epoch": 0.03074998921809635,
"grad_norm": 0.302734375,
"learning_rate": 0.0009702123985494733,
"loss": 2.4232,
"step": 713
},
{
"epoch": 0.031741924354163974,
"grad_norm": 0.53125,
"learning_rate": 0.0009692194785011224,
"loss": 2.3352,
"step": 736
},
{
"epoch": 0.032733859490231595,
"grad_norm": 0.30859375,
"learning_rate": 0.0009682265584527715,
"loss": 2.332,
"step": 759
},
{
"epoch": 0.03372579462629922,
"grad_norm": 0.42578125,
"learning_rate": 0.0009672336384044207,
"loss": 2.3185,
"step": 782
},
{
"epoch": 0.034717729762366846,
"grad_norm": 0.314453125,
"learning_rate": 0.0009662407183560698,
"loss": 2.2945,
"step": 805
},
{
"epoch": 0.03570966489843447,
"grad_norm": 0.3203125,
"learning_rate": 0.0009652477983077189,
"loss": 2.2821,
"step": 828
},
{
"epoch": 0.03670160003450209,
"grad_norm": 0.265625,
"learning_rate": 0.000964254878259368,
"loss": 2.2184,
"step": 851
},
{
"epoch": 0.03769353517056972,
"grad_norm": 0.40234375,
"learning_rate": 0.0009632619582110171,
"loss": 2.2383,
"step": 874
},
{
"epoch": 0.03868547030663734,
"grad_norm": 0.30859375,
"learning_rate": 0.0009622690381626662,
"loss": 2.1755,
"step": 897
},
{
"epoch": 0.03967740544270496,
"grad_norm": 0.3203125,
"learning_rate": 0.0009612761181143154,
"loss": 2.1569,
"step": 920
},
{
"epoch": 0.04066934057877259,
"grad_norm": 0.314453125,
"learning_rate": 0.0009602831980659644,
"loss": 2.1272,
"step": 943
},
{
"epoch": 0.04166127571484021,
"grad_norm": 0.54296875,
"learning_rate": 0.0009592902780176136,
"loss": 2.1358,
"step": 966
},
{
"epoch": 0.042653210850907834,
"grad_norm": 0.279296875,
"learning_rate": 0.0009582973579692626,
"loss": 2.1388,
"step": 989
},
{
"epoch": 0.04364514598697546,
"grad_norm": 0.376953125,
"learning_rate": 0.0009573044379209118,
"loss": 2.0937,
"step": 1012
},
{
"epoch": 0.044637081123043085,
"grad_norm": 0.4140625,
"learning_rate": 0.0009563115178725609,
"loss": 2.0818,
"step": 1035
},
{
"epoch": 0.045629016259110707,
"grad_norm": 0.384765625,
"learning_rate": 0.0009553185978242101,
"loss": 2.0259,
"step": 1058
},
{
"epoch": 0.046620951395178335,
"grad_norm": 0.3359375,
"learning_rate": 0.0009543256777758591,
"loss": 2.0098,
"step": 1081
},
{
"epoch": 0.04761288653124596,
"grad_norm": 0.30078125,
"learning_rate": 0.0009533327577275082,
"loss": 2.0276,
"step": 1104
},
{
"epoch": 0.04860482166731358,
"grad_norm": 0.3046875,
"learning_rate": 0.0009523398376791573,
"loss": 2.0285,
"step": 1127
},
{
"epoch": 0.04959675680338121,
"grad_norm": 0.306640625,
"learning_rate": 0.0009513469176308065,
"loss": 1.9488,
"step": 1150
},
{
"epoch": 0.05058869193944883,
"grad_norm": 0.32421875,
"learning_rate": 0.0009503539975824556,
"loss": 1.9922,
"step": 1173
},
{
"epoch": 0.05158062707551645,
"grad_norm": 0.267578125,
"learning_rate": 0.0009493610775341046,
"loss": 1.9759,
"step": 1196
},
{
"epoch": 0.05257256221158408,
"grad_norm": 0.41015625,
"learning_rate": 0.0009483681574857538,
"loss": 1.93,
"step": 1219
},
{
"epoch": 0.0535644973476517,
"grad_norm": 0.36328125,
"learning_rate": 0.0009473752374374028,
"loss": 1.9264,
"step": 1242
},
{
"epoch": 0.05455643248371932,
"grad_norm": 0.25390625,
"learning_rate": 0.0009463823173890521,
"loss": 1.8782,
"step": 1265
},
{
"epoch": 0.05554836761978695,
"grad_norm": 0.51171875,
"learning_rate": 0.0009453893973407011,
"loss": 1.9079,
"step": 1288
},
{
"epoch": 0.056540302755854574,
"grad_norm": 0.27734375,
"learning_rate": 0.0009443964772923502,
"loss": 1.9279,
"step": 1311
},
{
"epoch": 0.057532237891922196,
"grad_norm": 0.291015625,
"learning_rate": 0.0009434035572439993,
"loss": 1.8501,
"step": 1334
},
{
"epoch": 0.058524173027989825,
"grad_norm": 0.484375,
"learning_rate": 0.0009424106371956484,
"loss": 1.8732,
"step": 1357
},
{
"epoch": 0.059516108164057446,
"grad_norm": 0.27734375,
"learning_rate": 0.0009414177171472975,
"loss": 1.8397,
"step": 1380
},
{
"epoch": 0.06050804330012507,
"grad_norm": 0.361328125,
"learning_rate": 0.0009404247970989467,
"loss": 1.8288,
"step": 1403
},
{
"epoch": 0.0614999784361927,
"grad_norm": 0.298828125,
"learning_rate": 0.0009394318770505958,
"loss": 1.8162,
"step": 1426
},
{
"epoch": 0.06249191357226032,
"grad_norm": 0.27734375,
"learning_rate": 0.0009384389570022449,
"loss": 1.8204,
"step": 1449
},
{
"epoch": 0.06348384870832795,
"grad_norm": 0.248046875,
"learning_rate": 0.0009374460369538939,
"loss": 1.8123,
"step": 1472
},
{
"epoch": 0.06447578384439556,
"grad_norm": 0.275390625,
"learning_rate": 0.0009364531169055431,
"loss": 1.8225,
"step": 1495
},
{
"epoch": 0.06546771898046319,
"grad_norm": 0.349609375,
"learning_rate": 0.0009354601968571922,
"loss": 1.7787,
"step": 1518
},
{
"epoch": 0.06645965411653082,
"grad_norm": 0.3671875,
"learning_rate": 0.0009344672768088414,
"loss": 1.7703,
"step": 1541
},
{
"epoch": 0.06745158925259843,
"grad_norm": 0.2578125,
"learning_rate": 0.0009334743567604904,
"loss": 1.7955,
"step": 1564
},
{
"epoch": 0.06844352438866606,
"grad_norm": 0.392578125,
"learning_rate": 0.0009324814367121396,
"loss": 1.7675,
"step": 1587
},
{
"epoch": 0.06943545952473369,
"grad_norm": 0.31640625,
"learning_rate": 0.0009314885166637886,
"loss": 1.7548,
"step": 1610
},
{
"epoch": 0.0704273946608013,
"grad_norm": 0.255859375,
"learning_rate": 0.0009304955966154378,
"loss": 1.7549,
"step": 1633
},
{
"epoch": 0.07141932979686894,
"grad_norm": 0.2578125,
"learning_rate": 0.0009295026765670869,
"loss": 1.7057,
"step": 1656
},
{
"epoch": 0.07241126493293656,
"grad_norm": 0.3125,
"learning_rate": 0.000928509756518736,
"loss": 1.7128,
"step": 1679
},
{
"epoch": 0.07340320006900418,
"grad_norm": 0.267578125,
"learning_rate": 0.0009275168364703851,
"loss": 1.7606,
"step": 1702
},
{
"epoch": 0.07439513520507181,
"grad_norm": 0.271484375,
"learning_rate": 0.0009265239164220342,
"loss": 1.6989,
"step": 1725
},
{
"epoch": 0.07538707034113944,
"grad_norm": 0.302734375,
"learning_rate": 0.0009255309963736834,
"loss": 1.7052,
"step": 1748
},
{
"epoch": 0.07637900547720705,
"grad_norm": 0.28515625,
"learning_rate": 0.0009245380763253324,
"loss": 1.7021,
"step": 1771
},
{
"epoch": 0.07737094061327468,
"grad_norm": 0.2314453125,
"learning_rate": 0.0009235451562769816,
"loss": 1.7096,
"step": 1794
},
{
"epoch": 0.07836287574934231,
"grad_norm": 0.29296875,
"learning_rate": 0.0009225522362286306,
"loss": 1.6537,
"step": 1817
},
{
"epoch": 0.07935481088540992,
"grad_norm": 0.26953125,
"learning_rate": 0.0009215593161802797,
"loss": 1.6359,
"step": 1840
},
{
"epoch": 0.08034674602147755,
"grad_norm": 0.298828125,
"learning_rate": 0.0009205663961319288,
"loss": 1.6107,
"step": 1863
},
{
"epoch": 0.08133868115754518,
"grad_norm": 0.271484375,
"learning_rate": 0.000919573476083578,
"loss": 1.6137,
"step": 1886
},
{
"epoch": 0.0823306162936128,
"grad_norm": 0.3359375,
"learning_rate": 0.0009185805560352271,
"loss": 1.6428,
"step": 1909
},
{
"epoch": 0.08332255142968042,
"grad_norm": 0.421875,
"learning_rate": 0.0009175876359868762,
"loss": 1.6146,
"step": 1932
},
{
"epoch": 0.08431448656574805,
"grad_norm": 0.283203125,
"learning_rate": 0.0009165947159385253,
"loss": 1.5996,
"step": 1955
},
{
"epoch": 0.08530642170181567,
"grad_norm": 0.3203125,
"learning_rate": 0.0009156017958901744,
"loss": 1.6153,
"step": 1978
},
{
"epoch": 0.0862983568378833,
"grad_norm": 0.283203125,
"learning_rate": 0.0009146088758418235,
"loss": 1.6178,
"step": 2001
},
{
"epoch": 0.08729029197395093,
"grad_norm": 0.3671875,
"learning_rate": 0.0009136159557934727,
"loss": 1.5988,
"step": 2024
},
{
"epoch": 0.08828222711001854,
"grad_norm": 0.279296875,
"learning_rate": 0.0009126230357451217,
"loss": 1.578,
"step": 2047
},
{
"epoch": 0.08927416224608617,
"grad_norm": 0.30859375,
"learning_rate": 0.0009116301156967709,
"loss": 1.5755,
"step": 2070
},
{
"epoch": 0.0902660973821538,
"grad_norm": 0.24609375,
"learning_rate": 0.0009106371956484199,
"loss": 1.5772,
"step": 2093
},
{
"epoch": 0.09125803251822141,
"grad_norm": 0.2734375,
"learning_rate": 0.0009096442756000692,
"loss": 1.5405,
"step": 2116
},
{
"epoch": 0.09224996765428904,
"grad_norm": 0.333984375,
"learning_rate": 0.0009086513555517182,
"loss": 1.5458,
"step": 2139
},
{
"epoch": 0.09324190279035667,
"grad_norm": 0.36328125,
"learning_rate": 0.0009076584355033673,
"loss": 1.5783,
"step": 2162
},
{
"epoch": 0.09423383792642429,
"grad_norm": 0.314453125,
"learning_rate": 0.0009066655154550164,
"loss": 1.5237,
"step": 2185
},
{
"epoch": 0.09522577306249191,
"grad_norm": 0.302734375,
"learning_rate": 0.0009056725954066655,
"loss": 1.5361,
"step": 2208
},
{
"epoch": 0.09621770819855954,
"grad_norm": 0.28125,
"learning_rate": 0.0009046796753583147,
"loss": 1.5493,
"step": 2231
},
{
"epoch": 0.09720964333462716,
"grad_norm": 0.369140625,
"learning_rate": 0.0009036867553099638,
"loss": 1.5243,
"step": 2254
},
{
"epoch": 0.09820157847069479,
"grad_norm": 0.41796875,
"learning_rate": 0.0009026938352616129,
"loss": 1.5061,
"step": 2277
},
{
"epoch": 0.09919351360676242,
"grad_norm": 0.3203125,
"learning_rate": 0.000901700915213262,
"loss": 1.5141,
"step": 2300
},
{
"epoch": 0.09996981066977186,
"eval_runtime": 163.8773,
"eval_samples_per_second": 610.212,
"eval_steps_per_second": 7.628,
"step": 2318
},
{
"epoch": 0.10018544874283003,
"grad_norm": 0.27734375,
"learning_rate": 0.0009007079951649111,
"loss": 1.5069,
"step": 2323
},
{
"epoch": 0.10117738387889766,
"grad_norm": 0.310546875,
"learning_rate": 0.0008997150751165601,
"loss": 1.513,
"step": 2346
},
{
"epoch": 0.10216931901496529,
"grad_norm": 0.33203125,
"learning_rate": 0.0008987221550682093,
"loss": 1.5015,
"step": 2369
},
{
"epoch": 0.1031612541510329,
"grad_norm": 0.3359375,
"learning_rate": 0.0008977292350198584,
"loss": 1.4961,
"step": 2392
},
{
"epoch": 0.10415318928710053,
"grad_norm": 0.271484375,
"learning_rate": 0.0008967363149715075,
"loss": 1.4802,
"step": 2415
},
{
"epoch": 0.10514512442316816,
"grad_norm": 0.369140625,
"learning_rate": 0.0008957433949231566,
"loss": 1.4725,
"step": 2438
},
{
"epoch": 0.10613705955923577,
"grad_norm": 0.23828125,
"learning_rate": 0.0008947504748748057,
"loss": 1.4698,
"step": 2461
},
{
"epoch": 0.1071289946953034,
"grad_norm": 0.357421875,
"learning_rate": 0.0008937575548264549,
"loss": 1.4954,
"step": 2484
},
{
"epoch": 0.10812092983137103,
"grad_norm": 0.43359375,
"learning_rate": 0.000892764634778104,
"loss": 1.4221,
"step": 2507
},
{
"epoch": 0.10911286496743865,
"grad_norm": 0.376953125,
"learning_rate": 0.000891771714729753,
"loss": 1.4364,
"step": 2530
},
{
"epoch": 0.11010480010350628,
"grad_norm": 0.29296875,
"learning_rate": 0.0008907787946814022,
"loss": 1.4479,
"step": 2553
},
{
"epoch": 0.1110967352395739,
"grad_norm": 0.341796875,
"learning_rate": 0.0008897858746330512,
"loss": 1.4662,
"step": 2576
},
{
"epoch": 0.11208867037564152,
"grad_norm": 0.31640625,
"learning_rate": 0.0008887929545847005,
"loss": 1.4463,
"step": 2599
},
{
"epoch": 0.11308060551170915,
"grad_norm": 0.322265625,
"learning_rate": 0.0008878000345363495,
"loss": 1.4342,
"step": 2622
},
{
"epoch": 0.11407254064777678,
"grad_norm": 0.283203125,
"learning_rate": 0.0008868071144879987,
"loss": 1.4457,
"step": 2645
},
{
"epoch": 0.11506447578384439,
"grad_norm": 0.34765625,
"learning_rate": 0.0008858141944396477,
"loss": 1.4413,
"step": 2668
},
{
"epoch": 0.11605641091991202,
"grad_norm": 0.35546875,
"learning_rate": 0.000884821274391297,
"loss": 1.4272,
"step": 2691
},
{
"epoch": 0.11704834605597965,
"grad_norm": 0.31640625,
"learning_rate": 0.000883828354342946,
"loss": 1.4108,
"step": 2714
},
{
"epoch": 0.11804028119204726,
"grad_norm": 0.3359375,
"learning_rate": 0.0008828354342945951,
"loss": 1.4226,
"step": 2737
},
{
"epoch": 0.11903221632811489,
"grad_norm": 0.287109375,
"learning_rate": 0.0008818425142462442,
"loss": 1.4096,
"step": 2760
},
{
"epoch": 0.12002415146418252,
"grad_norm": 0.3125,
"learning_rate": 0.0008808495941978933,
"loss": 1.3829,
"step": 2783
},
{
"epoch": 0.12101608660025014,
"grad_norm": 0.396484375,
"learning_rate": 0.0008798566741495424,
"loss": 1.3841,
"step": 2806
},
{
"epoch": 0.12200802173631776,
"grad_norm": 0.27734375,
"learning_rate": 0.0008788637541011916,
"loss": 1.4068,
"step": 2829
},
{
"epoch": 0.1229999568723854,
"grad_norm": 0.380859375,
"learning_rate": 0.0008778708340528407,
"loss": 1.3872,
"step": 2852
},
{
"epoch": 0.12399189200845301,
"grad_norm": 0.2890625,
"learning_rate": 0.0008768779140044898,
"loss": 1.3999,
"step": 2875
},
{
"epoch": 0.12498382714452064,
"grad_norm": 0.3359375,
"learning_rate": 0.0008758849939561388,
"loss": 1.3889,
"step": 2898
},
{
"epoch": 0.12597576228058827,
"grad_norm": 0.271484375,
"learning_rate": 0.0008748920739077879,
"loss": 1.3763,
"step": 2921
},
{
"epoch": 0.1269676974166559,
"grad_norm": 0.328125,
"learning_rate": 0.000873899153859437,
"loss": 1.39,
"step": 2944
},
{
"epoch": 0.1279596325527235,
"grad_norm": 0.3203125,
"learning_rate": 0.0008729062338110862,
"loss": 1.3858,
"step": 2967
},
{
"epoch": 0.12895156768879112,
"grad_norm": 0.2734375,
"learning_rate": 0.0008719133137627353,
"loss": 1.3674,
"step": 2990
},
{
"epoch": 0.12994350282485875,
"grad_norm": 0.30078125,
"learning_rate": 0.0008709203937143844,
"loss": 1.3458,
"step": 3013
},
{
"epoch": 0.13093543796092638,
"grad_norm": 0.349609375,
"learning_rate": 0.0008699274736660335,
"loss": 1.3549,
"step": 3036
},
{
"epoch": 0.131927373096994,
"grad_norm": 0.2890625,
"learning_rate": 0.0008689345536176825,
"loss": 1.3683,
"step": 3059
},
{
"epoch": 0.13291930823306164,
"grad_norm": 0.298828125,
"learning_rate": 0.0008679416335693318,
"loss": 1.3625,
"step": 3082
},
{
"epoch": 0.13391124336912924,
"grad_norm": 0.318359375,
"learning_rate": 0.0008669487135209808,
"loss": 1.3462,
"step": 3105
},
{
"epoch": 0.13490317850519687,
"grad_norm": 0.34765625,
"learning_rate": 0.00086595579347263,
"loss": 1.345,
"step": 3128
},
{
"epoch": 0.1358951136412645,
"grad_norm": 0.259765625,
"learning_rate": 0.000864962873424279,
"loss": 1.3552,
"step": 3151
},
{
"epoch": 0.13688704877733213,
"grad_norm": 0.240234375,
"learning_rate": 0.0008639699533759282,
"loss": 1.3356,
"step": 3174
},
{
"epoch": 0.13787898391339976,
"grad_norm": 0.255859375,
"learning_rate": 0.0008629770333275773,
"loss": 1.3373,
"step": 3197
},
{
"epoch": 0.13887091904946738,
"grad_norm": 0.35546875,
"learning_rate": 0.0008619841132792265,
"loss": 1.3514,
"step": 3220
},
{
"epoch": 0.13986285418553498,
"grad_norm": 0.310546875,
"learning_rate": 0.0008609911932308755,
"loss": 1.3291,
"step": 3243
},
{
"epoch": 0.1408547893216026,
"grad_norm": 0.36328125,
"learning_rate": 0.0008599982731825246,
"loss": 1.3233,
"step": 3266
},
{
"epoch": 0.14184672445767024,
"grad_norm": 0.38671875,
"learning_rate": 0.0008590053531341737,
"loss": 1.3174,
"step": 3289
},
{
"epoch": 0.14283865959373787,
"grad_norm": 0.25,
"learning_rate": 0.0008580124330858229,
"loss": 1.3304,
"step": 3312
},
{
"epoch": 0.1438305947298055,
"grad_norm": 0.3125,
"learning_rate": 0.000857019513037472,
"loss": 1.3287,
"step": 3335
},
{
"epoch": 0.14482252986587313,
"grad_norm": 0.34375,
"learning_rate": 0.0008560265929891211,
"loss": 1.3201,
"step": 3358
},
{
"epoch": 0.14581446500194073,
"grad_norm": 0.330078125,
"learning_rate": 0.0008550336729407702,
"loss": 1.3027,
"step": 3381
},
{
"epoch": 0.14680640013800836,
"grad_norm": 0.34765625,
"learning_rate": 0.0008540407528924193,
"loss": 1.3128,
"step": 3404
},
{
"epoch": 0.147798335274076,
"grad_norm": 0.3828125,
"learning_rate": 0.0008530478328440683,
"loss": 1.3182,
"step": 3427
},
{
"epoch": 0.14879027041014362,
"grad_norm": 0.287109375,
"learning_rate": 0.0008520549127957176,
"loss": 1.2906,
"step": 3450
},
{
"epoch": 0.14978220554621124,
"grad_norm": 0.271484375,
"learning_rate": 0.0008510619927473666,
"loss": 1.2837,
"step": 3473
},
{
"epoch": 0.15077414068227887,
"grad_norm": 0.365234375,
"learning_rate": 0.0008500690726990157,
"loss": 1.3114,
"step": 3496
},
{
"epoch": 0.15176607581834647,
"grad_norm": 0.365234375,
"learning_rate": 0.0008490761526506648,
"loss": 1.3047,
"step": 3519
},
{
"epoch": 0.1527580109544141,
"grad_norm": 0.322265625,
"learning_rate": 0.000848083232602314,
"loss": 1.2654,
"step": 3542
},
{
"epoch": 0.15374994609048173,
"grad_norm": 0.302734375,
"learning_rate": 0.0008470903125539631,
"loss": 1.2675,
"step": 3565
},
{
"epoch": 0.15474188122654936,
"grad_norm": 0.3671875,
"learning_rate": 0.0008460973925056121,
"loss": 1.2671,
"step": 3588
},
{
"epoch": 0.155733816362617,
"grad_norm": 0.26953125,
"learning_rate": 0.0008451044724572613,
"loss": 1.2719,
"step": 3611
},
{
"epoch": 0.15672575149868462,
"grad_norm": 0.29296875,
"learning_rate": 0.0008441115524089103,
"loss": 1.2755,
"step": 3634
},
{
"epoch": 0.15771768663475222,
"grad_norm": 0.318359375,
"learning_rate": 0.0008431186323605595,
"loss": 1.2876,
"step": 3657
},
{
"epoch": 0.15870962177081985,
"grad_norm": 0.35546875,
"learning_rate": 0.0008421257123122086,
"loss": 1.2928,
"step": 3680
},
{
"epoch": 0.15970155690688748,
"grad_norm": 0.3046875,
"learning_rate": 0.0008411327922638578,
"loss": 1.2855,
"step": 3703
},
{
"epoch": 0.1606934920429551,
"grad_norm": 0.341796875,
"learning_rate": 0.0008401398722155068,
"loss": 1.2558,
"step": 3726
},
{
"epoch": 0.16168542717902273,
"grad_norm": 0.34375,
"learning_rate": 0.000839146952167156,
"loss": 1.2561,
"step": 3749
},
{
"epoch": 0.16267736231509036,
"grad_norm": 0.361328125,
"learning_rate": 0.000838154032118805,
"loss": 1.2851,
"step": 3772
},
{
"epoch": 0.16366929745115796,
"grad_norm": 0.3359375,
"learning_rate": 0.0008371611120704542,
"loss": 1.2682,
"step": 3795
},
{
"epoch": 0.1646612325872256,
"grad_norm": 0.33203125,
"learning_rate": 0.0008361681920221033,
"loss": 1.2709,
"step": 3818
},
{
"epoch": 0.16565316772329322,
"grad_norm": 0.302734375,
"learning_rate": 0.0008351752719737524,
"loss": 1.2459,
"step": 3841
},
{
"epoch": 0.16664510285936085,
"grad_norm": 0.3203125,
"learning_rate": 0.0008341823519254015,
"loss": 1.2134,
"step": 3864
},
{
"epoch": 0.16763703799542848,
"grad_norm": 0.365234375,
"learning_rate": 0.0008331894318770506,
"loss": 1.2445,
"step": 3887
},
{
"epoch": 0.1686289731314961,
"grad_norm": 0.306640625,
"learning_rate": 0.0008321965118286998,
"loss": 1.2522,
"step": 3910
},
{
"epoch": 0.1696209082675637,
"grad_norm": 0.314453125,
"learning_rate": 0.0008312035917803489,
"loss": 1.2296,
"step": 3933
},
{
"epoch": 0.17061284340363134,
"grad_norm": 0.33203125,
"learning_rate": 0.0008302106717319979,
"loss": 1.252,
"step": 3956
},
{
"epoch": 0.17160477853969897,
"grad_norm": 0.29296875,
"learning_rate": 0.0008292177516836471,
"loss": 1.2501,
"step": 3979
},
{
"epoch": 0.1725967136757666,
"grad_norm": 0.337890625,
"learning_rate": 0.0008282248316352961,
"loss": 1.2328,
"step": 4002
},
{
"epoch": 0.17358864881183422,
"grad_norm": 0.306640625,
"learning_rate": 0.0008272319115869454,
"loss": 1.2182,
"step": 4025
},
{
"epoch": 0.17458058394790185,
"grad_norm": 0.400390625,
"learning_rate": 0.0008262389915385944,
"loss": 1.2515,
"step": 4048
},
{
"epoch": 0.17557251908396945,
"grad_norm": 0.32421875,
"learning_rate": 0.0008252460714902435,
"loss": 1.2076,
"step": 4071
},
{
"epoch": 0.17656445422003708,
"grad_norm": 0.322265625,
"learning_rate": 0.0008242531514418926,
"loss": 1.2219,
"step": 4094
},
{
"epoch": 0.1775563893561047,
"grad_norm": 0.2734375,
"learning_rate": 0.0008232602313935416,
"loss": 1.2307,
"step": 4117
},
{
"epoch": 0.17854832449217234,
"grad_norm": 0.41015625,
"learning_rate": 0.0008222673113451908,
"loss": 1.2368,
"step": 4140
},
{
"epoch": 0.17954025962823997,
"grad_norm": 0.322265625,
"learning_rate": 0.0008212743912968399,
"loss": 1.2332,
"step": 4163
},
{
"epoch": 0.1805321947643076,
"grad_norm": 0.2734375,
"learning_rate": 0.0008202814712484891,
"loss": 1.2086,
"step": 4186
},
{
"epoch": 0.1815241299003752,
"grad_norm": 0.4375,
"learning_rate": 0.0008192885512001381,
"loss": 1.2204,
"step": 4209
},
{
"epoch": 0.18251606503644283,
"grad_norm": 0.365234375,
"learning_rate": 0.0008182956311517873,
"loss": 1.2379,
"step": 4232
},
{
"epoch": 0.18350800017251045,
"grad_norm": 0.294921875,
"learning_rate": 0.0008173027111034363,
"loss": 1.2387,
"step": 4255
},
{
"epoch": 0.18449993530857808,
"grad_norm": 0.28125,
"learning_rate": 0.0008163097910550856,
"loss": 1.2008,
"step": 4278
},
{
"epoch": 0.1854918704446457,
"grad_norm": 0.28515625,
"learning_rate": 0.0008153168710067346,
"loss": 1.2191,
"step": 4301
},
{
"epoch": 0.18648380558071334,
"grad_norm": 0.30859375,
"learning_rate": 0.0008143239509583837,
"loss": 1.2153,
"step": 4324
},
{
"epoch": 0.18747574071678094,
"grad_norm": 0.404296875,
"learning_rate": 0.0008133310309100328,
"loss": 1.2171,
"step": 4347
},
{
"epoch": 0.18846767585284857,
"grad_norm": 0.298828125,
"learning_rate": 0.0008123381108616819,
"loss": 1.2154,
"step": 4370
},
{
"epoch": 0.1894596109889162,
"grad_norm": 0.353515625,
"learning_rate": 0.0008113451908133311,
"loss": 1.2055,
"step": 4393
},
{
"epoch": 0.19045154612498383,
"grad_norm": 0.337890625,
"learning_rate": 0.0008103522707649802,
"loss": 1.2215,
"step": 4416
},
{
"epoch": 0.19144348126105146,
"grad_norm": 0.3203125,
"learning_rate": 0.0008093593507166293,
"loss": 1.2035,
"step": 4439
},
{
"epoch": 0.19243541639711909,
"grad_norm": 0.326171875,
"learning_rate": 0.0008083664306682784,
"loss": 1.183,
"step": 4462
},
{
"epoch": 0.1934273515331867,
"grad_norm": 0.34765625,
"learning_rate": 0.0008073735106199274,
"loss": 1.1892,
"step": 4485
},
{
"epoch": 0.19441928666925432,
"grad_norm": 0.337890625,
"learning_rate": 0.0008063805905715767,
"loss": 1.2006,
"step": 4508
},
{
"epoch": 0.19541122180532194,
"grad_norm": 0.41796875,
"learning_rate": 0.0008053876705232257,
"loss": 1.2081,
"step": 4531
},
{
"epoch": 0.19640315694138957,
"grad_norm": 0.357421875,
"learning_rate": 0.0008043947504748749,
"loss": 1.1878,
"step": 4554
},
{
"epoch": 0.1973950920774572,
"grad_norm": 0.3125,
"learning_rate": 0.0008034018304265239,
"loss": 1.1701,
"step": 4577
},
{
"epoch": 0.19838702721352483,
"grad_norm": 0.337890625,
"learning_rate": 0.0008024089103781731,
"loss": 1.179,
"step": 4600
},
{
"epoch": 0.19937896234959243,
"grad_norm": 0.361328125,
"learning_rate": 0.0008014159903298221,
"loss": 1.1794,
"step": 4623
},
{
"epoch": 0.1999396213395437,
"eval_runtime": 164.1241,
"eval_samples_per_second": 609.295,
"eval_steps_per_second": 7.616,
"step": 4636
},
{
"epoch": 0.20037089748566006,
"grad_norm": 0.357421875,
"learning_rate": 0.0008004230702814712,
"loss": 1.181,
"step": 4646
},
{
"epoch": 0.2013628326217277,
"grad_norm": 0.369140625,
"learning_rate": 0.0007994301502331204,
"loss": 1.1706,
"step": 4669
},
{
"epoch": 0.20235476775779532,
"grad_norm": 0.30078125,
"learning_rate": 0.0007984372301847694,
"loss": 1.2005,
"step": 4692
},
{
"epoch": 0.20334670289386295,
"grad_norm": 0.34375,
"learning_rate": 0.0007974443101364186,
"loss": 1.1784,
"step": 4715
},
{
"epoch": 0.20433863802993057,
"grad_norm": 0.310546875,
"learning_rate": 0.0007964513900880676,
"loss": 1.1774,
"step": 4738
},
{
"epoch": 0.20533057316599818,
"grad_norm": 0.26953125,
"learning_rate": 0.0007954584700397169,
"loss": 1.1589,
"step": 4761
},
{
"epoch": 0.2063225083020658,
"grad_norm": 0.345703125,
"learning_rate": 0.0007944655499913659,
"loss": 1.1821,
"step": 4784
},
{
"epoch": 0.20731444343813343,
"grad_norm": 0.287109375,
"learning_rate": 0.0007934726299430151,
"loss": 1.1496,
"step": 4807
},
{
"epoch": 0.20830637857420106,
"grad_norm": 0.3671875,
"learning_rate": 0.0007924797098946641,
"loss": 1.1666,
"step": 4830
},
{
"epoch": 0.2092983137102687,
"grad_norm": 0.3046875,
"learning_rate": 0.0007914867898463132,
"loss": 1.1546,
"step": 4853
},
{
"epoch": 0.21029024884633632,
"grad_norm": 0.361328125,
"learning_rate": 0.0007904938697979624,
"loss": 1.2045,
"step": 4876
},
{
"epoch": 0.21128218398240392,
"grad_norm": 0.56640625,
"learning_rate": 0.0007895009497496115,
"loss": 1.1757,
"step": 4899
},
{
"epoch": 0.21227411911847155,
"grad_norm": 0.36328125,
"learning_rate": 0.0007885080297012606,
"loss": 1.1894,
"step": 4922
},
{
"epoch": 0.21326605425453918,
"grad_norm": 0.3046875,
"learning_rate": 0.0007875151096529097,
"loss": 1.174,
"step": 4945
},
{
"epoch": 0.2142579893906068,
"grad_norm": 0.322265625,
"learning_rate": 0.0007865221896045588,
"loss": 1.1423,
"step": 4968
},
{
"epoch": 0.21524992452667444,
"grad_norm": 0.421875,
"learning_rate": 0.000785529269556208,
"loss": 1.1532,
"step": 4991
},
{
"epoch": 0.21624185966274206,
"grad_norm": 0.3203125,
"learning_rate": 0.000784536349507857,
"loss": 1.1688,
"step": 5014
},
{
"epoch": 0.21723379479880967,
"grad_norm": 0.2734375,
"learning_rate": 0.0007835434294595062,
"loss": 1.1446,
"step": 5037
},
{
"epoch": 0.2182257299348773,
"grad_norm": 0.365234375,
"learning_rate": 0.0007825505094111552,
"loss": 1.1792,
"step": 5060
},
{
"epoch": 0.21921766507094492,
"grad_norm": 0.33203125,
"learning_rate": 0.0007815575893628044,
"loss": 1.1516,
"step": 5083
},
{
"epoch": 0.22020960020701255,
"grad_norm": 0.296875,
"learning_rate": 0.0007805646693144534,
"loss": 1.1712,
"step": 5106
},
{
"epoch": 0.22120153534308018,
"grad_norm": 0.341796875,
"learning_rate": 0.0007795717492661027,
"loss": 1.1562,
"step": 5129
},
{
"epoch": 0.2221934704791478,
"grad_norm": 0.470703125,
"learning_rate": 0.0007785788292177517,
"loss": 1.1641,
"step": 5152
},
{
"epoch": 0.2231854056152154,
"grad_norm": 0.4765625,
"learning_rate": 0.0007775859091694009,
"loss": 1.1649,
"step": 5175
},
{
"epoch": 0.22417734075128304,
"grad_norm": 0.333984375,
"learning_rate": 0.0007765929891210499,
"loss": 1.1684,
"step": 5198
},
{
"epoch": 0.22516927588735067,
"grad_norm": 0.33984375,
"learning_rate": 0.0007756000690726989,
"loss": 1.1659,
"step": 5221
},
{
"epoch": 0.2261612110234183,
"grad_norm": 0.419921875,
"learning_rate": 0.0007746071490243482,
"loss": 1.1683,
"step": 5244
},
{
"epoch": 0.22715314615948592,
"grad_norm": 0.33203125,
"learning_rate": 0.0007736142289759972,
"loss": 1.1536,
"step": 5267
},
{
"epoch": 0.22814508129555355,
"grad_norm": 0.3203125,
"learning_rate": 0.0007726213089276464,
"loss": 1.1451,
"step": 5290
},
{
"epoch": 0.22913701643162115,
"grad_norm": 0.35546875,
"learning_rate": 0.0007716283888792954,
"loss": 1.1453,
"step": 5313
},
{
"epoch": 0.23012895156768878,
"grad_norm": 0.298828125,
"learning_rate": 0.0007706354688309446,
"loss": 1.1585,
"step": 5336
},
{
"epoch": 0.2311208867037564,
"grad_norm": 0.3828125,
"learning_rate": 0.0007696425487825937,
"loss": 1.1585,
"step": 5359
},
{
"epoch": 0.23211282183982404,
"grad_norm": 0.298828125,
"learning_rate": 0.0007686496287342428,
"loss": 1.1396,
"step": 5382
},
{
"epoch": 0.23310475697589167,
"grad_norm": 0.35546875,
"learning_rate": 0.0007676567086858919,
"loss": 1.1527,
"step": 5405
},
{
"epoch": 0.2340966921119593,
"grad_norm": 0.310546875,
"learning_rate": 0.000766663788637541,
"loss": 1.1157,
"step": 5428
},
{
"epoch": 0.2350886272480269,
"grad_norm": 0.31640625,
"learning_rate": 0.0007656708685891901,
"loss": 1.1232,
"step": 5451
},
{
"epoch": 0.23608056238409453,
"grad_norm": 0.3359375,
"learning_rate": 0.0007646779485408393,
"loss": 1.1411,
"step": 5474
},
{
"epoch": 0.23707249752016216,
"grad_norm": 0.314453125,
"learning_rate": 0.0007636850284924884,
"loss": 1.1329,
"step": 5497
},
{
"epoch": 0.23806443265622979,
"grad_norm": 0.314453125,
"learning_rate": 0.0007626921084441375,
"loss": 1.1262,
"step": 5520
},
{
"epoch": 0.23905636779229741,
"grad_norm": 0.3359375,
"learning_rate": 0.0007616991883957865,
"loss": 1.1346,
"step": 5543
},
{
"epoch": 0.24004830292836504,
"grad_norm": 0.306640625,
"learning_rate": 0.0007607062683474357,
"loss": 1.139,
"step": 5566
},
{
"epoch": 0.24104023806443264,
"grad_norm": 0.396484375,
"learning_rate": 0.0007597133482990847,
"loss": 1.1318,
"step": 5589
},
{
"epoch": 0.24203217320050027,
"grad_norm": 0.35546875,
"learning_rate": 0.000758720428250734,
"loss": 1.1178,
"step": 5612
},
{
"epoch": 0.2430241083365679,
"grad_norm": 0.357421875,
"learning_rate": 0.000757727508202383,
"loss": 1.1312,
"step": 5635
},
{
"epoch": 0.24401604347263553,
"grad_norm": 0.373046875,
"learning_rate": 0.0007567345881540322,
"loss": 1.1263,
"step": 5658
},
{
"epoch": 0.24500797860870316,
"grad_norm": 0.306640625,
"learning_rate": 0.0007557416681056812,
"loss": 1.1303,
"step": 5681
},
{
"epoch": 0.2459999137447708,
"grad_norm": 0.412109375,
"learning_rate": 0.0007547487480573305,
"loss": 1.1365,
"step": 5704
},
{
"epoch": 0.2469918488808384,
"grad_norm": 0.375,
"learning_rate": 0.0007537558280089795,
"loss": 1.1239,
"step": 5727
},
{
"epoch": 0.24798378401690602,
"grad_norm": 0.33984375,
"learning_rate": 0.0007527629079606286,
"loss": 1.1212,
"step": 5750
},
{
"epoch": 0.24897571915297365,
"grad_norm": 0.396484375,
"learning_rate": 0.0007517699879122777,
"loss": 1.1128,
"step": 5773
},
{
"epoch": 0.24996765428904127,
"grad_norm": 0.345703125,
"learning_rate": 0.0007507770678639267,
"loss": 1.1153,
"step": 5796
},
{
"epoch": 0.2509595894251089,
"grad_norm": 0.337890625,
"learning_rate": 0.0007497841478155759,
"loss": 1.1191,
"step": 5819
},
{
"epoch": 0.25195152456117653,
"grad_norm": 0.357421875,
"learning_rate": 0.000748791227767225,
"loss": 1.1115,
"step": 5842
},
{
"epoch": 0.25294345969724413,
"grad_norm": 0.484375,
"learning_rate": 0.0007477983077188742,
"loss": 1.1377,
"step": 5865
},
{
"epoch": 0.2539353948333118,
"grad_norm": 0.3671875,
"learning_rate": 0.0007468053876705232,
"loss": 1.1345,
"step": 5888
},
{
"epoch": 0.2549273299693794,
"grad_norm": 0.3359375,
"learning_rate": 0.0007458124676221723,
"loss": 1.1226,
"step": 5911
},
{
"epoch": 0.255919265105447,
"grad_norm": 0.361328125,
"learning_rate": 0.0007448195475738214,
"loss": 1.1163,
"step": 5934
},
{
"epoch": 0.25691120024151465,
"grad_norm": 0.361328125,
"learning_rate": 0.0007438266275254706,
"loss": 1.1255,
"step": 5957
},
{
"epoch": 0.25790313537758225,
"grad_norm": 0.322265625,
"learning_rate": 0.0007428337074771197,
"loss": 1.1049,
"step": 5980
},
{
"epoch": 0.2588950705136499,
"grad_norm": 0.427734375,
"learning_rate": 0.0007418407874287688,
"loss": 1.1294,
"step": 6003
},
{
"epoch": 0.2598870056497175,
"grad_norm": 0.34765625,
"learning_rate": 0.0007408478673804179,
"loss": 1.1167,
"step": 6026
},
{
"epoch": 0.26087894078578516,
"grad_norm": 0.388671875,
"learning_rate": 0.000739854947332067,
"loss": 1.1041,
"step": 6049
},
{
"epoch": 0.26187087592185276,
"grad_norm": 0.302734375,
"learning_rate": 0.000738862027283716,
"loss": 1.1146,
"step": 6072
},
{
"epoch": 0.26286281105792036,
"grad_norm": 0.341796875,
"learning_rate": 0.0007378691072353653,
"loss": 1.104,
"step": 6095
},
{
"epoch": 0.263854746193988,
"grad_norm": 0.416015625,
"learning_rate": 0.0007368761871870143,
"loss": 1.1091,
"step": 6118
},
{
"epoch": 0.2648466813300556,
"grad_norm": 0.326171875,
"learning_rate": 0.0007358832671386635,
"loss": 1.1295,
"step": 6141
},
{
"epoch": 0.2658386164661233,
"grad_norm": 0.341796875,
"learning_rate": 0.0007348903470903125,
"loss": 1.0913,
"step": 6164
},
{
"epoch": 0.2668305516021909,
"grad_norm": 0.32421875,
"learning_rate": 0.0007338974270419618,
"loss": 1.0806,
"step": 6187
},
{
"epoch": 0.2678224867382585,
"grad_norm": 0.380859375,
"learning_rate": 0.0007329045069936108,
"loss": 1.099,
"step": 6210
},
{
"epoch": 0.26881442187432614,
"grad_norm": 0.453125,
"learning_rate": 0.00073191158694526,
"loss": 1.0982,
"step": 6233
},
{
"epoch": 0.26980635701039374,
"grad_norm": 0.3515625,
"learning_rate": 0.000730918666896909,
"loss": 1.093,
"step": 6256
},
{
"epoch": 0.2707982921464614,
"grad_norm": 0.3359375,
"learning_rate": 0.0007299257468485581,
"loss": 1.0773,
"step": 6279
},
{
"epoch": 0.271790227282529,
"grad_norm": 0.33984375,
"learning_rate": 0.0007289328268002072,
"loss": 1.101,
"step": 6302
},
{
"epoch": 0.27278216241859665,
"grad_norm": 0.328125,
"learning_rate": 0.0007279399067518564,
"loss": 1.1084,
"step": 6325
},
{
"epoch": 0.27377409755466425,
"grad_norm": 0.359375,
"learning_rate": 0.0007269469867035055,
"loss": 1.0869,
"step": 6348
},
{
"epoch": 0.27476603269073185,
"grad_norm": 0.3203125,
"learning_rate": 0.0007259540666551545,
"loss": 1.088,
"step": 6371
},
{
"epoch": 0.2757579678267995,
"grad_norm": 0.3984375,
"learning_rate": 0.0007249611466068037,
"loss": 1.1002,
"step": 6394
},
{
"epoch": 0.2767499029628671,
"grad_norm": 0.30859375,
"learning_rate": 0.0007239682265584527,
"loss": 1.083,
"step": 6417
},
{
"epoch": 0.27774183809893477,
"grad_norm": 0.306640625,
"learning_rate": 0.0007229753065101019,
"loss": 1.0884,
"step": 6440
},
{
"epoch": 0.27873377323500237,
"grad_norm": 0.41015625,
"learning_rate": 0.000721982386461751,
"loss": 1.092,
"step": 6463
},
{
"epoch": 0.27972570837106997,
"grad_norm": 0.3359375,
"learning_rate": 0.0007209894664134001,
"loss": 1.0736,
"step": 6486
},
{
"epoch": 0.2807176435071376,
"grad_norm": 0.353515625,
"learning_rate": 0.0007199965463650492,
"loss": 1.091,
"step": 6509
},
{
"epoch": 0.2817095786432052,
"grad_norm": 0.34765625,
"learning_rate": 0.0007190036263166983,
"loss": 1.0893,
"step": 6532
},
{
"epoch": 0.2827015137792729,
"grad_norm": 0.41015625,
"learning_rate": 0.0007180107062683475,
"loss": 1.0936,
"step": 6555
},
{
"epoch": 0.2836934489153405,
"grad_norm": 0.423828125,
"learning_rate": 0.0007170177862199966,
"loss": 1.0955,
"step": 6578
},
{
"epoch": 0.28468538405140814,
"grad_norm": 0.34765625,
"learning_rate": 0.0007160248661716456,
"loss": 1.1016,
"step": 6601
},
{
"epoch": 0.28567731918747574,
"grad_norm": 0.34765625,
"learning_rate": 0.0007150319461232948,
"loss": 1.102,
"step": 6624
},
{
"epoch": 0.28666925432354334,
"grad_norm": 0.404296875,
"learning_rate": 0.0007140390260749438,
"loss": 1.08,
"step": 6647
},
{
"epoch": 0.287661189459611,
"grad_norm": 0.44140625,
"learning_rate": 0.000713046106026593,
"loss": 1.0785,
"step": 6670
},
{
"epoch": 0.2886531245956786,
"grad_norm": 0.392578125,
"learning_rate": 0.0007120531859782421,
"loss": 1.0944,
"step": 6693
},
{
"epoch": 0.28964505973174626,
"grad_norm": 0.34375,
"learning_rate": 0.0007110602659298913,
"loss": 1.0851,
"step": 6716
},
{
"epoch": 0.29063699486781386,
"grad_norm": 0.40234375,
"learning_rate": 0.0007100673458815403,
"loss": 1.0942,
"step": 6739
},
{
"epoch": 0.29162893000388146,
"grad_norm": 0.42578125,
"learning_rate": 0.0007090744258331895,
"loss": 1.0831,
"step": 6762
},
{
"epoch": 0.2926208651399491,
"grad_norm": 0.408203125,
"learning_rate": 0.0007080815057848385,
"loss": 1.0724,
"step": 6785
},
{
"epoch": 0.2936128002760167,
"grad_norm": 0.31640625,
"learning_rate": 0.0007070885857364877,
"loss": 1.0875,
"step": 6808
},
{
"epoch": 0.2946047354120844,
"grad_norm": 0.466796875,
"learning_rate": 0.0007060956656881368,
"loss": 1.0896,
"step": 6831
},
{
"epoch": 0.295596670548152,
"grad_norm": 0.3984375,
"learning_rate": 0.0007051027456397859,
"loss": 1.0702,
"step": 6854
},
{
"epoch": 0.29658860568421963,
"grad_norm": 0.390625,
"learning_rate": 0.000704109825591435,
"loss": 1.0887,
"step": 6877
},
{
"epoch": 0.29758054082028723,
"grad_norm": 0.341796875,
"learning_rate": 0.0007031169055430841,
"loss": 1.0708,
"step": 6900
},
{
"epoch": 0.29857247595635483,
"grad_norm": 0.41796875,
"learning_rate": 0.0007021239854947333,
"loss": 1.0828,
"step": 6923
},
{
"epoch": 0.2995644110924225,
"grad_norm": 0.33984375,
"learning_rate": 0.0007011310654463823,
"loss": 1.061,
"step": 6946
},
{
"epoch": 0.2999094320093156,
"eval_runtime": 164.053,
"eval_samples_per_second": 609.559,
"eval_steps_per_second": 7.619,
"step": 6954
},
{
"epoch": 0.3005563462284901,
"grad_norm": 0.345703125,
"learning_rate": 0.0007001381453980314,
"loss": 1.0634,
"step": 6969
},
{
"epoch": 0.30154828136455775,
"grad_norm": 0.359375,
"learning_rate": 0.0006991452253496805,
"loss": 1.071,
"step": 6992
},
{
"epoch": 0.30254021650062535,
"grad_norm": 0.3828125,
"learning_rate": 0.0006981523053013296,
"loss": 1.0822,
"step": 7015
},
{
"epoch": 0.30353215163669295,
"grad_norm": 0.4140625,
"learning_rate": 0.0006971593852529788,
"loss": 1.0618,
"step": 7038
},
{
"epoch": 0.3045240867727606,
"grad_norm": 0.322265625,
"learning_rate": 0.0006961664652046279,
"loss": 1.0722,
"step": 7061
},
{
"epoch": 0.3055160219088282,
"grad_norm": 0.3359375,
"learning_rate": 0.000695173545156277,
"loss": 1.0651,
"step": 7084
},
{
"epoch": 0.30650795704489586,
"grad_norm": 0.34765625,
"learning_rate": 0.0006941806251079261,
"loss": 1.0648,
"step": 7107
},
{
"epoch": 0.30749989218096346,
"grad_norm": 0.359375,
"learning_rate": 0.0006931877050595752,
"loss": 1.0615,
"step": 7130
},
{
"epoch": 0.3084918273170311,
"grad_norm": 0.349609375,
"learning_rate": 0.0006921947850112244,
"loss": 1.0673,
"step": 7153
},
{
"epoch": 0.3094837624530987,
"grad_norm": 0.38671875,
"learning_rate": 0.0006912018649628734,
"loss": 1.0596,
"step": 7176
},
{
"epoch": 0.3104756975891663,
"grad_norm": 0.337890625,
"learning_rate": 0.0006902089449145226,
"loss": 1.0665,
"step": 7199
},
{
"epoch": 0.311467632725234,
"grad_norm": 0.388671875,
"learning_rate": 0.0006892160248661716,
"loss": 1.0784,
"step": 7222
},
{
"epoch": 0.3124595678613016,
"grad_norm": 0.33203125,
"learning_rate": 0.0006882231048178208,
"loss": 1.0737,
"step": 7245
},
{
"epoch": 0.31345150299736924,
"grad_norm": 0.326171875,
"learning_rate": 0.0006872301847694698,
"loss": 1.0647,
"step": 7268
},
{
"epoch": 0.31444343813343684,
"grad_norm": 0.33984375,
"learning_rate": 0.0006862372647211191,
"loss": 1.0859,
"step": 7291
},
{
"epoch": 0.31543537326950444,
"grad_norm": 0.40625,
"learning_rate": 0.0006852443446727681,
"loss": 1.0703,
"step": 7314
},
{
"epoch": 0.3164273084055721,
"grad_norm": 0.37890625,
"learning_rate": 0.0006842514246244172,
"loss": 1.0581,
"step": 7337
},
{
"epoch": 0.3174192435416397,
"grad_norm": 0.408203125,
"learning_rate": 0.0006832585045760663,
"loss": 1.0557,
"step": 7360
},
{
"epoch": 0.31841117867770735,
"grad_norm": 0.3203125,
"learning_rate": 0.0006822655845277154,
"loss": 1.0853,
"step": 7383
},
{
"epoch": 0.31940311381377495,
"grad_norm": 0.4296875,
"learning_rate": 0.0006812726644793646,
"loss": 1.0602,
"step": 7406
},
{
"epoch": 0.3203950489498426,
"grad_norm": 0.41015625,
"learning_rate": 0.0006802797444310137,
"loss": 1.0611,
"step": 7429
},
{
"epoch": 0.3213869840859102,
"grad_norm": 0.3203125,
"learning_rate": 0.0006792868243826628,
"loss": 1.0703,
"step": 7452
},
{
"epoch": 0.3223789192219778,
"grad_norm": 0.44140625,
"learning_rate": 0.0006782939043343119,
"loss": 1.0357,
"step": 7475
},
{
"epoch": 0.32337085435804547,
"grad_norm": 0.345703125,
"learning_rate": 0.0006773009842859609,
"loss": 1.0556,
"step": 7498
},
{
"epoch": 0.32436278949411307,
"grad_norm": 0.37109375,
"learning_rate": 0.00067630806423761,
"loss": 1.0561,
"step": 7521
},
{
"epoch": 0.3253547246301807,
"grad_norm": 0.341796875,
"learning_rate": 0.0006753151441892592,
"loss": 1.0693,
"step": 7544
},
{
"epoch": 0.3263466597662483,
"grad_norm": 0.48046875,
"learning_rate": 0.0006743222241409083,
"loss": 1.0668,
"step": 7567
},
{
"epoch": 0.3273385949023159,
"grad_norm": 0.33984375,
"learning_rate": 0.0006733293040925574,
"loss": 1.038,
"step": 7590
},
{
"epoch": 0.3283305300383836,
"grad_norm": 0.34765625,
"learning_rate": 0.0006723363840442065,
"loss": 1.056,
"step": 7613
},
{
"epoch": 0.3293224651744512,
"grad_norm": 0.376953125,
"learning_rate": 0.0006713434639958557,
"loss": 1.0444,
"step": 7636
},
{
"epoch": 0.33031440031051884,
"grad_norm": 0.34375,
"learning_rate": 0.0006703505439475048,
"loss": 1.0534,
"step": 7659
},
{
"epoch": 0.33130633544658644,
"grad_norm": 0.341796875,
"learning_rate": 0.0006693576238991539,
"loss": 1.0597,
"step": 7682
},
{
"epoch": 0.3322982705826541,
"grad_norm": 0.41796875,
"learning_rate": 0.0006683647038508029,
"loss": 1.0305,
"step": 7705
},
{
"epoch": 0.3332902057187217,
"grad_norm": 0.3515625,
"learning_rate": 0.0006673717838024521,
"loss": 1.0632,
"step": 7728
},
{
"epoch": 0.3342821408547893,
"grad_norm": 0.341796875,
"learning_rate": 0.0006663788637541011,
"loss": 1.0462,
"step": 7751
},
{
"epoch": 0.33527407599085696,
"grad_norm": 0.423828125,
"learning_rate": 0.0006653859437057504,
"loss": 1.0534,
"step": 7774
},
{
"epoch": 0.33626601112692456,
"grad_norm": 0.37109375,
"learning_rate": 0.0006643930236573994,
"loss": 1.0468,
"step": 7797
},
{
"epoch": 0.3372579462629922,
"grad_norm": 0.37890625,
"learning_rate": 0.0006634001036090486,
"loss": 1.0473,
"step": 7820
},
{
"epoch": 0.3382498813990598,
"grad_norm": 0.349609375,
"learning_rate": 0.0006624071835606976,
"loss": 1.0402,
"step": 7843
},
{
"epoch": 0.3392418165351274,
"grad_norm": 0.384765625,
"learning_rate": 0.0006614142635123467,
"loss": 1.0713,
"step": 7866
},
{
"epoch": 0.3402337516711951,
"grad_norm": 0.388671875,
"learning_rate": 0.0006604213434639959,
"loss": 1.0549,
"step": 7889
},
{
"epoch": 0.3412256868072627,
"grad_norm": 0.3359375,
"learning_rate": 0.000659428423415645,
"loss": 1.0527,
"step": 7912
},
{
"epoch": 0.34221762194333033,
"grad_norm": 0.421875,
"learning_rate": 0.0006584355033672941,
"loss": 1.0599,
"step": 7935
},
{
"epoch": 0.34320955707939793,
"grad_norm": 0.322265625,
"learning_rate": 0.0006574425833189432,
"loss": 1.0503,
"step": 7958
},
{
"epoch": 0.3442014922154656,
"grad_norm": 0.388671875,
"learning_rate": 0.0006564496632705923,
"loss": 1.0465,
"step": 7981
},
{
"epoch": 0.3451934273515332,
"grad_norm": 0.314453125,
"learning_rate": 0.0006554567432222415,
"loss": 1.0246,
"step": 8004
},
{
"epoch": 0.3461853624876008,
"grad_norm": 0.36328125,
"learning_rate": 0.0006544638231738905,
"loss": 1.0463,
"step": 8027
},
{
"epoch": 0.34717729762366845,
"grad_norm": 0.42578125,
"learning_rate": 0.0006534709031255397,
"loss": 1.0471,
"step": 8050
},
{
"epoch": 0.34816923275973605,
"grad_norm": 0.470703125,
"learning_rate": 0.0006524779830771887,
"loss": 1.0414,
"step": 8073
},
{
"epoch": 0.3491611678958037,
"grad_norm": 0.515625,
"learning_rate": 0.0006514850630288378,
"loss": 1.0431,
"step": 8096
},
{
"epoch": 0.3501531030318713,
"grad_norm": 0.37109375,
"learning_rate": 0.000650492142980487,
"loss": 1.0382,
"step": 8119
},
{
"epoch": 0.3511450381679389,
"grad_norm": 0.39453125,
"learning_rate": 0.0006494992229321361,
"loss": 1.0557,
"step": 8142
},
{
"epoch": 0.35213697330400656,
"grad_norm": 0.376953125,
"learning_rate": 0.0006485063028837852,
"loss": 1.0414,
"step": 8165
},
{
"epoch": 0.35312890844007416,
"grad_norm": 0.365234375,
"learning_rate": 0.0006475133828354343,
"loss": 1.0221,
"step": 8188
},
{
"epoch": 0.3541208435761418,
"grad_norm": 0.384765625,
"learning_rate": 0.0006465204627870834,
"loss": 1.0207,
"step": 8211
},
{
"epoch": 0.3551127787122094,
"grad_norm": 0.486328125,
"learning_rate": 0.0006455275427387324,
"loss": 1.0399,
"step": 8234
},
{
"epoch": 0.3561047138482771,
"grad_norm": 0.43359375,
"learning_rate": 0.0006445346226903817,
"loss": 1.0256,
"step": 8257
},
{
"epoch": 0.3570966489843447,
"grad_norm": 0.30859375,
"learning_rate": 0.0006435417026420307,
"loss": 1.0452,
"step": 8280
},
{
"epoch": 0.3580885841204123,
"grad_norm": 0.412109375,
"learning_rate": 0.0006425487825936799,
"loss": 1.0388,
"step": 8303
},
{
"epoch": 0.35908051925647994,
"grad_norm": 0.3828125,
"learning_rate": 0.0006415558625453289,
"loss": 1.0401,
"step": 8326
},
{
"epoch": 0.36007245439254754,
"grad_norm": 0.474609375,
"learning_rate": 0.0006405629424969782,
"loss": 1.0345,
"step": 8349
},
{
"epoch": 0.3610643895286152,
"grad_norm": 0.375,
"learning_rate": 0.0006395700224486272,
"loss": 1.0362,
"step": 8372
},
{
"epoch": 0.3620563246646828,
"grad_norm": 0.3984375,
"learning_rate": 0.0006385771024002763,
"loss": 1.045,
"step": 8395
},
{
"epoch": 0.3630482598007504,
"grad_norm": 0.357421875,
"learning_rate": 0.0006375841823519254,
"loss": 1.0417,
"step": 8418
},
{
"epoch": 0.36404019493681805,
"grad_norm": 0.306640625,
"learning_rate": 0.0006365912623035745,
"loss": 1.0388,
"step": 8441
},
{
"epoch": 0.36503213007288565,
"grad_norm": 0.34765625,
"learning_rate": 0.0006355983422552236,
"loss": 1.0303,
"step": 8464
},
{
"epoch": 0.3660240652089533,
"grad_norm": 0.3359375,
"learning_rate": 0.0006346054222068728,
"loss": 1.017,
"step": 8487
},
{
"epoch": 0.3670160003450209,
"grad_norm": 0.384765625,
"learning_rate": 0.0006336125021585219,
"loss": 1.0274,
"step": 8510
},
{
"epoch": 0.36800793548108857,
"grad_norm": 0.357421875,
"learning_rate": 0.000632619582110171,
"loss": 1.0257,
"step": 8533
},
{
"epoch": 0.36899987061715617,
"grad_norm": 0.33203125,
"learning_rate": 0.0006316266620618201,
"loss": 1.0389,
"step": 8556
},
{
"epoch": 0.36999180575322377,
"grad_norm": 0.369140625,
"learning_rate": 0.0006306337420134692,
"loss": 1.0295,
"step": 8579
},
{
"epoch": 0.3709837408892914,
"grad_norm": 0.4140625,
"learning_rate": 0.0006296408219651183,
"loss": 1.045,
"step": 8602
},
{
"epoch": 0.371975676025359,
"grad_norm": 0.392578125,
"learning_rate": 0.0006286479019167675,
"loss": 1.0581,
"step": 8625
},
{
"epoch": 0.3729676111614267,
"grad_norm": 0.44140625,
"learning_rate": 0.0006276549818684165,
"loss": 1.0521,
"step": 8648
},
{
"epoch": 0.3739595462974943,
"grad_norm": 0.36328125,
"learning_rate": 0.0006266620618200657,
"loss": 1.0355,
"step": 8671
},
{
"epoch": 0.3749514814335619,
"grad_norm": 0.349609375,
"learning_rate": 0.0006256691417717147,
"loss": 1.0366,
"step": 8694
},
{
"epoch": 0.37594341656962954,
"grad_norm": 0.349609375,
"learning_rate": 0.0006246762217233639,
"loss": 1.0148,
"step": 8717
},
{
"epoch": 0.37693535170569714,
"grad_norm": 0.421875,
"learning_rate": 0.000623683301675013,
"loss": 1.0401,
"step": 8740
},
{
"epoch": 0.3779272868417648,
"grad_norm": 0.3203125,
"learning_rate": 0.000622690381626662,
"loss": 1.0346,
"step": 8763
},
{
"epoch": 0.3789192219778324,
"grad_norm": 0.34375,
"learning_rate": 0.0006216974615783112,
"loss": 1.0218,
"step": 8786
},
{
"epoch": 0.37991115711390006,
"grad_norm": 0.427734375,
"learning_rate": 0.0006207045415299602,
"loss": 1.0405,
"step": 8809
},
{
"epoch": 0.38090309224996766,
"grad_norm": 0.392578125,
"learning_rate": 0.0006197116214816095,
"loss": 1.0253,
"step": 8832
},
{
"epoch": 0.38189502738603526,
"grad_norm": 0.345703125,
"learning_rate": 0.0006187187014332585,
"loss": 1.0409,
"step": 8855
},
{
"epoch": 0.3828869625221029,
"grad_norm": 0.349609375,
"learning_rate": 0.0006177257813849077,
"loss": 1.0349,
"step": 8878
},
{
"epoch": 0.3838788976581705,
"grad_norm": 0.49609375,
"learning_rate": 0.0006167328613365567,
"loss": 1.029,
"step": 8901
},
{
"epoch": 0.38487083279423817,
"grad_norm": 0.447265625,
"learning_rate": 0.0006157399412882058,
"loss": 1.0338,
"step": 8924
},
{
"epoch": 0.3858627679303058,
"grad_norm": 0.416015625,
"learning_rate": 0.0006147470212398549,
"loss": 1.0335,
"step": 8947
},
{
"epoch": 0.3868547030663734,
"grad_norm": 0.40234375,
"learning_rate": 0.0006137541011915041,
"loss": 1.0512,
"step": 8970
},
{
"epoch": 0.38784663820244103,
"grad_norm": 0.388671875,
"learning_rate": 0.0006127611811431532,
"loss": 1.0187,
"step": 8993
},
{
"epoch": 0.38883857333850863,
"grad_norm": 0.298828125,
"learning_rate": 0.0006117682610948023,
"loss": 1.0321,
"step": 9016
},
{
"epoch": 0.3898305084745763,
"grad_norm": 0.353515625,
"learning_rate": 0.0006107753410464514,
"loss": 1.0314,
"step": 9039
},
{
"epoch": 0.3908224436106439,
"grad_norm": 0.3984375,
"learning_rate": 0.0006097824209981005,
"loss": 1.011,
"step": 9062
},
{
"epoch": 0.39181437874671154,
"grad_norm": 0.37890625,
"learning_rate": 0.0006087895009497497,
"loss": 1.0127,
"step": 9085
},
{
"epoch": 0.39280631388277915,
"grad_norm": 0.40625,
"learning_rate": 0.0006077965809013988,
"loss": 1.0045,
"step": 9108
},
{
"epoch": 0.39379824901884675,
"grad_norm": 0.349609375,
"learning_rate": 0.0006068036608530478,
"loss": 0.9966,
"step": 9131
},
{
"epoch": 0.3947901841549144,
"grad_norm": 0.318359375,
"learning_rate": 0.000605810740804697,
"loss": 1.0235,
"step": 9154
},
{
"epoch": 0.395782119290982,
"grad_norm": 0.427734375,
"learning_rate": 0.000604817820756346,
"loss": 1.0448,
"step": 9177
},
{
"epoch": 0.39677405442704966,
"grad_norm": 0.337890625,
"learning_rate": 0.0006038249007079953,
"loss": 1.0197,
"step": 9200
},
{
"epoch": 0.39776598956311726,
"grad_norm": 0.447265625,
"learning_rate": 0.0006028319806596443,
"loss": 1.0093,
"step": 9223
},
{
"epoch": 0.39875792469918486,
"grad_norm": 0.412109375,
"learning_rate": 0.0006018390606112935,
"loss": 1.014,
"step": 9246
},
{
"epoch": 0.3997498598352525,
"grad_norm": 0.462890625,
"learning_rate": 0.0006008461405629425,
"loss": 1.0269,
"step": 9269
},
{
"epoch": 0.3998792426790874,
"eval_runtime": 163.9225,
"eval_samples_per_second": 610.045,
"eval_steps_per_second": 7.626,
"step": 9272
},
{
"epoch": 0.4007417949713201,
"grad_norm": 0.408203125,
"learning_rate": 0.0005998532205145915,
"loss": 1.0191,
"step": 9292
},
{
"epoch": 0.4017337301073878,
"grad_norm": 0.46484375,
"learning_rate": 0.0005988603004662408,
"loss": 1.0074,
"step": 9315
},
{
"epoch": 0.4027256652434554,
"grad_norm": 0.353515625,
"learning_rate": 0.0005978673804178898,
"loss": 1.0052,
"step": 9338
},
{
"epoch": 0.40371760037952303,
"grad_norm": 0.361328125,
"learning_rate": 0.000596874460369539,
"loss": 1.0248,
"step": 9361
},
{
"epoch": 0.40470953551559063,
"grad_norm": 0.376953125,
"learning_rate": 0.000595881540321188,
"loss": 1.0328,
"step": 9384
},
{
"epoch": 0.40570147065165824,
"grad_norm": 0.4453125,
"learning_rate": 0.0005948886202728372,
"loss": 1.0254,
"step": 9407
},
{
"epoch": 0.4066934057877259,
"grad_norm": 0.302734375,
"learning_rate": 0.0005938957002244862,
"loss": 1.0285,
"step": 9430
},
{
"epoch": 0.4076853409237935,
"grad_norm": 0.439453125,
"learning_rate": 0.0005929027801761354,
"loss": 1.0077,
"step": 9453
},
{
"epoch": 0.40867727605986115,
"grad_norm": 0.3515625,
"learning_rate": 0.0005919098601277845,
"loss": 1.0181,
"step": 9476
},
{
"epoch": 0.40966921119592875,
"grad_norm": 0.33203125,
"learning_rate": 0.0005909169400794336,
"loss": 1.0427,
"step": 9499
},
{
"epoch": 0.41066114633199635,
"grad_norm": 0.380859375,
"learning_rate": 0.0005899240200310827,
"loss": 1.0133,
"step": 9522
},
{
"epoch": 0.411653081468064,
"grad_norm": 0.40234375,
"learning_rate": 0.0005889310999827318,
"loss": 1.0187,
"step": 9545
},
{
"epoch": 0.4126450166041316,
"grad_norm": 0.44921875,
"learning_rate": 0.000587938179934381,
"loss": 1.0091,
"step": 9568
},
{
"epoch": 0.41363695174019927,
"grad_norm": 0.37109375,
"learning_rate": 0.0005869452598860301,
"loss": 1.002,
"step": 9591
},
{
"epoch": 0.41462888687626687,
"grad_norm": 0.47265625,
"learning_rate": 0.0005859523398376792,
"loss": 1.0321,
"step": 9614
},
{
"epoch": 0.4156208220123345,
"grad_norm": 0.4453125,
"learning_rate": 0.0005849594197893283,
"loss": 1.0244,
"step": 9637
},
{
"epoch": 0.4166127571484021,
"grad_norm": 0.4375,
"learning_rate": 0.0005839664997409773,
"loss": 1.0345,
"step": 9660
},
{
"epoch": 0.4176046922844697,
"grad_norm": 0.357421875,
"learning_rate": 0.0005829735796926266,
"loss": 1.0157,
"step": 9683
},
{
"epoch": 0.4185966274205374,
"grad_norm": 0.3515625,
"learning_rate": 0.0005819806596442756,
"loss": 0.9774,
"step": 9706
},
{
"epoch": 0.419588562556605,
"grad_norm": 0.408203125,
"learning_rate": 0.0005809877395959248,
"loss": 1.0011,
"step": 9729
},
{
"epoch": 0.42058049769267264,
"grad_norm": 0.41015625,
"learning_rate": 0.0005799948195475738,
"loss": 1.0148,
"step": 9752
},
{
"epoch": 0.42157243282874024,
"grad_norm": 0.53515625,
"learning_rate": 0.000579001899499223,
"loss": 1.0058,
"step": 9775
},
{
"epoch": 0.42256436796480784,
"grad_norm": 0.37890625,
"learning_rate": 0.000578008979450872,
"loss": 1.0241,
"step": 9798
},
{
"epoch": 0.4235563031008755,
"grad_norm": 0.38671875,
"learning_rate": 0.0005770160594025212,
"loss": 1.0067,
"step": 9821
},
{
"epoch": 0.4245482382369431,
"grad_norm": 0.3359375,
"learning_rate": 0.0005760231393541703,
"loss": 1.0105,
"step": 9844
},
{
"epoch": 0.42554017337301075,
"grad_norm": 0.40625,
"learning_rate": 0.0005750302193058193,
"loss": 1.0226,
"step": 9867
},
{
"epoch": 0.42653210850907836,
"grad_norm": 0.392578125,
"learning_rate": 0.0005740372992574685,
"loss": 1.0243,
"step": 9890
},
{
"epoch": 0.427524043645146,
"grad_norm": 0.419921875,
"learning_rate": 0.0005730443792091175,
"loss": 1.0128,
"step": 9913
},
{
"epoch": 0.4285159787812136,
"grad_norm": 0.373046875,
"learning_rate": 0.0005720514591607668,
"loss": 0.9993,
"step": 9936
},
{
"epoch": 0.4295079139172812,
"grad_norm": 0.365234375,
"learning_rate": 0.0005710585391124158,
"loss": 1.012,
"step": 9959
},
{
"epoch": 0.43049984905334887,
"grad_norm": 0.302734375,
"learning_rate": 0.000570065619064065,
"loss": 1.0178,
"step": 9982
},
{
"epoch": 0.43149178418941647,
"grad_norm": 0.3671875,
"learning_rate": 0.000569072699015714,
"loss": 1.0192,
"step": 10005
},
{
"epoch": 0.43248371932548413,
"grad_norm": 0.345703125,
"learning_rate": 0.0005680797789673631,
"loss": 0.9992,
"step": 10028
},
{
"epoch": 0.43347565446155173,
"grad_norm": 0.4140625,
"learning_rate": 0.0005670868589190123,
"loss": 1.0033,
"step": 10051
},
{
"epoch": 0.43446758959761933,
"grad_norm": 0.359375,
"learning_rate": 0.0005660939388706614,
"loss": 1.0153,
"step": 10074
},
{
"epoch": 0.435459524733687,
"grad_norm": 0.36328125,
"learning_rate": 0.0005651010188223105,
"loss": 0.997,
"step": 10097
},
{
"epoch": 0.4364514598697546,
"grad_norm": 0.3671875,
"learning_rate": 0.0005641080987739596,
"loss": 1.0147,
"step": 10120
},
{
"epoch": 0.43744339500582224,
"grad_norm": 0.3515625,
"learning_rate": 0.0005631151787256087,
"loss": 1.0046,
"step": 10143
},
{
"epoch": 0.43843533014188985,
"grad_norm": 0.37890625,
"learning_rate": 0.0005621222586772579,
"loss": 1.0008,
"step": 10166
},
{
"epoch": 0.4394272652779575,
"grad_norm": 0.41796875,
"learning_rate": 0.0005611293386289069,
"loss": 1.0315,
"step": 10189
},
{
"epoch": 0.4404192004140251,
"grad_norm": 0.365234375,
"learning_rate": 0.0005601364185805561,
"loss": 1.0164,
"step": 10212
},
{
"epoch": 0.4414111355500927,
"grad_norm": 0.361328125,
"learning_rate": 0.0005591434985322051,
"loss": 1.0044,
"step": 10235
},
{
"epoch": 0.44240307068616036,
"grad_norm": 0.439453125,
"learning_rate": 0.0005581505784838543,
"loss": 1.0236,
"step": 10258
},
{
"epoch": 0.44339500582222796,
"grad_norm": 0.341796875,
"learning_rate": 0.0005571576584355034,
"loss": 1.0156,
"step": 10281
},
{
"epoch": 0.4443869409582956,
"grad_norm": 0.373046875,
"learning_rate": 0.0005561647383871526,
"loss": 0.9916,
"step": 10304
},
{
"epoch": 0.4453788760943632,
"grad_norm": 0.33203125,
"learning_rate": 0.0005551718183388016,
"loss": 0.9961,
"step": 10327
},
{
"epoch": 0.4463708112304308,
"grad_norm": 0.392578125,
"learning_rate": 0.0005541788982904507,
"loss": 1.0021,
"step": 10350
},
{
"epoch": 0.4473627463664985,
"grad_norm": 0.375,
"learning_rate": 0.0005531859782420998,
"loss": 1.0219,
"step": 10373
},
{
"epoch": 0.4483546815025661,
"grad_norm": 0.4140625,
"learning_rate": 0.000552193058193749,
"loss": 0.9982,
"step": 10396
},
{
"epoch": 0.44934661663863373,
"grad_norm": 0.392578125,
"learning_rate": 0.0005512001381453981,
"loss": 0.994,
"step": 10419
},
{
"epoch": 0.45033855177470133,
"grad_norm": 0.34765625,
"learning_rate": 0.0005502072180970471,
"loss": 0.9899,
"step": 10442
},
{
"epoch": 0.451330486910769,
"grad_norm": 0.3828125,
"learning_rate": 0.0005492142980486963,
"loss": 1.0096,
"step": 10465
},
{
"epoch": 0.4523224220468366,
"grad_norm": 0.3984375,
"learning_rate": 0.0005482213780003453,
"loss": 0.9882,
"step": 10488
},
{
"epoch": 0.4533143571829042,
"grad_norm": 0.390625,
"learning_rate": 0.0005472284579519945,
"loss": 0.999,
"step": 10511
},
{
"epoch": 0.45430629231897185,
"grad_norm": 0.3984375,
"learning_rate": 0.0005462355379036436,
"loss": 1.0087,
"step": 10534
},
{
"epoch": 0.45529822745503945,
"grad_norm": 0.388671875,
"learning_rate": 0.0005452426178552927,
"loss": 0.9985,
"step": 10557
},
{
"epoch": 0.4562901625911071,
"grad_norm": 0.455078125,
"learning_rate": 0.0005442496978069418,
"loss": 1.0104,
"step": 10580
},
{
"epoch": 0.4572820977271747,
"grad_norm": 0.61328125,
"learning_rate": 0.0005432567777585909,
"loss": 1.0056,
"step": 10603
},
{
"epoch": 0.4582740328632423,
"grad_norm": 0.3359375,
"learning_rate": 0.00054226385771024,
"loss": 1.0115,
"step": 10626
},
{
"epoch": 0.45926596799930997,
"grad_norm": 0.3515625,
"learning_rate": 0.0005412709376618892,
"loss": 1.0143,
"step": 10649
},
{
"epoch": 0.46025790313537757,
"grad_norm": 0.388671875,
"learning_rate": 0.0005402780176135383,
"loss": 0.9916,
"step": 10672
},
{
"epoch": 0.4612498382714452,
"grad_norm": 0.396484375,
"learning_rate": 0.0005392850975651874,
"loss": 0.9967,
"step": 10695
},
{
"epoch": 0.4622417734075128,
"grad_norm": 0.41796875,
"learning_rate": 0.0005382921775168364,
"loss": 1.0009,
"step": 10718
},
{
"epoch": 0.4632337085435805,
"grad_norm": 0.34765625,
"learning_rate": 0.0005372992574684856,
"loss": 0.9919,
"step": 10741
},
{
"epoch": 0.4642256436796481,
"grad_norm": 0.3515625,
"learning_rate": 0.0005363063374201347,
"loss": 1.0128,
"step": 10764
},
{
"epoch": 0.4652175788157157,
"grad_norm": 0.400390625,
"learning_rate": 0.0005353134173717839,
"loss": 0.9982,
"step": 10787
},
{
"epoch": 0.46620951395178334,
"grad_norm": 0.3515625,
"learning_rate": 0.0005343204973234329,
"loss": 0.9998,
"step": 10810
},
{
"epoch": 0.46720144908785094,
"grad_norm": 0.5390625,
"learning_rate": 0.0005333275772750821,
"loss": 1.0177,
"step": 10833
},
{
"epoch": 0.4681933842239186,
"grad_norm": 0.37890625,
"learning_rate": 0.0005323346572267311,
"loss": 0.9899,
"step": 10856
},
{
"epoch": 0.4691853193599862,
"grad_norm": 0.38671875,
"learning_rate": 0.0005313417371783802,
"loss": 1.0052,
"step": 10879
},
{
"epoch": 0.4701772544960538,
"grad_norm": 0.36328125,
"learning_rate": 0.0005303488171300294,
"loss": 0.9741,
"step": 10902
},
{
"epoch": 0.47116918963212145,
"grad_norm": 0.4453125,
"learning_rate": 0.0005293558970816785,
"loss": 1.0021,
"step": 10925
},
{
"epoch": 0.47216112476818906,
"grad_norm": 0.322265625,
"learning_rate": 0.0005283629770333276,
"loss": 0.9896,
"step": 10948
},
{
"epoch": 0.4731530599042567,
"grad_norm": 0.36328125,
"learning_rate": 0.0005273700569849767,
"loss": 1.0046,
"step": 10971
},
{
"epoch": 0.4741449950403243,
"grad_norm": 0.345703125,
"learning_rate": 0.0005263771369366258,
"loss": 1.0004,
"step": 10994
},
{
"epoch": 0.47513693017639197,
"grad_norm": 0.357421875,
"learning_rate": 0.0005253842168882749,
"loss": 1.0031,
"step": 11017
},
{
"epoch": 0.47612886531245957,
"grad_norm": 0.359375,
"learning_rate": 0.0005243912968399241,
"loss": 1.007,
"step": 11040
},
{
"epoch": 0.47712080044852717,
"grad_norm": 0.38671875,
"learning_rate": 0.0005233983767915731,
"loss": 1.0046,
"step": 11063
},
{
"epoch": 0.47811273558459483,
"grad_norm": 0.341796875,
"learning_rate": 0.0005224054567432222,
"loss": 0.9956,
"step": 11086
},
{
"epoch": 0.47910467072066243,
"grad_norm": 0.3515625,
"learning_rate": 0.0005214125366948713,
"loss": 1.01,
"step": 11109
},
{
"epoch": 0.4800966058567301,
"grad_norm": 0.431640625,
"learning_rate": 0.0005204196166465205,
"loss": 1.0211,
"step": 11132
},
{
"epoch": 0.4810885409927977,
"grad_norm": 0.375,
"learning_rate": 0.0005194266965981696,
"loss": 1.0039,
"step": 11155
},
{
"epoch": 0.4820804761288653,
"grad_norm": 0.392578125,
"learning_rate": 0.0005184337765498187,
"loss": 0.9886,
"step": 11178
},
{
"epoch": 0.48307241126493294,
"grad_norm": 0.515625,
"learning_rate": 0.0005174408565014678,
"loss": 0.9973,
"step": 11201
},
{
"epoch": 0.48406434640100054,
"grad_norm": 0.396484375,
"learning_rate": 0.0005164479364531169,
"loss": 1.013,
"step": 11224
},
{
"epoch": 0.4850562815370682,
"grad_norm": 0.451171875,
"learning_rate": 0.000515455016404766,
"loss": 0.9876,
"step": 11247
},
{
"epoch": 0.4860482166731358,
"grad_norm": 0.375,
"learning_rate": 0.0005144620963564152,
"loss": 0.9984,
"step": 11270
},
{
"epoch": 0.48704015180920346,
"grad_norm": 0.416015625,
"learning_rate": 0.0005134691763080642,
"loss": 0.9875,
"step": 11293
},
{
"epoch": 0.48803208694527106,
"grad_norm": 0.369140625,
"learning_rate": 0.0005124762562597134,
"loss": 0.9954,
"step": 11316
},
{
"epoch": 0.48902402208133866,
"grad_norm": 0.337890625,
"learning_rate": 0.0005114833362113624,
"loss": 0.9825,
"step": 11339
},
{
"epoch": 0.4900159572174063,
"grad_norm": 0.37890625,
"learning_rate": 0.0005104904161630117,
"loss": 0.9983,
"step": 11362
},
{
"epoch": 0.4910078923534739,
"grad_norm": 0.328125,
"learning_rate": 0.0005094974961146607,
"loss": 0.9818,
"step": 11385
},
{
"epoch": 0.4919998274895416,
"grad_norm": 0.357421875,
"learning_rate": 0.0005085045760663098,
"loss": 0.9928,
"step": 11408
},
{
"epoch": 0.4929917626256092,
"grad_norm": 0.4921875,
"learning_rate": 0.0005075116560179589,
"loss": 0.9771,
"step": 11431
},
{
"epoch": 0.4939836977616768,
"grad_norm": 0.341796875,
"learning_rate": 0.000506518735969608,
"loss": 1.0059,
"step": 11454
},
{
"epoch": 0.49497563289774443,
"grad_norm": 0.400390625,
"learning_rate": 0.0005055258159212571,
"loss": 1.0058,
"step": 11477
},
{
"epoch": 0.49596756803381203,
"grad_norm": 0.328125,
"learning_rate": 0.0005045328958729063,
"loss": 0.9962,
"step": 11500
},
{
"epoch": 0.4969595031698797,
"grad_norm": 0.326171875,
"learning_rate": 0.0005035399758245554,
"loss": 0.9828,
"step": 11523
},
{
"epoch": 0.4979514383059473,
"grad_norm": 0.5078125,
"learning_rate": 0.0005025470557762045,
"loss": 0.9881,
"step": 11546
},
{
"epoch": 0.49894337344201495,
"grad_norm": 0.43359375,
"learning_rate": 0.0005015541357278536,
"loss": 0.9863,
"step": 11569
},
{
"epoch": 0.49984905334885926,
"eval_runtime": 163.9862,
"eval_samples_per_second": 609.807,
"eval_steps_per_second": 7.623,
"step": 11590
},
{
"epoch": 0.49993530857808255,
"grad_norm": 0.353515625,
"learning_rate": 0.0005005612156795026,
"loss": 0.9764,
"step": 11592
},
{
"epoch": 0.5009272437141502,
"grad_norm": 0.36328125,
"learning_rate": 0.0004995682956311518,
"loss": 0.9923,
"step": 11615
},
{
"epoch": 0.5019191788502178,
"grad_norm": 0.39453125,
"learning_rate": 0.0004985753755828009,
"loss": 0.9738,
"step": 11638
},
{
"epoch": 0.5029111139862854,
"grad_norm": 0.48828125,
"learning_rate": 0.00049758245553445,
"loss": 0.973,
"step": 11661
},
{
"epoch": 0.5039030491223531,
"grad_norm": 0.384765625,
"learning_rate": 0.0004965895354860991,
"loss": 0.9741,
"step": 11684
},
{
"epoch": 0.5048949842584207,
"grad_norm": 0.359375,
"learning_rate": 0.0004955966154377482,
"loss": 0.9842,
"step": 11707
},
{
"epoch": 0.5058869193944883,
"grad_norm": 0.443359375,
"learning_rate": 0.0004946036953893974,
"loss": 0.9927,
"step": 11730
},
{
"epoch": 0.5068788545305559,
"grad_norm": 0.44921875,
"learning_rate": 0.0004936107753410465,
"loss": 0.9921,
"step": 11753
},
{
"epoch": 0.5078707896666236,
"grad_norm": 0.40625,
"learning_rate": 0.0004926178552926956,
"loss": 0.9827,
"step": 11776
},
{
"epoch": 0.5088627248026911,
"grad_norm": 0.416015625,
"learning_rate": 0.0004916249352443447,
"loss": 0.9836,
"step": 11799
},
{
"epoch": 0.5098546599387588,
"grad_norm": 0.36328125,
"learning_rate": 0.0004906320151959938,
"loss": 0.9783,
"step": 11822
},
{
"epoch": 0.5108465950748264,
"grad_norm": 0.357421875,
"learning_rate": 0.0004896390951476428,
"loss": 1.0003,
"step": 11845
},
{
"epoch": 0.511838530210894,
"grad_norm": 0.39453125,
"learning_rate": 0.000488646175099292,
"loss": 0.995,
"step": 11868
},
{
"epoch": 0.5128304653469616,
"grad_norm": 0.376953125,
"learning_rate": 0.0004876532550509411,
"loss": 0.9952,
"step": 11891
},
{
"epoch": 0.5138224004830293,
"grad_norm": 0.3828125,
"learning_rate": 0.0004866603350025902,
"loss": 0.9912,
"step": 11914
},
{
"epoch": 0.514814335619097,
"grad_norm": 0.34375,
"learning_rate": 0.00048566741495423933,
"loss": 0.995,
"step": 11937
},
{
"epoch": 0.5158062707551645,
"grad_norm": 0.408203125,
"learning_rate": 0.00048467449490588845,
"loss": 0.9856,
"step": 11960
},
{
"epoch": 0.5167982058912322,
"grad_norm": 0.427734375,
"learning_rate": 0.00048368157485753757,
"loss": 0.9887,
"step": 11983
},
{
"epoch": 0.5177901410272998,
"grad_norm": 0.376953125,
"learning_rate": 0.0004826886548091867,
"loss": 0.9815,
"step": 12006
},
{
"epoch": 0.5187820761633674,
"grad_norm": 0.369140625,
"learning_rate": 0.00048169573476083575,
"loss": 0.9944,
"step": 12029
},
{
"epoch": 0.519774011299435,
"grad_norm": 0.427734375,
"learning_rate": 0.0004807028147124849,
"loss": 0.9714,
"step": 12052
},
{
"epoch": 0.5207659464355027,
"grad_norm": 0.326171875,
"learning_rate": 0.000479709894664134,
"loss": 0.9849,
"step": 12075
},
{
"epoch": 0.5217578815715703,
"grad_norm": 0.427734375,
"learning_rate": 0.0004787169746157831,
"loss": 0.9861,
"step": 12098
},
{
"epoch": 0.5227498167076379,
"grad_norm": 0.4765625,
"learning_rate": 0.00047772405456743223,
"loss": 1.0009,
"step": 12121
},
{
"epoch": 0.5237417518437055,
"grad_norm": 0.345703125,
"learning_rate": 0.00047673113451908135,
"loss": 0.9892,
"step": 12144
},
{
"epoch": 0.5247336869797732,
"grad_norm": 0.345703125,
"learning_rate": 0.00047573821447073047,
"loss": 0.9843,
"step": 12167
},
{
"epoch": 0.5257256221158407,
"grad_norm": 0.40234375,
"learning_rate": 0.0004747452944223796,
"loss": 0.9767,
"step": 12190
},
{
"epoch": 0.5267175572519084,
"grad_norm": 0.359375,
"learning_rate": 0.00047375237437402866,
"loss": 0.9599,
"step": 12213
},
{
"epoch": 0.527709492387976,
"grad_norm": 0.388671875,
"learning_rate": 0.0004727594543256778,
"loss": 0.9797,
"step": 12236
},
{
"epoch": 0.5287014275240437,
"grad_norm": 0.3359375,
"learning_rate": 0.0004717665342773269,
"loss": 0.9952,
"step": 12259
},
{
"epoch": 0.5296933626601112,
"grad_norm": 0.359375,
"learning_rate": 0.000470773614228976,
"loss": 0.9851,
"step": 12282
},
{
"epoch": 0.5306852977961789,
"grad_norm": 0.4140625,
"learning_rate": 0.00046978069418062514,
"loss": 0.9728,
"step": 12305
},
{
"epoch": 0.5316772329322466,
"grad_norm": 0.376953125,
"learning_rate": 0.00046878777413227426,
"loss": 0.9813,
"step": 12328
},
{
"epoch": 0.5326691680683141,
"grad_norm": 0.326171875,
"learning_rate": 0.0004677948540839234,
"loss": 0.9729,
"step": 12351
},
{
"epoch": 0.5336611032043818,
"grad_norm": 0.33203125,
"learning_rate": 0.0004668019340355725,
"loss": 0.969,
"step": 12374
},
{
"epoch": 0.5346530383404494,
"grad_norm": 0.43359375,
"learning_rate": 0.00046580901398722156,
"loss": 0.9786,
"step": 12397
},
{
"epoch": 0.535644973476517,
"grad_norm": 0.388671875,
"learning_rate": 0.00046481609393887063,
"loss": 0.9773,
"step": 12420
},
{
"epoch": 0.5366369086125846,
"grad_norm": 0.451171875,
"learning_rate": 0.00046382317389051975,
"loss": 0.9972,
"step": 12443
},
{
"epoch": 0.5376288437486523,
"grad_norm": 0.408203125,
"learning_rate": 0.00046283025384216887,
"loss": 0.9893,
"step": 12466
},
{
"epoch": 0.5386207788847199,
"grad_norm": 0.400390625,
"learning_rate": 0.000461837333793818,
"loss": 0.9747,
"step": 12489
},
{
"epoch": 0.5396127140207875,
"grad_norm": 0.4921875,
"learning_rate": 0.0004608444137454671,
"loss": 0.9795,
"step": 12512
},
{
"epoch": 0.5406046491568551,
"grad_norm": 0.37109375,
"learning_rate": 0.00045985149369711623,
"loss": 0.9608,
"step": 12535
},
{
"epoch": 0.5415965842929228,
"grad_norm": 0.3515625,
"learning_rate": 0.00045885857364876535,
"loss": 0.966,
"step": 12558
},
{
"epoch": 0.5425885194289903,
"grad_norm": 0.361328125,
"learning_rate": 0.0004578656536004144,
"loss": 0.9689,
"step": 12581
},
{
"epoch": 0.543580454565058,
"grad_norm": 0.404296875,
"learning_rate": 0.00045687273355206353,
"loss": 0.9655,
"step": 12604
},
{
"epoch": 0.5445723897011256,
"grad_norm": 0.37890625,
"learning_rate": 0.00045587981350371265,
"loss": 0.9693,
"step": 12627
},
{
"epoch": 0.5455643248371933,
"grad_norm": 0.41015625,
"learning_rate": 0.00045488689345536177,
"loss": 0.9986,
"step": 12650
},
{
"epoch": 0.5465562599732608,
"grad_norm": 0.345703125,
"learning_rate": 0.0004538939734070109,
"loss": 0.9715,
"step": 12673
},
{
"epoch": 0.5475481951093285,
"grad_norm": 0.37890625,
"learning_rate": 0.00045290105335866,
"loss": 0.9781,
"step": 12696
},
{
"epoch": 0.5485401302453962,
"grad_norm": 0.42578125,
"learning_rate": 0.00045190813331030913,
"loss": 1.0001,
"step": 12719
},
{
"epoch": 0.5495320653814637,
"grad_norm": 0.43359375,
"learning_rate": 0.0004509152132619582,
"loss": 0.9811,
"step": 12742
},
{
"epoch": 0.5505240005175314,
"grad_norm": 0.341796875,
"learning_rate": 0.0004499222932136073,
"loss": 0.9584,
"step": 12765
},
{
"epoch": 0.551515935653599,
"grad_norm": 0.419921875,
"learning_rate": 0.00044892937316525644,
"loss": 0.977,
"step": 12788
},
{
"epoch": 0.5525078707896667,
"grad_norm": 0.416015625,
"learning_rate": 0.00044793645311690556,
"loss": 0.9746,
"step": 12811
},
{
"epoch": 0.5534998059257342,
"grad_norm": 0.390625,
"learning_rate": 0.0004469435330685547,
"loss": 0.9811,
"step": 12834
},
{
"epoch": 0.5544917410618019,
"grad_norm": 0.35546875,
"learning_rate": 0.0004459506130202038,
"loss": 0.9523,
"step": 12857
},
{
"epoch": 0.5554836761978695,
"grad_norm": 0.37890625,
"learning_rate": 0.0004449576929718529,
"loss": 0.9641,
"step": 12880
},
{
"epoch": 0.5564756113339371,
"grad_norm": 0.36328125,
"learning_rate": 0.00044396477292350204,
"loss": 0.9845,
"step": 12903
},
{
"epoch": 0.5574675464700047,
"grad_norm": 0.365234375,
"learning_rate": 0.0004429718528751511,
"loss": 0.9788,
"step": 12926
},
{
"epoch": 0.5584594816060724,
"grad_norm": 0.390625,
"learning_rate": 0.0004419789328268002,
"loss": 0.9795,
"step": 12949
},
{
"epoch": 0.5594514167421399,
"grad_norm": 0.37109375,
"learning_rate": 0.00044098601277844934,
"loss": 0.9716,
"step": 12972
},
{
"epoch": 0.5604433518782076,
"grad_norm": 0.38671875,
"learning_rate": 0.00043999309273009846,
"loss": 0.9814,
"step": 12995
},
{
"epoch": 0.5614352870142753,
"grad_norm": 0.34765625,
"learning_rate": 0.00043900017268174753,
"loss": 0.9724,
"step": 13018
},
{
"epoch": 0.5624272221503429,
"grad_norm": 0.44921875,
"learning_rate": 0.00043800725263339665,
"loss": 0.9538,
"step": 13041
},
{
"epoch": 0.5634191572864105,
"grad_norm": 0.3828125,
"learning_rate": 0.00043701433258504577,
"loss": 0.9744,
"step": 13064
},
{
"epoch": 0.5644110924224781,
"grad_norm": 0.423828125,
"learning_rate": 0.0004360214125366949,
"loss": 0.9777,
"step": 13087
},
{
"epoch": 0.5654030275585458,
"grad_norm": 0.365234375,
"learning_rate": 0.00043502849248834395,
"loss": 0.9688,
"step": 13110
},
{
"epoch": 0.5663949626946133,
"grad_norm": 0.470703125,
"learning_rate": 0.00043403557243999307,
"loss": 0.988,
"step": 13133
},
{
"epoch": 0.567386897830681,
"grad_norm": 0.341796875,
"learning_rate": 0.0004330426523916422,
"loss": 0.9678,
"step": 13156
},
{
"epoch": 0.5683788329667486,
"grad_norm": 0.345703125,
"learning_rate": 0.0004320497323432913,
"loss": 0.9735,
"step": 13179
},
{
"epoch": 0.5693707681028163,
"grad_norm": 0.416015625,
"learning_rate": 0.00043105681229494043,
"loss": 0.9612,
"step": 13202
},
{
"epoch": 0.5703627032388838,
"grad_norm": 0.375,
"learning_rate": 0.00043006389224658955,
"loss": 0.9428,
"step": 13225
},
{
"epoch": 0.5713546383749515,
"grad_norm": 0.4296875,
"learning_rate": 0.00042907097219823867,
"loss": 0.9654,
"step": 13248
},
{
"epoch": 0.5723465735110191,
"grad_norm": 0.353515625,
"learning_rate": 0.0004280780521498878,
"loss": 0.9739,
"step": 13271
},
{
"epoch": 0.5733385086470867,
"grad_norm": 0.380859375,
"learning_rate": 0.00042708513210153686,
"loss": 0.9755,
"step": 13294
},
{
"epoch": 0.5743304437831543,
"grad_norm": 0.357421875,
"learning_rate": 0.000426092212053186,
"loss": 0.9784,
"step": 13317
},
{
"epoch": 0.575322378919222,
"grad_norm": 0.3125,
"learning_rate": 0.0004250992920048351,
"loss": 0.9625,
"step": 13340
},
{
"epoch": 0.5763143140552897,
"grad_norm": 0.345703125,
"learning_rate": 0.0004241063719564842,
"loss": 0.9521,
"step": 13363
},
{
"epoch": 0.5773062491913572,
"grad_norm": 0.333984375,
"learning_rate": 0.00042311345190813334,
"loss": 0.984,
"step": 13386
},
{
"epoch": 0.5782981843274249,
"grad_norm": 0.45703125,
"learning_rate": 0.00042212053185978246,
"loss": 0.9794,
"step": 13409
},
{
"epoch": 0.5792901194634925,
"grad_norm": 0.396484375,
"learning_rate": 0.0004211276118114316,
"loss": 0.9705,
"step": 13432
},
{
"epoch": 0.5802820545995601,
"grad_norm": 0.400390625,
"learning_rate": 0.00042013469176308064,
"loss": 0.97,
"step": 13455
},
{
"epoch": 0.5812739897356277,
"grad_norm": 0.37890625,
"learning_rate": 0.00041914177171472976,
"loss": 0.968,
"step": 13478
},
{
"epoch": 0.5822659248716954,
"grad_norm": 0.365234375,
"learning_rate": 0.0004181488516663789,
"loss": 0.9664,
"step": 13501
},
{
"epoch": 0.5832578600077629,
"grad_norm": 0.361328125,
"learning_rate": 0.000417155931618028,
"loss": 0.9722,
"step": 13524
},
{
"epoch": 0.5842497951438306,
"grad_norm": 0.369140625,
"learning_rate": 0.0004161630115696771,
"loss": 0.9695,
"step": 13547
},
{
"epoch": 0.5852417302798982,
"grad_norm": 0.337890625,
"learning_rate": 0.00041517009152132624,
"loss": 0.9628,
"step": 13570
},
{
"epoch": 0.5862336654159659,
"grad_norm": 0.330078125,
"learning_rate": 0.0004141771714729753,
"loss": 0.9515,
"step": 13593
},
{
"epoch": 0.5872256005520334,
"grad_norm": 0.359375,
"learning_rate": 0.0004131842514246244,
"loss": 0.965,
"step": 13616
},
{
"epoch": 0.5882175356881011,
"grad_norm": 0.392578125,
"learning_rate": 0.0004121913313762735,
"loss": 0.9598,
"step": 13639
},
{
"epoch": 0.5892094708241687,
"grad_norm": 0.41796875,
"learning_rate": 0.0004111984113279226,
"loss": 0.9575,
"step": 13662
},
{
"epoch": 0.5902014059602363,
"grad_norm": 0.5234375,
"learning_rate": 0.00041020549127957173,
"loss": 0.9933,
"step": 13685
},
{
"epoch": 0.591193341096304,
"grad_norm": 0.423828125,
"learning_rate": 0.00040921257123122085,
"loss": 0.9621,
"step": 13708
},
{
"epoch": 0.5921852762323716,
"grad_norm": 0.33203125,
"learning_rate": 0.00040821965118286997,
"loss": 0.964,
"step": 13731
},
{
"epoch": 0.5931772113684393,
"grad_norm": 0.423828125,
"learning_rate": 0.0004072267311345191,
"loss": 0.9854,
"step": 13754
},
{
"epoch": 0.5941691465045068,
"grad_norm": 0.3515625,
"learning_rate": 0.0004062338110861682,
"loss": 0.9883,
"step": 13777
},
{
"epoch": 0.5951610816405745,
"grad_norm": 0.408203125,
"learning_rate": 0.00040524089103781733,
"loss": 0.9853,
"step": 13800
},
{
"epoch": 0.5961530167766421,
"grad_norm": 0.408203125,
"learning_rate": 0.0004042479709894664,
"loss": 0.9557,
"step": 13823
},
{
"epoch": 0.5971449519127097,
"grad_norm": 0.42578125,
"learning_rate": 0.0004032550509411155,
"loss": 0.9587,
"step": 13846
},
{
"epoch": 0.5981368870487773,
"grad_norm": 0.44921875,
"learning_rate": 0.00040226213089276464,
"loss": 0.9771,
"step": 13869
},
{
"epoch": 0.599128822184845,
"grad_norm": 0.431640625,
"learning_rate": 0.00040126921084441376,
"loss": 0.9661,
"step": 13892
},
{
"epoch": 0.5998188640186312,
"eval_runtime": 163.7921,
"eval_samples_per_second": 610.53,
"eval_steps_per_second": 7.632,
"step": 13908
}
],
"logging_steps": 23,
"max_steps": 23187,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 2318,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0167159364234772e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}