paligemma2-3b-e621-224-adpt / trainer_state.json
Fizzarolli's picture
Upload folder using huggingface_hub
0ad4aa7 verified
raw
history blame
175 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4325610316580605,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004325610316580605,
"grad_norm": 10.074225425720215,
"learning_rate": 8.000000000000001e-07,
"loss": 3.4502,
"step": 1
},
{
"epoch": 0.000865122063316121,
"grad_norm": 10.193795204162598,
"learning_rate": 1.6000000000000001e-06,
"loss": 3.4309,
"step": 2
},
{
"epoch": 0.0012976830949741815,
"grad_norm": 9.579278945922852,
"learning_rate": 2.4000000000000003e-06,
"loss": 3.4075,
"step": 3
},
{
"epoch": 0.001730244126632242,
"grad_norm": 9.713829040527344,
"learning_rate": 3.2000000000000003e-06,
"loss": 3.4737,
"step": 4
},
{
"epoch": 0.0021628051582903026,
"grad_norm": 9.49286937713623,
"learning_rate": 4.000000000000001e-06,
"loss": 3.4149,
"step": 5
},
{
"epoch": 0.002595366189948363,
"grad_norm": 10.680686950683594,
"learning_rate": 4.800000000000001e-06,
"loss": 3.4793,
"step": 6
},
{
"epoch": 0.0030279272216064235,
"grad_norm": 10.119192123413086,
"learning_rate": 5.600000000000001e-06,
"loss": 3.3707,
"step": 7
},
{
"epoch": 0.003460488253264484,
"grad_norm": 10.217940330505371,
"learning_rate": 6.4000000000000006e-06,
"loss": 3.4339,
"step": 8
},
{
"epoch": 0.0038930492849225447,
"grad_norm": 10.2111234664917,
"learning_rate": 7.2000000000000005e-06,
"loss": 3.3944,
"step": 9
},
{
"epoch": 0.004325610316580605,
"grad_norm": 9.485647201538086,
"learning_rate": 8.000000000000001e-06,
"loss": 3.3145,
"step": 10
},
{
"epoch": 0.004758171348238666,
"grad_norm": 10.536123275756836,
"learning_rate": 8.8e-06,
"loss": 3.37,
"step": 11
},
{
"epoch": 0.005190732379896726,
"grad_norm": 10.602726936340332,
"learning_rate": 9.600000000000001e-06,
"loss": 3.4304,
"step": 12
},
{
"epoch": 0.0056232934115547865,
"grad_norm": 10.237251281738281,
"learning_rate": 1.04e-05,
"loss": 3.3628,
"step": 13
},
{
"epoch": 0.006055854443212847,
"grad_norm": 10.820761680603027,
"learning_rate": 1.1200000000000001e-05,
"loss": 3.4281,
"step": 14
},
{
"epoch": 0.006488415474870907,
"grad_norm": 10.799651145935059,
"learning_rate": 1.2e-05,
"loss": 3.4524,
"step": 15
},
{
"epoch": 0.006920976506528968,
"grad_norm": 11.567938804626465,
"learning_rate": 1.2800000000000001e-05,
"loss": 3.4891,
"step": 16
},
{
"epoch": 0.007353537538187028,
"grad_norm": 11.040711402893066,
"learning_rate": 1.3600000000000002e-05,
"loss": 3.3775,
"step": 17
},
{
"epoch": 0.0077860985698450895,
"grad_norm": 10.833939552307129,
"learning_rate": 1.4400000000000001e-05,
"loss": 3.4044,
"step": 18
},
{
"epoch": 0.008218659601503149,
"grad_norm": 11.41971206665039,
"learning_rate": 1.5200000000000002e-05,
"loss": 3.4054,
"step": 19
},
{
"epoch": 0.00865122063316121,
"grad_norm": 11.41285514831543,
"learning_rate": 1.6000000000000003e-05,
"loss": 3.2957,
"step": 20
},
{
"epoch": 0.00908378166481927,
"grad_norm": 11.056841850280762,
"learning_rate": 1.6800000000000002e-05,
"loss": 3.3388,
"step": 21
},
{
"epoch": 0.009516342696477331,
"grad_norm": 11.64125919342041,
"learning_rate": 1.76e-05,
"loss": 3.3148,
"step": 22
},
{
"epoch": 0.00994890372813539,
"grad_norm": 11.749085426330566,
"learning_rate": 1.8400000000000003e-05,
"loss": 3.3344,
"step": 23
},
{
"epoch": 0.010381464759793452,
"grad_norm": 10.946850776672363,
"learning_rate": 1.9200000000000003e-05,
"loss": 3.3136,
"step": 24
},
{
"epoch": 0.010814025791451513,
"grad_norm": 11.453843116760254,
"learning_rate": 2e-05,
"loss": 3.2952,
"step": 25
},
{
"epoch": 0.011246586823109573,
"grad_norm": 10.720843315124512,
"learning_rate": 1.99912510936133e-05,
"loss": 3.3089,
"step": 26
},
{
"epoch": 0.011679147854767634,
"grad_norm": 10.230990409851074,
"learning_rate": 1.9982502187226597e-05,
"loss": 3.1701,
"step": 27
},
{
"epoch": 0.012111708886425694,
"grad_norm": 10.450053215026855,
"learning_rate": 1.9973753280839896e-05,
"loss": 3.3022,
"step": 28
},
{
"epoch": 0.012544269918083755,
"grad_norm": 9.970443725585938,
"learning_rate": 1.9965004374453195e-05,
"loss": 3.1913,
"step": 29
},
{
"epoch": 0.012976830949741815,
"grad_norm": 8.793647766113281,
"learning_rate": 1.9956255468066494e-05,
"loss": 3.1559,
"step": 30
},
{
"epoch": 0.013409391981399876,
"grad_norm": 9.256972312927246,
"learning_rate": 1.9947506561679793e-05,
"loss": 3.0502,
"step": 31
},
{
"epoch": 0.013841953013057936,
"grad_norm": 9.748438835144043,
"learning_rate": 1.993875765529309e-05,
"loss": 3.1057,
"step": 32
},
{
"epoch": 0.014274514044715997,
"grad_norm": 9.952435493469238,
"learning_rate": 1.993000874890639e-05,
"loss": 3.0494,
"step": 33
},
{
"epoch": 0.014707075076374056,
"grad_norm": 9.690461158752441,
"learning_rate": 1.9921259842519688e-05,
"loss": 3.0637,
"step": 34
},
{
"epoch": 0.015139636108032118,
"grad_norm": 9.847665786743164,
"learning_rate": 1.9912510936132984e-05,
"loss": 3.039,
"step": 35
},
{
"epoch": 0.015572197139690179,
"grad_norm": 8.799947738647461,
"learning_rate": 1.9903762029746283e-05,
"loss": 3.0492,
"step": 36
},
{
"epoch": 0.01600475817134824,
"grad_norm": 9.004206657409668,
"learning_rate": 1.9895013123359582e-05,
"loss": 2.9891,
"step": 37
},
{
"epoch": 0.016437319203006298,
"grad_norm": 9.604561805725098,
"learning_rate": 1.9886264216972878e-05,
"loss": 2.9614,
"step": 38
},
{
"epoch": 0.01686988023466436,
"grad_norm": 9.404193878173828,
"learning_rate": 1.9877515310586177e-05,
"loss": 2.9704,
"step": 39
},
{
"epoch": 0.01730244126632242,
"grad_norm": 10.134177207946777,
"learning_rate": 1.9868766404199476e-05,
"loss": 2.8793,
"step": 40
},
{
"epoch": 0.01773500229798048,
"grad_norm": 8.96322250366211,
"learning_rate": 1.9860017497812775e-05,
"loss": 2.913,
"step": 41
},
{
"epoch": 0.01816756332963854,
"grad_norm": 8.704127311706543,
"learning_rate": 1.9851268591426075e-05,
"loss": 2.8886,
"step": 42
},
{
"epoch": 0.018600124361296603,
"grad_norm": 9.069153785705566,
"learning_rate": 1.984251968503937e-05,
"loss": 2.8725,
"step": 43
},
{
"epoch": 0.019032685392954662,
"grad_norm": 9.031394004821777,
"learning_rate": 1.983377077865267e-05,
"loss": 2.8581,
"step": 44
},
{
"epoch": 0.019465246424612722,
"grad_norm": 8.829975128173828,
"learning_rate": 1.982502187226597e-05,
"loss": 2.8044,
"step": 45
},
{
"epoch": 0.01989780745627078,
"grad_norm": 8.565601348876953,
"learning_rate": 1.9816272965879265e-05,
"loss": 2.8094,
"step": 46
},
{
"epoch": 0.020330368487928845,
"grad_norm": 8.82313060760498,
"learning_rate": 1.9807524059492564e-05,
"loss": 2.7984,
"step": 47
},
{
"epoch": 0.020762929519586904,
"grad_norm": 9.0712251663208,
"learning_rate": 1.9798775153105863e-05,
"loss": 2.6778,
"step": 48
},
{
"epoch": 0.021195490551244964,
"grad_norm": 9.187145233154297,
"learning_rate": 1.9790026246719162e-05,
"loss": 2.7568,
"step": 49
},
{
"epoch": 0.021628051582903027,
"grad_norm": 9.249820709228516,
"learning_rate": 1.978127734033246e-05,
"loss": 2.7787,
"step": 50
},
{
"epoch": 0.022060612614561086,
"grad_norm": 8.687273025512695,
"learning_rate": 1.977252843394576e-05,
"loss": 2.7557,
"step": 51
},
{
"epoch": 0.022493173646219146,
"grad_norm": 9.046874046325684,
"learning_rate": 1.9763779527559057e-05,
"loss": 2.6655,
"step": 52
},
{
"epoch": 0.022925734677877205,
"grad_norm": 9.147759437561035,
"learning_rate": 1.9755030621172356e-05,
"loss": 2.6384,
"step": 53
},
{
"epoch": 0.02335829570953527,
"grad_norm": 8.494421005249023,
"learning_rate": 1.9746281714785655e-05,
"loss": 2.668,
"step": 54
},
{
"epoch": 0.023790856741193328,
"grad_norm": 9.623828887939453,
"learning_rate": 1.973753280839895e-05,
"loss": 2.5583,
"step": 55
},
{
"epoch": 0.024223417772851388,
"grad_norm": 8.837956428527832,
"learning_rate": 1.972878390201225e-05,
"loss": 2.657,
"step": 56
},
{
"epoch": 0.024655978804509447,
"grad_norm": 9.294939041137695,
"learning_rate": 1.972003499562555e-05,
"loss": 2.5761,
"step": 57
},
{
"epoch": 0.02508853983616751,
"grad_norm": 9.436785697937012,
"learning_rate": 1.9711286089238845e-05,
"loss": 2.5929,
"step": 58
},
{
"epoch": 0.02552110086782557,
"grad_norm": 8.875762939453125,
"learning_rate": 1.9702537182852148e-05,
"loss": 2.6005,
"step": 59
},
{
"epoch": 0.02595366189948363,
"grad_norm": 8.302167892456055,
"learning_rate": 1.9693788276465443e-05,
"loss": 2.5536,
"step": 60
},
{
"epoch": 0.026386222931141692,
"grad_norm": 8.504595756530762,
"learning_rate": 1.9685039370078743e-05,
"loss": 2.479,
"step": 61
},
{
"epoch": 0.026818783962799752,
"grad_norm": 9.210023880004883,
"learning_rate": 1.9676290463692042e-05,
"loss": 2.4938,
"step": 62
},
{
"epoch": 0.02725134499445781,
"grad_norm": 8.98862075805664,
"learning_rate": 1.9667541557305338e-05,
"loss": 2.4786,
"step": 63
},
{
"epoch": 0.02768390602611587,
"grad_norm": 8.775471687316895,
"learning_rate": 1.9658792650918637e-05,
"loss": 2.4588,
"step": 64
},
{
"epoch": 0.028116467057773934,
"grad_norm": 8.455652236938477,
"learning_rate": 1.9650043744531936e-05,
"loss": 2.5419,
"step": 65
},
{
"epoch": 0.028549028089431994,
"grad_norm": 8.13263988494873,
"learning_rate": 1.9641294838145232e-05,
"loss": 2.4738,
"step": 66
},
{
"epoch": 0.028981589121090053,
"grad_norm": 8.96052360534668,
"learning_rate": 1.963254593175853e-05,
"loss": 2.3496,
"step": 67
},
{
"epoch": 0.029414150152748113,
"grad_norm": 8.10035228729248,
"learning_rate": 1.962379702537183e-05,
"loss": 2.4487,
"step": 68
},
{
"epoch": 0.029846711184406176,
"grad_norm": 8.069640159606934,
"learning_rate": 1.961504811898513e-05,
"loss": 2.4545,
"step": 69
},
{
"epoch": 0.030279272216064235,
"grad_norm": 8.711186408996582,
"learning_rate": 1.960629921259843e-05,
"loss": 2.3936,
"step": 70
},
{
"epoch": 0.030711833247722295,
"grad_norm": 9.187376022338867,
"learning_rate": 1.9597550306211725e-05,
"loss": 2.2613,
"step": 71
},
{
"epoch": 0.031144394279380358,
"grad_norm": 8.515965461730957,
"learning_rate": 1.9588801399825024e-05,
"loss": 2.4022,
"step": 72
},
{
"epoch": 0.031576955311038414,
"grad_norm": 8.901565551757812,
"learning_rate": 1.9580052493438323e-05,
"loss": 2.3848,
"step": 73
},
{
"epoch": 0.03200951634269648,
"grad_norm": 8.479150772094727,
"learning_rate": 1.957130358705162e-05,
"loss": 2.2304,
"step": 74
},
{
"epoch": 0.03244207737435454,
"grad_norm": 8.931798934936523,
"learning_rate": 1.9562554680664918e-05,
"loss": 2.3045,
"step": 75
},
{
"epoch": 0.032874638406012596,
"grad_norm": 8.757261276245117,
"learning_rate": 1.9553805774278217e-05,
"loss": 2.3887,
"step": 76
},
{
"epoch": 0.03330719943767066,
"grad_norm": 8.421159744262695,
"learning_rate": 1.9545056867891513e-05,
"loss": 2.3631,
"step": 77
},
{
"epoch": 0.03373976046932872,
"grad_norm": 8.453868865966797,
"learning_rate": 1.9536307961504816e-05,
"loss": 2.1783,
"step": 78
},
{
"epoch": 0.03417232150098678,
"grad_norm": 8.190146446228027,
"learning_rate": 1.952755905511811e-05,
"loss": 2.3526,
"step": 79
},
{
"epoch": 0.03460488253264484,
"grad_norm": 9.044288635253906,
"learning_rate": 1.951881014873141e-05,
"loss": 2.2243,
"step": 80
},
{
"epoch": 0.0350374435643029,
"grad_norm": 7.598982334136963,
"learning_rate": 1.951006124234471e-05,
"loss": 2.3016,
"step": 81
},
{
"epoch": 0.03547000459596096,
"grad_norm": 8.639673233032227,
"learning_rate": 1.9501312335958006e-05,
"loss": 2.2283,
"step": 82
},
{
"epoch": 0.035902565627619024,
"grad_norm": 8.017471313476562,
"learning_rate": 1.9492563429571305e-05,
"loss": 2.2479,
"step": 83
},
{
"epoch": 0.03633512665927708,
"grad_norm": 8.051676750183105,
"learning_rate": 1.9483814523184604e-05,
"loss": 2.2333,
"step": 84
},
{
"epoch": 0.03676768769093514,
"grad_norm": 9.102787971496582,
"learning_rate": 1.94750656167979e-05,
"loss": 2.1107,
"step": 85
},
{
"epoch": 0.037200248722593206,
"grad_norm": 7.457028388977051,
"learning_rate": 1.94663167104112e-05,
"loss": 2.307,
"step": 86
},
{
"epoch": 0.03763280975425126,
"grad_norm": 8.147334098815918,
"learning_rate": 1.9457567804024498e-05,
"loss": 2.1986,
"step": 87
},
{
"epoch": 0.038065370785909325,
"grad_norm": 8.232810974121094,
"learning_rate": 1.9448818897637797e-05,
"loss": 2.1094,
"step": 88
},
{
"epoch": 0.03849793181756739,
"grad_norm": 8.264384269714355,
"learning_rate": 1.9440069991251097e-05,
"loss": 2.2574,
"step": 89
},
{
"epoch": 0.038930492849225444,
"grad_norm": 8.394896507263184,
"learning_rate": 1.9431321084864392e-05,
"loss": 2.2311,
"step": 90
},
{
"epoch": 0.03936305388088351,
"grad_norm": 8.641265869140625,
"learning_rate": 1.9422572178477692e-05,
"loss": 2.1939,
"step": 91
},
{
"epoch": 0.03979561491254156,
"grad_norm": 7.687323570251465,
"learning_rate": 1.941382327209099e-05,
"loss": 2.1942,
"step": 92
},
{
"epoch": 0.040228175944199626,
"grad_norm": 7.773819923400879,
"learning_rate": 1.9405074365704287e-05,
"loss": 2.2004,
"step": 93
},
{
"epoch": 0.04066073697585769,
"grad_norm": 8.127384185791016,
"learning_rate": 1.9396325459317586e-05,
"loss": 2.1736,
"step": 94
},
{
"epoch": 0.041093298007515745,
"grad_norm": 9.069780349731445,
"learning_rate": 1.9387576552930885e-05,
"loss": 2.0747,
"step": 95
},
{
"epoch": 0.04152585903917381,
"grad_norm": 7.772279739379883,
"learning_rate": 1.9378827646544184e-05,
"loss": 2.1285,
"step": 96
},
{
"epoch": 0.04195842007083187,
"grad_norm": 8.430010795593262,
"learning_rate": 1.937007874015748e-05,
"loss": 2.2845,
"step": 97
},
{
"epoch": 0.04239098110248993,
"grad_norm": 7.560873508453369,
"learning_rate": 1.9361329833770783e-05,
"loss": 2.1905,
"step": 98
},
{
"epoch": 0.04282354213414799,
"grad_norm": 9.867399215698242,
"learning_rate": 1.935258092738408e-05,
"loss": 2.1375,
"step": 99
},
{
"epoch": 0.043256103165806054,
"grad_norm": 9.414137840270996,
"learning_rate": 1.9343832020997378e-05,
"loss": 2.059,
"step": 100
},
{
"epoch": 0.04368866419746411,
"grad_norm": 8.068355560302734,
"learning_rate": 1.9335083114610677e-05,
"loss": 2.1939,
"step": 101
},
{
"epoch": 0.04412122522912217,
"grad_norm": 7.778262615203857,
"learning_rate": 1.9326334208223973e-05,
"loss": 2.2688,
"step": 102
},
{
"epoch": 0.04455378626078023,
"grad_norm": 8.01554012298584,
"learning_rate": 1.9317585301837272e-05,
"loss": 2.1017,
"step": 103
},
{
"epoch": 0.04498634729243829,
"grad_norm": 7.774407386779785,
"learning_rate": 1.930883639545057e-05,
"loss": 2.0725,
"step": 104
},
{
"epoch": 0.045418908324096355,
"grad_norm": 8.284996032714844,
"learning_rate": 1.9300087489063867e-05,
"loss": 2.0856,
"step": 105
},
{
"epoch": 0.04585146935575441,
"grad_norm": 10.201037406921387,
"learning_rate": 1.9291338582677166e-05,
"loss": 2.3246,
"step": 106
},
{
"epoch": 0.046284030387412474,
"grad_norm": 7.818156719207764,
"learning_rate": 1.9282589676290465e-05,
"loss": 2.041,
"step": 107
},
{
"epoch": 0.04671659141907054,
"grad_norm": 10.078817367553711,
"learning_rate": 1.9273840769903765e-05,
"loss": 2.0881,
"step": 108
},
{
"epoch": 0.04714915245072859,
"grad_norm": 7.91151762008667,
"learning_rate": 1.9265091863517064e-05,
"loss": 2.0693,
"step": 109
},
{
"epoch": 0.047581713482386656,
"grad_norm": 8.433507919311523,
"learning_rate": 1.925634295713036e-05,
"loss": 2.1478,
"step": 110
},
{
"epoch": 0.04801427451404472,
"grad_norm": 8.348559379577637,
"learning_rate": 1.924759405074366e-05,
"loss": 2.0982,
"step": 111
},
{
"epoch": 0.048446835545702775,
"grad_norm": 8.86949348449707,
"learning_rate": 1.9238845144356958e-05,
"loss": 2.0297,
"step": 112
},
{
"epoch": 0.04887939657736084,
"grad_norm": 8.434800148010254,
"learning_rate": 1.9230096237970254e-05,
"loss": 2.1681,
"step": 113
},
{
"epoch": 0.049311957609018894,
"grad_norm": 9.036213874816895,
"learning_rate": 1.9221347331583553e-05,
"loss": 2.047,
"step": 114
},
{
"epoch": 0.04974451864067696,
"grad_norm": 9.136820793151855,
"learning_rate": 1.9212598425196852e-05,
"loss": 2.0121,
"step": 115
},
{
"epoch": 0.05017707967233502,
"grad_norm": 10.170919418334961,
"learning_rate": 1.9203849518810148e-05,
"loss": 2.1075,
"step": 116
},
{
"epoch": 0.050609640703993077,
"grad_norm": 8.642443656921387,
"learning_rate": 1.919510061242345e-05,
"loss": 2.1603,
"step": 117
},
{
"epoch": 0.05104220173565114,
"grad_norm": 8.400376319885254,
"learning_rate": 1.9186351706036747e-05,
"loss": 2.0528,
"step": 118
},
{
"epoch": 0.0514747627673092,
"grad_norm": 10.212697982788086,
"learning_rate": 1.9177602799650046e-05,
"loss": 2.0214,
"step": 119
},
{
"epoch": 0.05190732379896726,
"grad_norm": 8.315373420715332,
"learning_rate": 1.9168853893263345e-05,
"loss": 2.0902,
"step": 120
},
{
"epoch": 0.05233988483062532,
"grad_norm": 9.261432647705078,
"learning_rate": 1.916010498687664e-05,
"loss": 1.9828,
"step": 121
},
{
"epoch": 0.052772445862283385,
"grad_norm": 9.281742095947266,
"learning_rate": 1.915135608048994e-05,
"loss": 2.0072,
"step": 122
},
{
"epoch": 0.05320500689394144,
"grad_norm": 9.036360740661621,
"learning_rate": 1.914260717410324e-05,
"loss": 2.0366,
"step": 123
},
{
"epoch": 0.053637567925599504,
"grad_norm": 9.62345027923584,
"learning_rate": 1.9133858267716535e-05,
"loss": 2.0936,
"step": 124
},
{
"epoch": 0.05407012895725756,
"grad_norm": 8.963865280151367,
"learning_rate": 1.9125109361329834e-05,
"loss": 1.9454,
"step": 125
},
{
"epoch": 0.05450268998891562,
"grad_norm": 9.449174880981445,
"learning_rate": 1.9116360454943133e-05,
"loss": 2.0623,
"step": 126
},
{
"epoch": 0.054935251020573686,
"grad_norm": 9.598356246948242,
"learning_rate": 1.9107611548556433e-05,
"loss": 2.0225,
"step": 127
},
{
"epoch": 0.05536781205223174,
"grad_norm": 8.742138862609863,
"learning_rate": 1.9098862642169732e-05,
"loss": 2.0985,
"step": 128
},
{
"epoch": 0.055800373083889805,
"grad_norm": 9.00941276550293,
"learning_rate": 1.9090113735783028e-05,
"loss": 1.9905,
"step": 129
},
{
"epoch": 0.05623293411554787,
"grad_norm": 10.214828491210938,
"learning_rate": 1.9081364829396327e-05,
"loss": 2.0573,
"step": 130
},
{
"epoch": 0.056665495147205924,
"grad_norm": 8.646235466003418,
"learning_rate": 1.9072615923009626e-05,
"loss": 1.9669,
"step": 131
},
{
"epoch": 0.05709805617886399,
"grad_norm": 9.02608585357666,
"learning_rate": 1.9063867016622922e-05,
"loss": 1.9952,
"step": 132
},
{
"epoch": 0.05753061721052205,
"grad_norm": 9.741292953491211,
"learning_rate": 1.905511811023622e-05,
"loss": 2.0118,
"step": 133
},
{
"epoch": 0.057963178242180106,
"grad_norm": 9.34339427947998,
"learning_rate": 1.904636920384952e-05,
"loss": 2.0352,
"step": 134
},
{
"epoch": 0.05839573927383817,
"grad_norm": 9.542576789855957,
"learning_rate": 1.9037620297462816e-05,
"loss": 2.0067,
"step": 135
},
{
"epoch": 0.058828300305496226,
"grad_norm": 9.324414253234863,
"learning_rate": 1.902887139107612e-05,
"loss": 2.0344,
"step": 136
},
{
"epoch": 0.05926086133715429,
"grad_norm": 10.529446601867676,
"learning_rate": 1.9020122484689415e-05,
"loss": 1.905,
"step": 137
},
{
"epoch": 0.05969342236881235,
"grad_norm": 9.371881484985352,
"learning_rate": 1.9011373578302714e-05,
"loss": 1.9717,
"step": 138
},
{
"epoch": 0.06012598340047041,
"grad_norm": 10.534412384033203,
"learning_rate": 1.9002624671916013e-05,
"loss": 2.1448,
"step": 139
},
{
"epoch": 0.06055854443212847,
"grad_norm": 9.028956413269043,
"learning_rate": 1.899387576552931e-05,
"loss": 2.0132,
"step": 140
},
{
"epoch": 0.060991105463786534,
"grad_norm": 9.384864807128906,
"learning_rate": 1.8985126859142608e-05,
"loss": 1.9173,
"step": 141
},
{
"epoch": 0.06142366649544459,
"grad_norm": 9.299654960632324,
"learning_rate": 1.8976377952755907e-05,
"loss": 1.9669,
"step": 142
},
{
"epoch": 0.06185622752710265,
"grad_norm": 10.067913055419922,
"learning_rate": 1.8967629046369206e-05,
"loss": 1.9765,
"step": 143
},
{
"epoch": 0.062288788558760716,
"grad_norm": 10.266395568847656,
"learning_rate": 1.8958880139982502e-05,
"loss": 1.9275,
"step": 144
},
{
"epoch": 0.06272134959041878,
"grad_norm": 9.555045127868652,
"learning_rate": 1.89501312335958e-05,
"loss": 1.9977,
"step": 145
},
{
"epoch": 0.06315391062207683,
"grad_norm": 9.367684364318848,
"learning_rate": 1.89413823272091e-05,
"loss": 1.9813,
"step": 146
},
{
"epoch": 0.06358647165373489,
"grad_norm": 9.195287704467773,
"learning_rate": 1.89326334208224e-05,
"loss": 1.982,
"step": 147
},
{
"epoch": 0.06401903268539295,
"grad_norm": 11.219182014465332,
"learning_rate": 1.89238845144357e-05,
"loss": 2.0398,
"step": 148
},
{
"epoch": 0.06445159371705102,
"grad_norm": 10.429877281188965,
"learning_rate": 1.8915135608048995e-05,
"loss": 1.9312,
"step": 149
},
{
"epoch": 0.06488415474870908,
"grad_norm": 10.788617134094238,
"learning_rate": 1.8906386701662294e-05,
"loss": 1.9711,
"step": 150
},
{
"epoch": 0.06531671578036713,
"grad_norm": 11.090737342834473,
"learning_rate": 1.8897637795275593e-05,
"loss": 2.0233,
"step": 151
},
{
"epoch": 0.06574927681202519,
"grad_norm": 10.588007926940918,
"learning_rate": 1.888888888888889e-05,
"loss": 2.0727,
"step": 152
},
{
"epoch": 0.06618183784368326,
"grad_norm": 10.38557243347168,
"learning_rate": 1.888013998250219e-05,
"loss": 1.9655,
"step": 153
},
{
"epoch": 0.06661439887534132,
"grad_norm": 9.603656768798828,
"learning_rate": 1.8871391076115488e-05,
"loss": 1.9975,
"step": 154
},
{
"epoch": 0.06704695990699938,
"grad_norm": 9.86426830291748,
"learning_rate": 1.8862642169728787e-05,
"loss": 1.9229,
"step": 155
},
{
"epoch": 0.06747952093865744,
"grad_norm": 9.31710147857666,
"learning_rate": 1.8853893263342086e-05,
"loss": 1.9682,
"step": 156
},
{
"epoch": 0.0679120819703155,
"grad_norm": 10.599958419799805,
"learning_rate": 1.8845144356955382e-05,
"loss": 1.9162,
"step": 157
},
{
"epoch": 0.06834464300197356,
"grad_norm": 9.712906837463379,
"learning_rate": 1.883639545056868e-05,
"loss": 1.9684,
"step": 158
},
{
"epoch": 0.06877720403363162,
"grad_norm": 10.723039627075195,
"learning_rate": 1.882764654418198e-05,
"loss": 1.9253,
"step": 159
},
{
"epoch": 0.06920976506528968,
"grad_norm": 9.580291748046875,
"learning_rate": 1.8818897637795276e-05,
"loss": 2.0012,
"step": 160
},
{
"epoch": 0.06964232609694775,
"grad_norm": 10.511956214904785,
"learning_rate": 1.8810148731408575e-05,
"loss": 1.9845,
"step": 161
},
{
"epoch": 0.0700748871286058,
"grad_norm": 11.413307189941406,
"learning_rate": 1.8801399825021874e-05,
"loss": 1.9733,
"step": 162
},
{
"epoch": 0.07050744816026386,
"grad_norm": 9.986128807067871,
"learning_rate": 1.879265091863517e-05,
"loss": 1.8991,
"step": 163
},
{
"epoch": 0.07094000919192192,
"grad_norm": 10.067076683044434,
"learning_rate": 1.878390201224847e-05,
"loss": 1.9415,
"step": 164
},
{
"epoch": 0.07137257022357998,
"grad_norm": 10.817390441894531,
"learning_rate": 1.877515310586177e-05,
"loss": 1.965,
"step": 165
},
{
"epoch": 0.07180513125523805,
"grad_norm": 10.907906532287598,
"learning_rate": 1.8766404199475068e-05,
"loss": 1.8497,
"step": 166
},
{
"epoch": 0.07223769228689611,
"grad_norm": 12.694193840026855,
"learning_rate": 1.8757655293088367e-05,
"loss": 1.8588,
"step": 167
},
{
"epoch": 0.07267025331855416,
"grad_norm": 11.461287498474121,
"learning_rate": 1.8748906386701663e-05,
"loss": 1.9708,
"step": 168
},
{
"epoch": 0.07310281435021222,
"grad_norm": 12.703743934631348,
"learning_rate": 1.8740157480314962e-05,
"loss": 1.9838,
"step": 169
},
{
"epoch": 0.07353537538187029,
"grad_norm": 9.77420425415039,
"learning_rate": 1.873140857392826e-05,
"loss": 1.9551,
"step": 170
},
{
"epoch": 0.07396793641352835,
"grad_norm": 11.191120147705078,
"learning_rate": 1.8722659667541557e-05,
"loss": 2.001,
"step": 171
},
{
"epoch": 0.07440049744518641,
"grad_norm": 9.07374095916748,
"learning_rate": 1.8713910761154856e-05,
"loss": 1.874,
"step": 172
},
{
"epoch": 0.07483305847684446,
"grad_norm": 10.750893592834473,
"learning_rate": 1.8705161854768156e-05,
"loss": 1.8673,
"step": 173
},
{
"epoch": 0.07526561950850252,
"grad_norm": 10.637674331665039,
"learning_rate": 1.8696412948381455e-05,
"loss": 1.9057,
"step": 174
},
{
"epoch": 0.07569818054016059,
"grad_norm": 11.109024047851562,
"learning_rate": 1.8687664041994754e-05,
"loss": 1.9262,
"step": 175
},
{
"epoch": 0.07613074157181865,
"grad_norm": 10.31051254272461,
"learning_rate": 1.867891513560805e-05,
"loss": 1.9316,
"step": 176
},
{
"epoch": 0.07656330260347671,
"grad_norm": 10.382652282714844,
"learning_rate": 1.867016622922135e-05,
"loss": 1.8514,
"step": 177
},
{
"epoch": 0.07699586363513478,
"grad_norm": 11.28693962097168,
"learning_rate": 1.8661417322834648e-05,
"loss": 1.9258,
"step": 178
},
{
"epoch": 0.07742842466679282,
"grad_norm": 9.876675605773926,
"learning_rate": 1.8652668416447944e-05,
"loss": 1.935,
"step": 179
},
{
"epoch": 0.07786098569845089,
"grad_norm": 10.760346412658691,
"learning_rate": 1.8643919510061243e-05,
"loss": 1.8479,
"step": 180
},
{
"epoch": 0.07829354673010895,
"grad_norm": 10.346819877624512,
"learning_rate": 1.8635170603674542e-05,
"loss": 1.8821,
"step": 181
},
{
"epoch": 0.07872610776176701,
"grad_norm": 10.750500679016113,
"learning_rate": 1.8626421697287838e-05,
"loss": 1.8996,
"step": 182
},
{
"epoch": 0.07915866879342508,
"grad_norm": 11.348615646362305,
"learning_rate": 1.8617672790901137e-05,
"loss": 1.8542,
"step": 183
},
{
"epoch": 0.07959122982508313,
"grad_norm": 10.184338569641113,
"learning_rate": 1.8608923884514437e-05,
"loss": 1.9138,
"step": 184
},
{
"epoch": 0.08002379085674119,
"grad_norm": 12.307705879211426,
"learning_rate": 1.8600174978127736e-05,
"loss": 1.8653,
"step": 185
},
{
"epoch": 0.08045635188839925,
"grad_norm": 9.707239151000977,
"learning_rate": 1.8591426071741035e-05,
"loss": 1.9195,
"step": 186
},
{
"epoch": 0.08088891292005732,
"grad_norm": 10.568989753723145,
"learning_rate": 1.858267716535433e-05,
"loss": 1.8921,
"step": 187
},
{
"epoch": 0.08132147395171538,
"grad_norm": 10.401839256286621,
"learning_rate": 1.857392825896763e-05,
"loss": 1.9463,
"step": 188
},
{
"epoch": 0.08175403498337344,
"grad_norm": 11.29499626159668,
"learning_rate": 1.856517935258093e-05,
"loss": 1.8881,
"step": 189
},
{
"epoch": 0.08218659601503149,
"grad_norm": 12.853240013122559,
"learning_rate": 1.855643044619423e-05,
"loss": 1.8063,
"step": 190
},
{
"epoch": 0.08261915704668955,
"grad_norm": 12.939081192016602,
"learning_rate": 1.8547681539807524e-05,
"loss": 1.8993,
"step": 191
},
{
"epoch": 0.08305171807834762,
"grad_norm": 11.397017478942871,
"learning_rate": 1.8538932633420824e-05,
"loss": 1.9577,
"step": 192
},
{
"epoch": 0.08348427911000568,
"grad_norm": 10.588729858398438,
"learning_rate": 1.8530183727034123e-05,
"loss": 1.9725,
"step": 193
},
{
"epoch": 0.08391684014166374,
"grad_norm": 10.367941856384277,
"learning_rate": 1.8521434820647422e-05,
"loss": 1.9014,
"step": 194
},
{
"epoch": 0.08434940117332179,
"grad_norm": 10.358473777770996,
"learning_rate": 1.851268591426072e-05,
"loss": 1.9505,
"step": 195
},
{
"epoch": 0.08478196220497985,
"grad_norm": 10.932351112365723,
"learning_rate": 1.8503937007874017e-05,
"loss": 1.9628,
"step": 196
},
{
"epoch": 0.08521452323663792,
"grad_norm": 10.701963424682617,
"learning_rate": 1.8495188101487316e-05,
"loss": 1.8751,
"step": 197
},
{
"epoch": 0.08564708426829598,
"grad_norm": 12.282137870788574,
"learning_rate": 1.8486439195100615e-05,
"loss": 1.7744,
"step": 198
},
{
"epoch": 0.08607964529995404,
"grad_norm": 11.679487228393555,
"learning_rate": 1.847769028871391e-05,
"loss": 1.8378,
"step": 199
},
{
"epoch": 0.08651220633161211,
"grad_norm": 15.917386054992676,
"learning_rate": 1.846894138232721e-05,
"loss": 1.8038,
"step": 200
},
{
"epoch": 0.08694476736327016,
"grad_norm": 13.143421173095703,
"learning_rate": 1.846019247594051e-05,
"loss": 1.9285,
"step": 201
},
{
"epoch": 0.08737732839492822,
"grad_norm": 12.354574203491211,
"learning_rate": 1.8451443569553805e-05,
"loss": 1.7922,
"step": 202
},
{
"epoch": 0.08780988942658628,
"grad_norm": 13.790152549743652,
"learning_rate": 1.8442694663167108e-05,
"loss": 1.9562,
"step": 203
},
{
"epoch": 0.08824245045824435,
"grad_norm": 11.429886817932129,
"learning_rate": 1.8433945756780404e-05,
"loss": 1.7733,
"step": 204
},
{
"epoch": 0.08867501148990241,
"grad_norm": 11.52943229675293,
"learning_rate": 1.8425196850393703e-05,
"loss": 1.8627,
"step": 205
},
{
"epoch": 0.08910757252156046,
"grad_norm": 11.378132820129395,
"learning_rate": 1.8416447944007002e-05,
"loss": 1.7967,
"step": 206
},
{
"epoch": 0.08954013355321852,
"grad_norm": 10.589323997497559,
"learning_rate": 1.8407699037620298e-05,
"loss": 1.9194,
"step": 207
},
{
"epoch": 0.08997269458487658,
"grad_norm": 10.294939994812012,
"learning_rate": 1.8398950131233597e-05,
"loss": 1.8982,
"step": 208
},
{
"epoch": 0.09040525561653465,
"grad_norm": 10.35794448852539,
"learning_rate": 1.8390201224846896e-05,
"loss": 1.9172,
"step": 209
},
{
"epoch": 0.09083781664819271,
"grad_norm": 11.790461540222168,
"learning_rate": 1.8381452318460192e-05,
"loss": 1.8459,
"step": 210
},
{
"epoch": 0.09127037767985077,
"grad_norm": 12.333897590637207,
"learning_rate": 1.837270341207349e-05,
"loss": 1.7778,
"step": 211
},
{
"epoch": 0.09170293871150882,
"grad_norm": 12.425847053527832,
"learning_rate": 1.836395450568679e-05,
"loss": 1.8701,
"step": 212
},
{
"epoch": 0.09213549974316688,
"grad_norm": 11.710013389587402,
"learning_rate": 1.835520559930009e-05,
"loss": 1.7047,
"step": 213
},
{
"epoch": 0.09256806077482495,
"grad_norm": 11.748705863952637,
"learning_rate": 1.834645669291339e-05,
"loss": 1.8291,
"step": 214
},
{
"epoch": 0.09300062180648301,
"grad_norm": 11.944961547851562,
"learning_rate": 1.8337707786526685e-05,
"loss": 1.8243,
"step": 215
},
{
"epoch": 0.09343318283814107,
"grad_norm": 11.755900382995605,
"learning_rate": 1.8328958880139984e-05,
"loss": 1.883,
"step": 216
},
{
"epoch": 0.09386574386979912,
"grad_norm": 12.01804256439209,
"learning_rate": 1.8320209973753283e-05,
"loss": 1.8377,
"step": 217
},
{
"epoch": 0.09429830490145719,
"grad_norm": 11.53456974029541,
"learning_rate": 1.831146106736658e-05,
"loss": 1.7949,
"step": 218
},
{
"epoch": 0.09473086593311525,
"grad_norm": 10.804949760437012,
"learning_rate": 1.830271216097988e-05,
"loss": 1.8741,
"step": 219
},
{
"epoch": 0.09516342696477331,
"grad_norm": 11.649739265441895,
"learning_rate": 1.8293963254593178e-05,
"loss": 1.8424,
"step": 220
},
{
"epoch": 0.09559598799643138,
"grad_norm": 11.372157096862793,
"learning_rate": 1.8285214348206473e-05,
"loss": 1.7853,
"step": 221
},
{
"epoch": 0.09602854902808944,
"grad_norm": 10.01382064819336,
"learning_rate": 1.8276465441819776e-05,
"loss": 1.8669,
"step": 222
},
{
"epoch": 0.09646111005974749,
"grad_norm": 11.315641403198242,
"learning_rate": 1.8267716535433072e-05,
"loss": 1.7636,
"step": 223
},
{
"epoch": 0.09689367109140555,
"grad_norm": 12.253576278686523,
"learning_rate": 1.825896762904637e-05,
"loss": 1.8279,
"step": 224
},
{
"epoch": 0.09732623212306361,
"grad_norm": 11.940933227539062,
"learning_rate": 1.825021872265967e-05,
"loss": 1.9796,
"step": 225
},
{
"epoch": 0.09775879315472168,
"grad_norm": 11.986372947692871,
"learning_rate": 1.8241469816272966e-05,
"loss": 1.8362,
"step": 226
},
{
"epoch": 0.09819135418637974,
"grad_norm": 12.15494155883789,
"learning_rate": 1.8232720909886265e-05,
"loss": 1.8801,
"step": 227
},
{
"epoch": 0.09862391521803779,
"grad_norm": 11.447915077209473,
"learning_rate": 1.8223972003499564e-05,
"loss": 1.8247,
"step": 228
},
{
"epoch": 0.09905647624969585,
"grad_norm": 11.581623077392578,
"learning_rate": 1.821522309711286e-05,
"loss": 1.8369,
"step": 229
},
{
"epoch": 0.09948903728135391,
"grad_norm": 11.14620590209961,
"learning_rate": 1.820647419072616e-05,
"loss": 1.8653,
"step": 230
},
{
"epoch": 0.09992159831301198,
"grad_norm": 12.83015251159668,
"learning_rate": 1.819772528433946e-05,
"loss": 1.8845,
"step": 231
},
{
"epoch": 0.10035415934467004,
"grad_norm": 11.10083293914795,
"learning_rate": 1.8188976377952758e-05,
"loss": 1.837,
"step": 232
},
{
"epoch": 0.1007867203763281,
"grad_norm": 12.611995697021484,
"learning_rate": 1.8180227471566057e-05,
"loss": 1.8799,
"step": 233
},
{
"epoch": 0.10121928140798615,
"grad_norm": 14.688155174255371,
"learning_rate": 1.8171478565179353e-05,
"loss": 1.9175,
"step": 234
},
{
"epoch": 0.10165184243964422,
"grad_norm": 11.024300575256348,
"learning_rate": 1.8162729658792652e-05,
"loss": 1.7136,
"step": 235
},
{
"epoch": 0.10208440347130228,
"grad_norm": 14.889878273010254,
"learning_rate": 1.815398075240595e-05,
"loss": 1.7828,
"step": 236
},
{
"epoch": 0.10251696450296034,
"grad_norm": 11.804996490478516,
"learning_rate": 1.814523184601925e-05,
"loss": 1.8391,
"step": 237
},
{
"epoch": 0.1029495255346184,
"grad_norm": 11.816452026367188,
"learning_rate": 1.8136482939632546e-05,
"loss": 1.8498,
"step": 238
},
{
"epoch": 0.10338208656627645,
"grad_norm": 13.521631240844727,
"learning_rate": 1.8127734033245846e-05,
"loss": 1.7968,
"step": 239
},
{
"epoch": 0.10381464759793452,
"grad_norm": 11.820068359375,
"learning_rate": 1.8118985126859145e-05,
"loss": 1.8115,
"step": 240
},
{
"epoch": 0.10424720862959258,
"grad_norm": 14.849625587463379,
"learning_rate": 1.811023622047244e-05,
"loss": 1.7471,
"step": 241
},
{
"epoch": 0.10467976966125064,
"grad_norm": 11.63190746307373,
"learning_rate": 1.8101487314085743e-05,
"loss": 1.7383,
"step": 242
},
{
"epoch": 0.1051123306929087,
"grad_norm": 11.374736785888672,
"learning_rate": 1.809273840769904e-05,
"loss": 1.8412,
"step": 243
},
{
"epoch": 0.10554489172456677,
"grad_norm": 14.540604591369629,
"learning_rate": 1.8083989501312338e-05,
"loss": 1.7489,
"step": 244
},
{
"epoch": 0.10597745275622482,
"grad_norm": 12.734722137451172,
"learning_rate": 1.8075240594925637e-05,
"loss": 1.8997,
"step": 245
},
{
"epoch": 0.10641001378788288,
"grad_norm": 13.632735252380371,
"learning_rate": 1.8066491688538933e-05,
"loss": 1.8283,
"step": 246
},
{
"epoch": 0.10684257481954094,
"grad_norm": 13.188791275024414,
"learning_rate": 1.8057742782152232e-05,
"loss": 1.7936,
"step": 247
},
{
"epoch": 0.10727513585119901,
"grad_norm": 12.36187744140625,
"learning_rate": 1.804899387576553e-05,
"loss": 1.7985,
"step": 248
},
{
"epoch": 0.10770769688285707,
"grad_norm": 13.20405101776123,
"learning_rate": 1.8040244969378827e-05,
"loss": 1.7463,
"step": 249
},
{
"epoch": 0.10814025791451512,
"grad_norm": 12.463266372680664,
"learning_rate": 1.8031496062992127e-05,
"loss": 1.7374,
"step": 250
},
{
"epoch": 0.10857281894617318,
"grad_norm": 13.607605934143066,
"learning_rate": 1.8022747156605426e-05,
"loss": 1.7598,
"step": 251
},
{
"epoch": 0.10900537997783125,
"grad_norm": 13.48353099822998,
"learning_rate": 1.8013998250218725e-05,
"loss": 1.7661,
"step": 252
},
{
"epoch": 0.10943794100948931,
"grad_norm": 13.995173454284668,
"learning_rate": 1.8005249343832024e-05,
"loss": 1.7972,
"step": 253
},
{
"epoch": 0.10987050204114737,
"grad_norm": 14.007890701293945,
"learning_rate": 1.799650043744532e-05,
"loss": 1.7641,
"step": 254
},
{
"epoch": 0.11030306307280544,
"grad_norm": 16.224395751953125,
"learning_rate": 1.798775153105862e-05,
"loss": 1.8158,
"step": 255
},
{
"epoch": 0.11073562410446348,
"grad_norm": 15.336459159851074,
"learning_rate": 1.797900262467192e-05,
"loss": 1.7918,
"step": 256
},
{
"epoch": 0.11116818513612155,
"grad_norm": 14.77893352508545,
"learning_rate": 1.7970253718285214e-05,
"loss": 1.8369,
"step": 257
},
{
"epoch": 0.11160074616777961,
"grad_norm": 12.808744430541992,
"learning_rate": 1.7961504811898514e-05,
"loss": 1.7846,
"step": 258
},
{
"epoch": 0.11203330719943767,
"grad_norm": 11.46886920928955,
"learning_rate": 1.7952755905511813e-05,
"loss": 1.8555,
"step": 259
},
{
"epoch": 0.11246586823109574,
"grad_norm": 13.576665878295898,
"learning_rate": 1.794400699912511e-05,
"loss": 1.8384,
"step": 260
},
{
"epoch": 0.11289842926275379,
"grad_norm": 13.28268814086914,
"learning_rate": 1.793525809273841e-05,
"loss": 1.809,
"step": 261
},
{
"epoch": 0.11333099029441185,
"grad_norm": 11.743298530578613,
"learning_rate": 1.7926509186351707e-05,
"loss": 1.7515,
"step": 262
},
{
"epoch": 0.11376355132606991,
"grad_norm": 12.474933624267578,
"learning_rate": 1.7917760279965006e-05,
"loss": 1.7624,
"step": 263
},
{
"epoch": 0.11419611235772797,
"grad_norm": 13.138879776000977,
"learning_rate": 1.7909011373578305e-05,
"loss": 1.8352,
"step": 264
},
{
"epoch": 0.11462867338938604,
"grad_norm": 11.728463172912598,
"learning_rate": 1.79002624671916e-05,
"loss": 1.8015,
"step": 265
},
{
"epoch": 0.1150612344210441,
"grad_norm": 11.462135314941406,
"learning_rate": 1.78915135608049e-05,
"loss": 1.7868,
"step": 266
},
{
"epoch": 0.11549379545270215,
"grad_norm": 12.589030265808105,
"learning_rate": 1.78827646544182e-05,
"loss": 1.6825,
"step": 267
},
{
"epoch": 0.11592635648436021,
"grad_norm": 14.064338684082031,
"learning_rate": 1.7874015748031495e-05,
"loss": 1.7202,
"step": 268
},
{
"epoch": 0.11635891751601828,
"grad_norm": 12.65044116973877,
"learning_rate": 1.7865266841644795e-05,
"loss": 1.7865,
"step": 269
},
{
"epoch": 0.11679147854767634,
"grad_norm": 13.943512916564941,
"learning_rate": 1.7856517935258094e-05,
"loss": 1.7506,
"step": 270
},
{
"epoch": 0.1172240395793344,
"grad_norm": 13.109914779663086,
"learning_rate": 1.7847769028871393e-05,
"loss": 1.7659,
"step": 271
},
{
"epoch": 0.11765660061099245,
"grad_norm": 12.937542915344238,
"learning_rate": 1.7839020122484692e-05,
"loss": 1.8309,
"step": 272
},
{
"epoch": 0.11808916164265051,
"grad_norm": 12.564200401306152,
"learning_rate": 1.7830271216097988e-05,
"loss": 1.8121,
"step": 273
},
{
"epoch": 0.11852172267430858,
"grad_norm": 12.3132905960083,
"learning_rate": 1.7821522309711287e-05,
"loss": 1.8254,
"step": 274
},
{
"epoch": 0.11895428370596664,
"grad_norm": 10.902737617492676,
"learning_rate": 1.7812773403324587e-05,
"loss": 1.7647,
"step": 275
},
{
"epoch": 0.1193868447376247,
"grad_norm": 11.777158737182617,
"learning_rate": 1.7804024496937882e-05,
"loss": 1.776,
"step": 276
},
{
"epoch": 0.11981940576928277,
"grad_norm": 13.244769096374512,
"learning_rate": 1.779527559055118e-05,
"loss": 1.7909,
"step": 277
},
{
"epoch": 0.12025196680094082,
"grad_norm": 13.334715843200684,
"learning_rate": 1.778652668416448e-05,
"loss": 1.7326,
"step": 278
},
{
"epoch": 0.12068452783259888,
"grad_norm": 11.51339340209961,
"learning_rate": 1.7777777777777777e-05,
"loss": 1.7133,
"step": 279
},
{
"epoch": 0.12111708886425694,
"grad_norm": 13.407567024230957,
"learning_rate": 1.776902887139108e-05,
"loss": 1.8828,
"step": 280
},
{
"epoch": 0.121549649895915,
"grad_norm": 10.572535514831543,
"learning_rate": 1.7760279965004375e-05,
"loss": 1.8534,
"step": 281
},
{
"epoch": 0.12198221092757307,
"grad_norm": 12.543910026550293,
"learning_rate": 1.7751531058617674e-05,
"loss": 1.7876,
"step": 282
},
{
"epoch": 0.12241477195923112,
"grad_norm": 12.414093017578125,
"learning_rate": 1.7742782152230973e-05,
"loss": 1.7249,
"step": 283
},
{
"epoch": 0.12284733299088918,
"grad_norm": 12.800058364868164,
"learning_rate": 1.7734033245844273e-05,
"loss": 1.8623,
"step": 284
},
{
"epoch": 0.12327989402254724,
"grad_norm": 14.119792938232422,
"learning_rate": 1.772528433945757e-05,
"loss": 1.8873,
"step": 285
},
{
"epoch": 0.1237124550542053,
"grad_norm": 15.86700439453125,
"learning_rate": 1.7716535433070868e-05,
"loss": 1.6621,
"step": 286
},
{
"epoch": 0.12414501608586337,
"grad_norm": 12.095022201538086,
"learning_rate": 1.7707786526684167e-05,
"loss": 1.8166,
"step": 287
},
{
"epoch": 0.12457757711752143,
"grad_norm": 14.493571281433105,
"learning_rate": 1.7699037620297463e-05,
"loss": 1.9194,
"step": 288
},
{
"epoch": 0.1250101381491795,
"grad_norm": 12.202247619628906,
"learning_rate": 1.7690288713910762e-05,
"loss": 1.7355,
"step": 289
},
{
"epoch": 0.12544269918083756,
"grad_norm": 13.065779685974121,
"learning_rate": 1.768153980752406e-05,
"loss": 1.7089,
"step": 290
},
{
"epoch": 0.1258752602124956,
"grad_norm": 11.993766784667969,
"learning_rate": 1.767279090113736e-05,
"loss": 1.7966,
"step": 291
},
{
"epoch": 0.12630782124415366,
"grad_norm": 12.460329055786133,
"learning_rate": 1.766404199475066e-05,
"loss": 1.8666,
"step": 292
},
{
"epoch": 0.12674038227581172,
"grad_norm": 12.044783592224121,
"learning_rate": 1.7655293088363955e-05,
"loss": 1.7071,
"step": 293
},
{
"epoch": 0.12717294330746978,
"grad_norm": 14.959409713745117,
"learning_rate": 1.7646544181977255e-05,
"loss": 1.8463,
"step": 294
},
{
"epoch": 0.12760550433912785,
"grad_norm": 15.096085548400879,
"learning_rate": 1.7637795275590554e-05,
"loss": 1.8419,
"step": 295
},
{
"epoch": 0.1280380653707859,
"grad_norm": 12.444029808044434,
"learning_rate": 1.762904636920385e-05,
"loss": 1.7926,
"step": 296
},
{
"epoch": 0.12847062640244397,
"grad_norm": 11.794731140136719,
"learning_rate": 1.762029746281715e-05,
"loss": 1.7099,
"step": 297
},
{
"epoch": 0.12890318743410203,
"grad_norm": 12.06802749633789,
"learning_rate": 1.7611548556430448e-05,
"loss": 1.7576,
"step": 298
},
{
"epoch": 0.1293357484657601,
"grad_norm": 13.183172225952148,
"learning_rate": 1.7602799650043747e-05,
"loss": 1.7683,
"step": 299
},
{
"epoch": 0.12976830949741816,
"grad_norm": 12.662933349609375,
"learning_rate": 1.7594050743657046e-05,
"loss": 1.7571,
"step": 300
},
{
"epoch": 0.13020087052907622,
"grad_norm": 13.42810344696045,
"learning_rate": 1.7585301837270342e-05,
"loss": 1.8103,
"step": 301
},
{
"epoch": 0.13063343156073426,
"grad_norm": 12.349446296691895,
"learning_rate": 1.757655293088364e-05,
"loss": 1.8039,
"step": 302
},
{
"epoch": 0.13106599259239232,
"grad_norm": 12.955143928527832,
"learning_rate": 1.756780402449694e-05,
"loss": 1.7638,
"step": 303
},
{
"epoch": 0.13149855362405038,
"grad_norm": 12.840434074401855,
"learning_rate": 1.7559055118110236e-05,
"loss": 1.6824,
"step": 304
},
{
"epoch": 0.13193111465570845,
"grad_norm": 12.311904907226562,
"learning_rate": 1.7550306211723536e-05,
"loss": 1.7549,
"step": 305
},
{
"epoch": 0.1323636756873665,
"grad_norm": 14.21117877960205,
"learning_rate": 1.7541557305336835e-05,
"loss": 1.8167,
"step": 306
},
{
"epoch": 0.13279623671902457,
"grad_norm": 15.326268196105957,
"learning_rate": 1.753280839895013e-05,
"loss": 1.7197,
"step": 307
},
{
"epoch": 0.13322879775068264,
"grad_norm": 12.805957794189453,
"learning_rate": 1.752405949256343e-05,
"loss": 1.7673,
"step": 308
},
{
"epoch": 0.1336613587823407,
"grad_norm": 14.023848533630371,
"learning_rate": 1.751531058617673e-05,
"loss": 1.684,
"step": 309
},
{
"epoch": 0.13409391981399876,
"grad_norm": 14.051582336425781,
"learning_rate": 1.7506561679790028e-05,
"loss": 1.7981,
"step": 310
},
{
"epoch": 0.13452648084565683,
"grad_norm": 14.332208633422852,
"learning_rate": 1.7497812773403328e-05,
"loss": 1.7263,
"step": 311
},
{
"epoch": 0.1349590418773149,
"grad_norm": 13.481989860534668,
"learning_rate": 1.7489063867016623e-05,
"loss": 1.844,
"step": 312
},
{
"epoch": 0.13539160290897292,
"grad_norm": 13.109614372253418,
"learning_rate": 1.7480314960629923e-05,
"loss": 1.7525,
"step": 313
},
{
"epoch": 0.135824163940631,
"grad_norm": 12.776349067687988,
"learning_rate": 1.7471566054243222e-05,
"loss": 1.8072,
"step": 314
},
{
"epoch": 0.13625672497228905,
"grad_norm": 12.574403762817383,
"learning_rate": 1.7462817147856518e-05,
"loss": 1.7611,
"step": 315
},
{
"epoch": 0.1366892860039471,
"grad_norm": 13.381563186645508,
"learning_rate": 1.7454068241469817e-05,
"loss": 1.6969,
"step": 316
},
{
"epoch": 0.13712184703560518,
"grad_norm": 12.077308654785156,
"learning_rate": 1.7445319335083116e-05,
"loss": 1.8818,
"step": 317
},
{
"epoch": 0.13755440806726324,
"grad_norm": 12.719555854797363,
"learning_rate": 1.7436570428696415e-05,
"loss": 1.7705,
"step": 318
},
{
"epoch": 0.1379869690989213,
"grad_norm": 14.66213607788086,
"learning_rate": 1.7427821522309714e-05,
"loss": 1.6998,
"step": 319
},
{
"epoch": 0.13841953013057937,
"grad_norm": 13.779396057128906,
"learning_rate": 1.741907261592301e-05,
"loss": 1.7427,
"step": 320
},
{
"epoch": 0.13885209116223743,
"grad_norm": 14.436192512512207,
"learning_rate": 1.741032370953631e-05,
"loss": 1.746,
"step": 321
},
{
"epoch": 0.1392846521938955,
"grad_norm": 13.372818946838379,
"learning_rate": 1.740157480314961e-05,
"loss": 1.7303,
"step": 322
},
{
"epoch": 0.13971721322555355,
"grad_norm": 12.75151538848877,
"learning_rate": 1.7392825896762904e-05,
"loss": 1.7359,
"step": 323
},
{
"epoch": 0.1401497742572116,
"grad_norm": 14.832432746887207,
"learning_rate": 1.7384076990376204e-05,
"loss": 1.7595,
"step": 324
},
{
"epoch": 0.14058233528886965,
"grad_norm": 12.097084045410156,
"learning_rate": 1.7375328083989503e-05,
"loss": 1.8151,
"step": 325
},
{
"epoch": 0.14101489632052772,
"grad_norm": 15.27542781829834,
"learning_rate": 1.73665791776028e-05,
"loss": 1.7799,
"step": 326
},
{
"epoch": 0.14144745735218578,
"grad_norm": 13.651289939880371,
"learning_rate": 1.7357830271216098e-05,
"loss": 1.788,
"step": 327
},
{
"epoch": 0.14188001838384384,
"grad_norm": 12.324203491210938,
"learning_rate": 1.7349081364829397e-05,
"loss": 1.7426,
"step": 328
},
{
"epoch": 0.1423125794155019,
"grad_norm": 13.443849563598633,
"learning_rate": 1.7340332458442696e-05,
"loss": 1.7648,
"step": 329
},
{
"epoch": 0.14274514044715997,
"grad_norm": 15.6445894241333,
"learning_rate": 1.7331583552055995e-05,
"loss": 1.819,
"step": 330
},
{
"epoch": 0.14317770147881803,
"grad_norm": 14.16102409362793,
"learning_rate": 1.7322834645669295e-05,
"loss": 1.7316,
"step": 331
},
{
"epoch": 0.1436102625104761,
"grad_norm": 14.051115989685059,
"learning_rate": 1.731408573928259e-05,
"loss": 1.7395,
"step": 332
},
{
"epoch": 0.14404282354213416,
"grad_norm": 13.74497127532959,
"learning_rate": 1.730533683289589e-05,
"loss": 1.7931,
"step": 333
},
{
"epoch": 0.14447538457379222,
"grad_norm": 14.392535209655762,
"learning_rate": 1.729658792650919e-05,
"loss": 1.8631,
"step": 334
},
{
"epoch": 0.14490794560545026,
"grad_norm": 12.427038192749023,
"learning_rate": 1.7287839020122485e-05,
"loss": 1.7007,
"step": 335
},
{
"epoch": 0.14534050663710832,
"grad_norm": 13.627511024475098,
"learning_rate": 1.7279090113735784e-05,
"loss": 1.7324,
"step": 336
},
{
"epoch": 0.14577306766876638,
"grad_norm": 13.441803932189941,
"learning_rate": 1.7270341207349083e-05,
"loss": 1.7494,
"step": 337
},
{
"epoch": 0.14620562870042444,
"grad_norm": 15.683148384094238,
"learning_rate": 1.7261592300962382e-05,
"loss": 1.6773,
"step": 338
},
{
"epoch": 0.1466381897320825,
"grad_norm": 14.39584732055664,
"learning_rate": 1.725284339457568e-05,
"loss": 1.6515,
"step": 339
},
{
"epoch": 0.14707075076374057,
"grad_norm": 13.478503227233887,
"learning_rate": 1.7244094488188977e-05,
"loss": 1.7104,
"step": 340
},
{
"epoch": 0.14750331179539863,
"grad_norm": 13.521794319152832,
"learning_rate": 1.7235345581802277e-05,
"loss": 1.8039,
"step": 341
},
{
"epoch": 0.1479358728270567,
"grad_norm": 13.772107124328613,
"learning_rate": 1.7226596675415576e-05,
"loss": 1.7705,
"step": 342
},
{
"epoch": 0.14836843385871476,
"grad_norm": 12.936812400817871,
"learning_rate": 1.721784776902887e-05,
"loss": 1.6975,
"step": 343
},
{
"epoch": 0.14880099489037282,
"grad_norm": 13.462157249450684,
"learning_rate": 1.720909886264217e-05,
"loss": 1.7893,
"step": 344
},
{
"epoch": 0.14923355592203089,
"grad_norm": 12.636610984802246,
"learning_rate": 1.720034995625547e-05,
"loss": 1.7366,
"step": 345
},
{
"epoch": 0.14966611695368892,
"grad_norm": 12.809752464294434,
"learning_rate": 1.7191601049868766e-05,
"loss": 1.7415,
"step": 346
},
{
"epoch": 0.15009867798534698,
"grad_norm": 13.976734161376953,
"learning_rate": 1.718285214348207e-05,
"loss": 1.735,
"step": 347
},
{
"epoch": 0.15053123901700505,
"grad_norm": 12.980571746826172,
"learning_rate": 1.7174103237095364e-05,
"loss": 1.7759,
"step": 348
},
{
"epoch": 0.1509638000486631,
"grad_norm": 15.27466106414795,
"learning_rate": 1.7165354330708663e-05,
"loss": 1.7244,
"step": 349
},
{
"epoch": 0.15139636108032117,
"grad_norm": 14.315723419189453,
"learning_rate": 1.7156605424321963e-05,
"loss": 1.7684,
"step": 350
},
{
"epoch": 0.15182892211197924,
"grad_norm": 13.90048599243164,
"learning_rate": 1.714785651793526e-05,
"loss": 1.7475,
"step": 351
},
{
"epoch": 0.1522614831436373,
"grad_norm": 13.61722469329834,
"learning_rate": 1.7139107611548558e-05,
"loss": 1.6943,
"step": 352
},
{
"epoch": 0.15269404417529536,
"grad_norm": 14.208736419677734,
"learning_rate": 1.7130358705161857e-05,
"loss": 1.7269,
"step": 353
},
{
"epoch": 0.15312660520695343,
"grad_norm": 12.213624954223633,
"learning_rate": 1.7121609798775153e-05,
"loss": 1.7076,
"step": 354
},
{
"epoch": 0.1535591662386115,
"grad_norm": 14.757974624633789,
"learning_rate": 1.7112860892388452e-05,
"loss": 1.714,
"step": 355
},
{
"epoch": 0.15399172727026955,
"grad_norm": 13.660282135009766,
"learning_rate": 1.710411198600175e-05,
"loss": 1.7442,
"step": 356
},
{
"epoch": 0.1544242883019276,
"grad_norm": 13.772233963012695,
"learning_rate": 1.709536307961505e-05,
"loss": 1.7667,
"step": 357
},
{
"epoch": 0.15485684933358565,
"grad_norm": 14.523663520812988,
"learning_rate": 1.708661417322835e-05,
"loss": 1.6005,
"step": 358
},
{
"epoch": 0.1552894103652437,
"grad_norm": 14.664237976074219,
"learning_rate": 1.7077865266841645e-05,
"loss": 1.6852,
"step": 359
},
{
"epoch": 0.15572197139690178,
"grad_norm": 14.592924118041992,
"learning_rate": 1.7069116360454945e-05,
"loss": 1.7847,
"step": 360
},
{
"epoch": 0.15615453242855984,
"grad_norm": 13.490509033203125,
"learning_rate": 1.7060367454068244e-05,
"loss": 1.7272,
"step": 361
},
{
"epoch": 0.1565870934602179,
"grad_norm": 14.302976608276367,
"learning_rate": 1.705161854768154e-05,
"loss": 1.6654,
"step": 362
},
{
"epoch": 0.15701965449187597,
"grad_norm": 13.850686073303223,
"learning_rate": 1.704286964129484e-05,
"loss": 1.6631,
"step": 363
},
{
"epoch": 0.15745221552353403,
"grad_norm": 12.717830657958984,
"learning_rate": 1.7034120734908138e-05,
"loss": 1.7526,
"step": 364
},
{
"epoch": 0.1578847765551921,
"grad_norm": 14.529204368591309,
"learning_rate": 1.7025371828521434e-05,
"loss": 1.6707,
"step": 365
},
{
"epoch": 0.15831733758685015,
"grad_norm": 13.817510604858398,
"learning_rate": 1.7016622922134736e-05,
"loss": 1.7353,
"step": 366
},
{
"epoch": 0.15874989861850822,
"grad_norm": 15.481966018676758,
"learning_rate": 1.7007874015748032e-05,
"loss": 1.6683,
"step": 367
},
{
"epoch": 0.15918245965016625,
"grad_norm": 14.049205780029297,
"learning_rate": 1.699912510936133e-05,
"loss": 1.7273,
"step": 368
},
{
"epoch": 0.15961502068182432,
"grad_norm": 14.65227222442627,
"learning_rate": 1.699037620297463e-05,
"loss": 1.6432,
"step": 369
},
{
"epoch": 0.16004758171348238,
"grad_norm": 14.594015121459961,
"learning_rate": 1.6981627296587927e-05,
"loss": 1.8021,
"step": 370
},
{
"epoch": 0.16048014274514044,
"grad_norm": 13.285860061645508,
"learning_rate": 1.6972878390201226e-05,
"loss": 1.6821,
"step": 371
},
{
"epoch": 0.1609127037767985,
"grad_norm": 14.19520378112793,
"learning_rate": 1.6964129483814525e-05,
"loss": 1.6678,
"step": 372
},
{
"epoch": 0.16134526480845657,
"grad_norm": 14.195395469665527,
"learning_rate": 1.695538057742782e-05,
"loss": 1.7482,
"step": 373
},
{
"epoch": 0.16177782584011463,
"grad_norm": 14.017548561096191,
"learning_rate": 1.694663167104112e-05,
"loss": 1.7071,
"step": 374
},
{
"epoch": 0.1622103868717727,
"grad_norm": 14.560656547546387,
"learning_rate": 1.693788276465442e-05,
"loss": 1.711,
"step": 375
},
{
"epoch": 0.16264294790343076,
"grad_norm": 14.667351722717285,
"learning_rate": 1.692913385826772e-05,
"loss": 1.6147,
"step": 376
},
{
"epoch": 0.16307550893508882,
"grad_norm": 15.273613929748535,
"learning_rate": 1.6920384951881018e-05,
"loss": 1.7753,
"step": 377
},
{
"epoch": 0.16350806996674688,
"grad_norm": 15.08775806427002,
"learning_rate": 1.6911636045494317e-05,
"loss": 1.8437,
"step": 378
},
{
"epoch": 0.16394063099840492,
"grad_norm": 16.35053825378418,
"learning_rate": 1.6902887139107613e-05,
"loss": 1.6818,
"step": 379
},
{
"epoch": 0.16437319203006298,
"grad_norm": 14.675270080566406,
"learning_rate": 1.6894138232720912e-05,
"loss": 1.8182,
"step": 380
},
{
"epoch": 0.16480575306172104,
"grad_norm": 12.950959205627441,
"learning_rate": 1.688538932633421e-05,
"loss": 1.6691,
"step": 381
},
{
"epoch": 0.1652383140933791,
"grad_norm": 15.214534759521484,
"learning_rate": 1.6876640419947507e-05,
"loss": 1.7198,
"step": 382
},
{
"epoch": 0.16567087512503717,
"grad_norm": 14.145734786987305,
"learning_rate": 1.6867891513560806e-05,
"loss": 1.8248,
"step": 383
},
{
"epoch": 0.16610343615669523,
"grad_norm": 13.487668991088867,
"learning_rate": 1.6859142607174105e-05,
"loss": 1.7785,
"step": 384
},
{
"epoch": 0.1665359971883533,
"grad_norm": 15.348024368286133,
"learning_rate": 1.68503937007874e-05,
"loss": 1.6533,
"step": 385
},
{
"epoch": 0.16696855822001136,
"grad_norm": 13.304719924926758,
"learning_rate": 1.6841644794400704e-05,
"loss": 1.7462,
"step": 386
},
{
"epoch": 0.16740111925166942,
"grad_norm": 15.1539888381958,
"learning_rate": 1.6832895888014e-05,
"loss": 1.7745,
"step": 387
},
{
"epoch": 0.16783368028332749,
"grad_norm": 14.753242492675781,
"learning_rate": 1.68241469816273e-05,
"loss": 1.7696,
"step": 388
},
{
"epoch": 0.16826624131498555,
"grad_norm": 14.071645736694336,
"learning_rate": 1.6815398075240598e-05,
"loss": 1.7495,
"step": 389
},
{
"epoch": 0.16869880234664358,
"grad_norm": 13.17452335357666,
"learning_rate": 1.6806649168853894e-05,
"loss": 1.6654,
"step": 390
},
{
"epoch": 0.16913136337830165,
"grad_norm": 13.739713668823242,
"learning_rate": 1.6797900262467193e-05,
"loss": 1.871,
"step": 391
},
{
"epoch": 0.1695639244099597,
"grad_norm": 14.904608726501465,
"learning_rate": 1.6789151356080492e-05,
"loss": 1.7308,
"step": 392
},
{
"epoch": 0.16999648544161777,
"grad_norm": 13.76181697845459,
"learning_rate": 1.6780402449693788e-05,
"loss": 1.7218,
"step": 393
},
{
"epoch": 0.17042904647327584,
"grad_norm": 15.682334899902344,
"learning_rate": 1.6771653543307087e-05,
"loss": 1.7135,
"step": 394
},
{
"epoch": 0.1708616075049339,
"grad_norm": 14.770147323608398,
"learning_rate": 1.6762904636920386e-05,
"loss": 1.6647,
"step": 395
},
{
"epoch": 0.17129416853659196,
"grad_norm": 14.868924140930176,
"learning_rate": 1.6754155730533686e-05,
"loss": 1.8061,
"step": 396
},
{
"epoch": 0.17172672956825002,
"grad_norm": 13.417051315307617,
"learning_rate": 1.6745406824146985e-05,
"loss": 1.7588,
"step": 397
},
{
"epoch": 0.1721592905999081,
"grad_norm": 15.186949729919434,
"learning_rate": 1.673665791776028e-05,
"loss": 1.7662,
"step": 398
},
{
"epoch": 0.17259185163156615,
"grad_norm": 14.485387802124023,
"learning_rate": 1.672790901137358e-05,
"loss": 1.7135,
"step": 399
},
{
"epoch": 0.17302441266322421,
"grad_norm": 14.660444259643555,
"learning_rate": 1.671916010498688e-05,
"loss": 1.6872,
"step": 400
},
{
"epoch": 0.17345697369488225,
"grad_norm": 16.875911712646484,
"learning_rate": 1.6710411198600175e-05,
"loss": 1.6135,
"step": 401
},
{
"epoch": 0.1738895347265403,
"grad_norm": 15.121524810791016,
"learning_rate": 1.6701662292213474e-05,
"loss": 1.6156,
"step": 402
},
{
"epoch": 0.17432209575819838,
"grad_norm": 14.13774585723877,
"learning_rate": 1.6692913385826773e-05,
"loss": 1.7311,
"step": 403
},
{
"epoch": 0.17475465678985644,
"grad_norm": 15.118247032165527,
"learning_rate": 1.668416447944007e-05,
"loss": 1.7038,
"step": 404
},
{
"epoch": 0.1751872178215145,
"grad_norm": 17.146011352539062,
"learning_rate": 1.667541557305337e-05,
"loss": 1.6302,
"step": 405
},
{
"epoch": 0.17561977885317256,
"grad_norm": 13.198725700378418,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.7395,
"step": 406
},
{
"epoch": 0.17605233988483063,
"grad_norm": 14.39787483215332,
"learning_rate": 1.6657917760279967e-05,
"loss": 1.7715,
"step": 407
},
{
"epoch": 0.1764849009164887,
"grad_norm": 14.981849670410156,
"learning_rate": 1.6649168853893266e-05,
"loss": 1.6916,
"step": 408
},
{
"epoch": 0.17691746194814675,
"grad_norm": 14.908326148986816,
"learning_rate": 1.6640419947506562e-05,
"loss": 1.7563,
"step": 409
},
{
"epoch": 0.17735002297980482,
"grad_norm": 14.553082466125488,
"learning_rate": 1.663167104111986e-05,
"loss": 1.7202,
"step": 410
},
{
"epoch": 0.17778258401146288,
"grad_norm": 13.793340682983398,
"learning_rate": 1.662292213473316e-05,
"loss": 1.7678,
"step": 411
},
{
"epoch": 0.17821514504312091,
"grad_norm": 16.31795310974121,
"learning_rate": 1.6614173228346456e-05,
"loss": 1.8089,
"step": 412
},
{
"epoch": 0.17864770607477898,
"grad_norm": 17.318607330322266,
"learning_rate": 1.6605424321959755e-05,
"loss": 1.6696,
"step": 413
},
{
"epoch": 0.17908026710643704,
"grad_norm": 13.9689302444458,
"learning_rate": 1.6596675415573054e-05,
"loss": 1.7372,
"step": 414
},
{
"epoch": 0.1795128281380951,
"grad_norm": 14.510354995727539,
"learning_rate": 1.6587926509186354e-05,
"loss": 1.6597,
"step": 415
},
{
"epoch": 0.17994538916975317,
"grad_norm": 15.901954650878906,
"learning_rate": 1.6579177602799653e-05,
"loss": 1.7872,
"step": 416
},
{
"epoch": 0.18037795020141123,
"grad_norm": 13.021835327148438,
"learning_rate": 1.657042869641295e-05,
"loss": 1.6739,
"step": 417
},
{
"epoch": 0.1808105112330693,
"grad_norm": 14.94437313079834,
"learning_rate": 1.6561679790026248e-05,
"loss": 1.6915,
"step": 418
},
{
"epoch": 0.18124307226472736,
"grad_norm": 14.61279296875,
"learning_rate": 1.6552930883639547e-05,
"loss": 1.5963,
"step": 419
},
{
"epoch": 0.18167563329638542,
"grad_norm": 16.396753311157227,
"learning_rate": 1.6544181977252843e-05,
"loss": 1.702,
"step": 420
},
{
"epoch": 0.18210819432804348,
"grad_norm": 16.830820083618164,
"learning_rate": 1.6535433070866142e-05,
"loss": 1.8108,
"step": 421
},
{
"epoch": 0.18254075535970155,
"grad_norm": 15.551237106323242,
"learning_rate": 1.652668416447944e-05,
"loss": 1.7831,
"step": 422
},
{
"epoch": 0.18297331639135958,
"grad_norm": 15.525558471679688,
"learning_rate": 1.651793525809274e-05,
"loss": 1.6718,
"step": 423
},
{
"epoch": 0.18340587742301764,
"grad_norm": 12.82323169708252,
"learning_rate": 1.650918635170604e-05,
"loss": 1.642,
"step": 424
},
{
"epoch": 0.1838384384546757,
"grad_norm": 15.793924331665039,
"learning_rate": 1.650043744531934e-05,
"loss": 1.6871,
"step": 425
},
{
"epoch": 0.18427099948633377,
"grad_norm": 14.773061752319336,
"learning_rate": 1.6491688538932635e-05,
"loss": 1.6956,
"step": 426
},
{
"epoch": 0.18470356051799183,
"grad_norm": 16.91400909423828,
"learning_rate": 1.6482939632545934e-05,
"loss": 1.6336,
"step": 427
},
{
"epoch": 0.1851361215496499,
"grad_norm": 15.226611137390137,
"learning_rate": 1.6474190726159233e-05,
"loss": 1.7333,
"step": 428
},
{
"epoch": 0.18556868258130796,
"grad_norm": 16.962453842163086,
"learning_rate": 1.646544181977253e-05,
"loss": 1.6063,
"step": 429
},
{
"epoch": 0.18600124361296602,
"grad_norm": 16.48338508605957,
"learning_rate": 1.6456692913385828e-05,
"loss": 1.7043,
"step": 430
},
{
"epoch": 0.18643380464462408,
"grad_norm": 14.127233505249023,
"learning_rate": 1.6447944006999127e-05,
"loss": 1.709,
"step": 431
},
{
"epoch": 0.18686636567628215,
"grad_norm": 15.949320793151855,
"learning_rate": 1.6439195100612423e-05,
"loss": 1.7161,
"step": 432
},
{
"epoch": 0.1872989267079402,
"grad_norm": 15.113992691040039,
"learning_rate": 1.6430446194225722e-05,
"loss": 1.7378,
"step": 433
},
{
"epoch": 0.18773148773959825,
"grad_norm": 15.371466636657715,
"learning_rate": 1.642169728783902e-05,
"loss": 1.6978,
"step": 434
},
{
"epoch": 0.1881640487712563,
"grad_norm": 15.309712409973145,
"learning_rate": 1.641294838145232e-05,
"loss": 1.7107,
"step": 435
},
{
"epoch": 0.18859660980291437,
"grad_norm": 14.199424743652344,
"learning_rate": 1.640419947506562e-05,
"loss": 1.6976,
"step": 436
},
{
"epoch": 0.18902917083457244,
"grad_norm": 16.60711097717285,
"learning_rate": 1.6395450568678916e-05,
"loss": 1.5963,
"step": 437
},
{
"epoch": 0.1894617318662305,
"grad_norm": 14.320137023925781,
"learning_rate": 1.6386701662292215e-05,
"loss": 1.7274,
"step": 438
},
{
"epoch": 0.18989429289788856,
"grad_norm": 15.285809516906738,
"learning_rate": 1.6377952755905514e-05,
"loss": 1.7584,
"step": 439
},
{
"epoch": 0.19032685392954662,
"grad_norm": 15.035262107849121,
"learning_rate": 1.636920384951881e-05,
"loss": 1.6926,
"step": 440
},
{
"epoch": 0.1907594149612047,
"grad_norm": 15.250046730041504,
"learning_rate": 1.636045494313211e-05,
"loss": 1.7165,
"step": 441
},
{
"epoch": 0.19119197599286275,
"grad_norm": 17.97435188293457,
"learning_rate": 1.635170603674541e-05,
"loss": 1.8039,
"step": 442
},
{
"epoch": 0.1916245370245208,
"grad_norm": 13.659225463867188,
"learning_rate": 1.6342957130358708e-05,
"loss": 1.6173,
"step": 443
},
{
"epoch": 0.19205709805617888,
"grad_norm": 15.350072860717773,
"learning_rate": 1.6334208223972007e-05,
"loss": 1.6974,
"step": 444
},
{
"epoch": 0.1924896590878369,
"grad_norm": 16.155649185180664,
"learning_rate": 1.6325459317585303e-05,
"loss": 1.6537,
"step": 445
},
{
"epoch": 0.19292222011949497,
"grad_norm": 14.439220428466797,
"learning_rate": 1.6316710411198602e-05,
"loss": 1.7225,
"step": 446
},
{
"epoch": 0.19335478115115304,
"grad_norm": 17.609535217285156,
"learning_rate": 1.63079615048119e-05,
"loss": 1.723,
"step": 447
},
{
"epoch": 0.1937873421828111,
"grad_norm": 17.415800094604492,
"learning_rate": 1.6299212598425197e-05,
"loss": 1.636,
"step": 448
},
{
"epoch": 0.19421990321446916,
"grad_norm": 15.260915756225586,
"learning_rate": 1.6290463692038496e-05,
"loss": 1.6597,
"step": 449
},
{
"epoch": 0.19465246424612723,
"grad_norm": 16.445232391357422,
"learning_rate": 1.6281714785651795e-05,
"loss": 1.7469,
"step": 450
},
{
"epoch": 0.1950850252777853,
"grad_norm": 16.05836296081543,
"learning_rate": 1.627296587926509e-05,
"loss": 1.651,
"step": 451
},
{
"epoch": 0.19551758630944335,
"grad_norm": 14.436004638671875,
"learning_rate": 1.626421697287839e-05,
"loss": 1.6635,
"step": 452
},
{
"epoch": 0.19595014734110142,
"grad_norm": 14.610448837280273,
"learning_rate": 1.625546806649169e-05,
"loss": 1.8475,
"step": 453
},
{
"epoch": 0.19638270837275948,
"grad_norm": 16.329818725585938,
"learning_rate": 1.624671916010499e-05,
"loss": 1.7341,
"step": 454
},
{
"epoch": 0.19681526940441754,
"grad_norm": 15.847041130065918,
"learning_rate": 1.6237970253718288e-05,
"loss": 1.721,
"step": 455
},
{
"epoch": 0.19724783043607558,
"grad_norm": 13.708486557006836,
"learning_rate": 1.6229221347331584e-05,
"loss": 1.7186,
"step": 456
},
{
"epoch": 0.19768039146773364,
"grad_norm": 15.195647239685059,
"learning_rate": 1.6220472440944883e-05,
"loss": 1.7061,
"step": 457
},
{
"epoch": 0.1981129524993917,
"grad_norm": 16.739809036254883,
"learning_rate": 1.6211723534558182e-05,
"loss": 1.6949,
"step": 458
},
{
"epoch": 0.19854551353104977,
"grad_norm": 15.860810279846191,
"learning_rate": 1.6202974628171478e-05,
"loss": 1.654,
"step": 459
},
{
"epoch": 0.19897807456270783,
"grad_norm": 16.588729858398438,
"learning_rate": 1.6194225721784777e-05,
"loss": 1.6778,
"step": 460
},
{
"epoch": 0.1994106355943659,
"grad_norm": 15.871442794799805,
"learning_rate": 1.6185476815398076e-05,
"loss": 1.7289,
"step": 461
},
{
"epoch": 0.19984319662602396,
"grad_norm": 15.373008728027344,
"learning_rate": 1.6176727909011372e-05,
"loss": 1.6259,
"step": 462
},
{
"epoch": 0.20027575765768202,
"grad_norm": 14.221921920776367,
"learning_rate": 1.6167979002624675e-05,
"loss": 1.6892,
"step": 463
},
{
"epoch": 0.20070831868934008,
"grad_norm": 17.429065704345703,
"learning_rate": 1.615923009623797e-05,
"loss": 1.724,
"step": 464
},
{
"epoch": 0.20114087972099814,
"grad_norm": 20.687255859375,
"learning_rate": 1.615048118985127e-05,
"loss": 1.6504,
"step": 465
},
{
"epoch": 0.2015734407526562,
"grad_norm": 15.03296947479248,
"learning_rate": 1.614173228346457e-05,
"loss": 1.6124,
"step": 466
},
{
"epoch": 0.20200600178431424,
"grad_norm": 16.98514175415039,
"learning_rate": 1.6132983377077865e-05,
"loss": 1.78,
"step": 467
},
{
"epoch": 0.2024385628159723,
"grad_norm": 14.70289134979248,
"learning_rate": 1.6124234470691164e-05,
"loss": 1.7044,
"step": 468
},
{
"epoch": 0.20287112384763037,
"grad_norm": 14.248015403747559,
"learning_rate": 1.6115485564304463e-05,
"loss": 1.7596,
"step": 469
},
{
"epoch": 0.20330368487928843,
"grad_norm": 15.52908706665039,
"learning_rate": 1.6106736657917763e-05,
"loss": 1.7105,
"step": 470
},
{
"epoch": 0.2037362459109465,
"grad_norm": 18.255674362182617,
"learning_rate": 1.609798775153106e-05,
"loss": 1.7161,
"step": 471
},
{
"epoch": 0.20416880694260456,
"grad_norm": 22.3636474609375,
"learning_rate": 1.608923884514436e-05,
"loss": 1.5983,
"step": 472
},
{
"epoch": 0.20460136797426262,
"grad_norm": 17.28295135498047,
"learning_rate": 1.6080489938757657e-05,
"loss": 1.6615,
"step": 473
},
{
"epoch": 0.20503392900592068,
"grad_norm": 13.663941383361816,
"learning_rate": 1.6071741032370956e-05,
"loss": 1.6207,
"step": 474
},
{
"epoch": 0.20546649003757875,
"grad_norm": 14.511791229248047,
"learning_rate": 1.6062992125984255e-05,
"loss": 1.7379,
"step": 475
},
{
"epoch": 0.2058990510692368,
"grad_norm": 15.742350578308105,
"learning_rate": 1.605424321959755e-05,
"loss": 1.7579,
"step": 476
},
{
"epoch": 0.20633161210089487,
"grad_norm": 15.552555084228516,
"learning_rate": 1.604549431321085e-05,
"loss": 1.641,
"step": 477
},
{
"epoch": 0.2067641731325529,
"grad_norm": 21.889766693115234,
"learning_rate": 1.603674540682415e-05,
"loss": 1.7403,
"step": 478
},
{
"epoch": 0.20719673416421097,
"grad_norm": 20.306562423706055,
"learning_rate": 1.6027996500437445e-05,
"loss": 1.7009,
"step": 479
},
{
"epoch": 0.20762929519586903,
"grad_norm": 18.12723731994629,
"learning_rate": 1.6019247594050744e-05,
"loss": 1.6979,
"step": 480
},
{
"epoch": 0.2080618562275271,
"grad_norm": 14.94935417175293,
"learning_rate": 1.6010498687664044e-05,
"loss": 1.6484,
"step": 481
},
{
"epoch": 0.20849441725918516,
"grad_norm": 14.743106842041016,
"learning_rate": 1.6001749781277343e-05,
"loss": 1.6963,
"step": 482
},
{
"epoch": 0.20892697829084322,
"grad_norm": 14.982324600219727,
"learning_rate": 1.5993000874890642e-05,
"loss": 1.6868,
"step": 483
},
{
"epoch": 0.2093595393225013,
"grad_norm": 16.85517692565918,
"learning_rate": 1.5984251968503938e-05,
"loss": 1.5787,
"step": 484
},
{
"epoch": 0.20979210035415935,
"grad_norm": 15.209792137145996,
"learning_rate": 1.5975503062117237e-05,
"loss": 1.7086,
"step": 485
},
{
"epoch": 0.2102246613858174,
"grad_norm": 14.563628196716309,
"learning_rate": 1.5966754155730536e-05,
"loss": 1.7039,
"step": 486
},
{
"epoch": 0.21065722241747548,
"grad_norm": 15.1539945602417,
"learning_rate": 1.5958005249343832e-05,
"loss": 1.6866,
"step": 487
},
{
"epoch": 0.21108978344913354,
"grad_norm": 19.112892150878906,
"learning_rate": 1.594925634295713e-05,
"loss": 1.5955,
"step": 488
},
{
"epoch": 0.21152234448079157,
"grad_norm": 17.022937774658203,
"learning_rate": 1.594050743657043e-05,
"loss": 1.7138,
"step": 489
},
{
"epoch": 0.21195490551244964,
"grad_norm": 18.67705535888672,
"learning_rate": 1.5931758530183726e-05,
"loss": 1.7335,
"step": 490
},
{
"epoch": 0.2123874665441077,
"grad_norm": 17.58185577392578,
"learning_rate": 1.592300962379703e-05,
"loss": 1.6535,
"step": 491
},
{
"epoch": 0.21282002757576576,
"grad_norm": 16.525270462036133,
"learning_rate": 1.5914260717410325e-05,
"loss": 1.6493,
"step": 492
},
{
"epoch": 0.21325258860742383,
"grad_norm": 18.45403289794922,
"learning_rate": 1.5905511811023624e-05,
"loss": 1.6937,
"step": 493
},
{
"epoch": 0.2136851496390819,
"grad_norm": 18.30523681640625,
"learning_rate": 1.5896762904636923e-05,
"loss": 1.6143,
"step": 494
},
{
"epoch": 0.21411771067073995,
"grad_norm": 17.691091537475586,
"learning_rate": 1.588801399825022e-05,
"loss": 1.6493,
"step": 495
},
{
"epoch": 0.21455027170239802,
"grad_norm": 15.201132774353027,
"learning_rate": 1.5879265091863518e-05,
"loss": 1.6753,
"step": 496
},
{
"epoch": 0.21498283273405608,
"grad_norm": 17.74352264404297,
"learning_rate": 1.5870516185476817e-05,
"loss": 1.7362,
"step": 497
},
{
"epoch": 0.21541539376571414,
"grad_norm": 16.426006317138672,
"learning_rate": 1.5861767279090113e-05,
"loss": 1.5865,
"step": 498
},
{
"epoch": 0.2158479547973722,
"grad_norm": 18.481365203857422,
"learning_rate": 1.5853018372703412e-05,
"loss": 1.7536,
"step": 499
},
{
"epoch": 0.21628051582903024,
"grad_norm": 16.491756439208984,
"learning_rate": 1.584426946631671e-05,
"loss": 1.7682,
"step": 500
},
{
"epoch": 0.2167130768606883,
"grad_norm": 15.862979888916016,
"learning_rate": 1.583552055993001e-05,
"loss": 1.6539,
"step": 501
},
{
"epoch": 0.21714563789234637,
"grad_norm": 14.549097061157227,
"learning_rate": 1.582677165354331e-05,
"loss": 1.7378,
"step": 502
},
{
"epoch": 0.21757819892400443,
"grad_norm": 14.149446487426758,
"learning_rate": 1.5818022747156606e-05,
"loss": 1.6905,
"step": 503
},
{
"epoch": 0.2180107599556625,
"grad_norm": 17.096704483032227,
"learning_rate": 1.5809273840769905e-05,
"loss": 1.5642,
"step": 504
},
{
"epoch": 0.21844332098732056,
"grad_norm": 17.932981491088867,
"learning_rate": 1.5800524934383204e-05,
"loss": 1.6426,
"step": 505
},
{
"epoch": 0.21887588201897862,
"grad_norm": 17.249406814575195,
"learning_rate": 1.57917760279965e-05,
"loss": 1.6455,
"step": 506
},
{
"epoch": 0.21930844305063668,
"grad_norm": 18.000295639038086,
"learning_rate": 1.57830271216098e-05,
"loss": 1.6118,
"step": 507
},
{
"epoch": 0.21974100408229474,
"grad_norm": 16.909530639648438,
"learning_rate": 1.57742782152231e-05,
"loss": 1.6894,
"step": 508
},
{
"epoch": 0.2201735651139528,
"grad_norm": 17.263492584228516,
"learning_rate": 1.5765529308836394e-05,
"loss": 1.7422,
"step": 509
},
{
"epoch": 0.22060612614561087,
"grad_norm": 16.53858184814453,
"learning_rate": 1.5756780402449694e-05,
"loss": 1.5943,
"step": 510
},
{
"epoch": 0.2210386871772689,
"grad_norm": 15.582964897155762,
"learning_rate": 1.5748031496062993e-05,
"loss": 1.568,
"step": 511
},
{
"epoch": 0.22147124820892697,
"grad_norm": 17.556089401245117,
"learning_rate": 1.5739282589676292e-05,
"loss": 1.7498,
"step": 512
},
{
"epoch": 0.22190380924058503,
"grad_norm": 16.844451904296875,
"learning_rate": 1.573053368328959e-05,
"loss": 1.6113,
"step": 513
},
{
"epoch": 0.2223363702722431,
"grad_norm": 16.041711807250977,
"learning_rate": 1.5721784776902887e-05,
"loss": 1.7614,
"step": 514
},
{
"epoch": 0.22276893130390116,
"grad_norm": 17.016220092773438,
"learning_rate": 1.5713035870516186e-05,
"loss": 1.6991,
"step": 515
},
{
"epoch": 0.22320149233555922,
"grad_norm": 16.927995681762695,
"learning_rate": 1.5704286964129485e-05,
"loss": 1.6816,
"step": 516
},
{
"epoch": 0.22363405336721728,
"grad_norm": 16.519311904907227,
"learning_rate": 1.5695538057742785e-05,
"loss": 1.5943,
"step": 517
},
{
"epoch": 0.22406661439887535,
"grad_norm": 18.04831314086914,
"learning_rate": 1.568678915135608e-05,
"loss": 1.7193,
"step": 518
},
{
"epoch": 0.2244991754305334,
"grad_norm": 18.500019073486328,
"learning_rate": 1.567804024496938e-05,
"loss": 1.6316,
"step": 519
},
{
"epoch": 0.22493173646219147,
"grad_norm": 18.94154930114746,
"learning_rate": 1.566929133858268e-05,
"loss": 1.6008,
"step": 520
},
{
"epoch": 0.22536429749384954,
"grad_norm": 15.78384780883789,
"learning_rate": 1.5660542432195978e-05,
"loss": 1.6177,
"step": 521
},
{
"epoch": 0.22579685852550757,
"grad_norm": 18.01935577392578,
"learning_rate": 1.5651793525809277e-05,
"loss": 1.7999,
"step": 522
},
{
"epoch": 0.22622941955716563,
"grad_norm": 19.498945236206055,
"learning_rate": 1.5643044619422573e-05,
"loss": 1.6587,
"step": 523
},
{
"epoch": 0.2266619805888237,
"grad_norm": 16.238536834716797,
"learning_rate": 1.5634295713035872e-05,
"loss": 1.6589,
"step": 524
},
{
"epoch": 0.22709454162048176,
"grad_norm": 14.878324508666992,
"learning_rate": 1.562554680664917e-05,
"loss": 1.692,
"step": 525
},
{
"epoch": 0.22752710265213982,
"grad_norm": 15.313729286193848,
"learning_rate": 1.5616797900262467e-05,
"loss": 1.6564,
"step": 526
},
{
"epoch": 0.2279596636837979,
"grad_norm": 14.713421821594238,
"learning_rate": 1.5608048993875766e-05,
"loss": 1.6997,
"step": 527
},
{
"epoch": 0.22839222471545595,
"grad_norm": 14.811202049255371,
"learning_rate": 1.5599300087489066e-05,
"loss": 1.7147,
"step": 528
},
{
"epoch": 0.228824785747114,
"grad_norm": 17.43622589111328,
"learning_rate": 1.559055118110236e-05,
"loss": 1.6092,
"step": 529
},
{
"epoch": 0.22925734677877208,
"grad_norm": 16.88450813293457,
"learning_rate": 1.5581802274715664e-05,
"loss": 1.6191,
"step": 530
},
{
"epoch": 0.22968990781043014,
"grad_norm": 17.206796646118164,
"learning_rate": 1.557305336832896e-05,
"loss": 1.7351,
"step": 531
},
{
"epoch": 0.2301224688420882,
"grad_norm": 18.76306915283203,
"learning_rate": 1.556430446194226e-05,
"loss": 1.5707,
"step": 532
},
{
"epoch": 0.23055502987374624,
"grad_norm": 15.88653564453125,
"learning_rate": 1.555555555555556e-05,
"loss": 1.6625,
"step": 533
},
{
"epoch": 0.2309875909054043,
"grad_norm": 17.676189422607422,
"learning_rate": 1.5546806649168854e-05,
"loss": 1.6742,
"step": 534
},
{
"epoch": 0.23142015193706236,
"grad_norm": 17.690826416015625,
"learning_rate": 1.5538057742782153e-05,
"loss": 1.6381,
"step": 535
},
{
"epoch": 0.23185271296872043,
"grad_norm": 16.335132598876953,
"learning_rate": 1.5529308836395453e-05,
"loss": 1.5988,
"step": 536
},
{
"epoch": 0.2322852740003785,
"grad_norm": 17.346731185913086,
"learning_rate": 1.552055993000875e-05,
"loss": 1.687,
"step": 537
},
{
"epoch": 0.23271783503203655,
"grad_norm": 16.654361724853516,
"learning_rate": 1.5511811023622048e-05,
"loss": 1.6375,
"step": 538
},
{
"epoch": 0.23315039606369461,
"grad_norm": 17.761816024780273,
"learning_rate": 1.5503062117235347e-05,
"loss": 1.6785,
"step": 539
},
{
"epoch": 0.23358295709535268,
"grad_norm": 16.973031997680664,
"learning_rate": 1.5494313210848646e-05,
"loss": 1.6814,
"step": 540
},
{
"epoch": 0.23401551812701074,
"grad_norm": 16.909561157226562,
"learning_rate": 1.5485564304461945e-05,
"loss": 1.6855,
"step": 541
},
{
"epoch": 0.2344480791586688,
"grad_norm": 17.03185272216797,
"learning_rate": 1.547681539807524e-05,
"loss": 1.6887,
"step": 542
},
{
"epoch": 0.23488064019032687,
"grad_norm": 17.3386173248291,
"learning_rate": 1.546806649168854e-05,
"loss": 1.6587,
"step": 543
},
{
"epoch": 0.2353132012219849,
"grad_norm": 16.984390258789062,
"learning_rate": 1.545931758530184e-05,
"loss": 1.5812,
"step": 544
},
{
"epoch": 0.23574576225364297,
"grad_norm": 16.079471588134766,
"learning_rate": 1.5450568678915135e-05,
"loss": 1.7236,
"step": 545
},
{
"epoch": 0.23617832328530103,
"grad_norm": 16.26591682434082,
"learning_rate": 1.5441819772528434e-05,
"loss": 1.6052,
"step": 546
},
{
"epoch": 0.2366108843169591,
"grad_norm": 17.542015075683594,
"learning_rate": 1.5433070866141734e-05,
"loss": 1.5683,
"step": 547
},
{
"epoch": 0.23704344534861715,
"grad_norm": 19.87151527404785,
"learning_rate": 1.542432195975503e-05,
"loss": 1.5891,
"step": 548
},
{
"epoch": 0.23747600638027522,
"grad_norm": 17.004043579101562,
"learning_rate": 1.5415573053368332e-05,
"loss": 1.6065,
"step": 549
},
{
"epoch": 0.23790856741193328,
"grad_norm": 16.69712257385254,
"learning_rate": 1.5406824146981628e-05,
"loss": 1.6858,
"step": 550
},
{
"epoch": 0.23834112844359134,
"grad_norm": 17.42692756652832,
"learning_rate": 1.5398075240594927e-05,
"loss": 1.7016,
"step": 551
},
{
"epoch": 0.2387736894752494,
"grad_norm": 15.988450050354004,
"learning_rate": 1.5389326334208226e-05,
"loss": 1.592,
"step": 552
},
{
"epoch": 0.23920625050690747,
"grad_norm": 16.765094757080078,
"learning_rate": 1.5380577427821522e-05,
"loss": 1.5969,
"step": 553
},
{
"epoch": 0.23963881153856553,
"grad_norm": 16.65338134765625,
"learning_rate": 1.537182852143482e-05,
"loss": 1.5488,
"step": 554
},
{
"epoch": 0.24007137257022357,
"grad_norm": 20.152639389038086,
"learning_rate": 1.536307961504812e-05,
"loss": 1.5912,
"step": 555
},
{
"epoch": 0.24050393360188163,
"grad_norm": 17.90691566467285,
"learning_rate": 1.5354330708661416e-05,
"loss": 1.7089,
"step": 556
},
{
"epoch": 0.2409364946335397,
"grad_norm": 16.137821197509766,
"learning_rate": 1.5345581802274716e-05,
"loss": 1.6125,
"step": 557
},
{
"epoch": 0.24136905566519776,
"grad_norm": 17.443849563598633,
"learning_rate": 1.5336832895888015e-05,
"loss": 1.7119,
"step": 558
},
{
"epoch": 0.24180161669685582,
"grad_norm": 19.99116325378418,
"learning_rate": 1.5328083989501314e-05,
"loss": 1.6325,
"step": 559
},
{
"epoch": 0.24223417772851388,
"grad_norm": 17.063501358032227,
"learning_rate": 1.5319335083114613e-05,
"loss": 1.6513,
"step": 560
},
{
"epoch": 0.24266673876017195,
"grad_norm": 16.421655654907227,
"learning_rate": 1.531058617672791e-05,
"loss": 1.683,
"step": 561
},
{
"epoch": 0.24309929979183,
"grad_norm": 20.072221755981445,
"learning_rate": 1.5301837270341208e-05,
"loss": 1.5857,
"step": 562
},
{
"epoch": 0.24353186082348807,
"grad_norm": 17.94641876220703,
"learning_rate": 1.5293088363954507e-05,
"loss": 1.7222,
"step": 563
},
{
"epoch": 0.24396442185514614,
"grad_norm": 18.8425235748291,
"learning_rate": 1.5284339457567807e-05,
"loss": 1.6236,
"step": 564
},
{
"epoch": 0.2443969828868042,
"grad_norm": 16.484027862548828,
"learning_rate": 1.5275590551181102e-05,
"loss": 1.6454,
"step": 565
},
{
"epoch": 0.24482954391846223,
"grad_norm": 16.227195739746094,
"learning_rate": 1.52668416447944e-05,
"loss": 1.5695,
"step": 566
},
{
"epoch": 0.2452621049501203,
"grad_norm": 17.701627731323242,
"learning_rate": 1.52580927384077e-05,
"loss": 1.723,
"step": 567
},
{
"epoch": 0.24569466598177836,
"grad_norm": 19.588029861450195,
"learning_rate": 1.5249343832021e-05,
"loss": 1.6525,
"step": 568
},
{
"epoch": 0.24612722701343642,
"grad_norm": 16.687175750732422,
"learning_rate": 1.5240594925634298e-05,
"loss": 1.6708,
"step": 569
},
{
"epoch": 0.24655978804509449,
"grad_norm": 16.77758026123047,
"learning_rate": 1.5231846019247595e-05,
"loss": 1.6512,
"step": 570
},
{
"epoch": 0.24699234907675255,
"grad_norm": 16.585819244384766,
"learning_rate": 1.5223097112860894e-05,
"loss": 1.5813,
"step": 571
},
{
"epoch": 0.2474249101084106,
"grad_norm": 18.844066619873047,
"learning_rate": 1.5214348206474192e-05,
"loss": 1.6259,
"step": 572
},
{
"epoch": 0.24785747114006867,
"grad_norm": 18.0266170501709,
"learning_rate": 1.520559930008749e-05,
"loss": 1.5875,
"step": 573
},
{
"epoch": 0.24829003217172674,
"grad_norm": 18.513761520385742,
"learning_rate": 1.5196850393700789e-05,
"loss": 1.6878,
"step": 574
},
{
"epoch": 0.2487225932033848,
"grad_norm": 16.963220596313477,
"learning_rate": 1.5188101487314086e-05,
"loss": 1.7002,
"step": 575
},
{
"epoch": 0.24915515423504286,
"grad_norm": 16.365562438964844,
"learning_rate": 1.5179352580927385e-05,
"loss": 1.6331,
"step": 576
},
{
"epoch": 0.2495877152667009,
"grad_norm": 18.376493453979492,
"learning_rate": 1.5170603674540683e-05,
"loss": 1.5584,
"step": 577
},
{
"epoch": 0.250020276298359,
"grad_norm": 15.05932903289795,
"learning_rate": 1.5161854768153984e-05,
"loss": 1.6453,
"step": 578
},
{
"epoch": 0.25045283733001705,
"grad_norm": 16.900829315185547,
"learning_rate": 1.5153105861767281e-05,
"loss": 1.6164,
"step": 579
},
{
"epoch": 0.2508853983616751,
"grad_norm": 18.5715389251709,
"learning_rate": 1.5144356955380579e-05,
"loss": 1.7586,
"step": 580
},
{
"epoch": 0.2513179593933332,
"grad_norm": 16.25688934326172,
"learning_rate": 1.5135608048993878e-05,
"loss": 1.5743,
"step": 581
},
{
"epoch": 0.2517505204249912,
"grad_norm": 16.32343292236328,
"learning_rate": 1.5126859142607175e-05,
"loss": 1.6409,
"step": 582
},
{
"epoch": 0.25218308145664925,
"grad_norm": 18.865724563598633,
"learning_rate": 1.5118110236220473e-05,
"loss": 1.5924,
"step": 583
},
{
"epoch": 0.2526156424883073,
"grad_norm": 16.306575775146484,
"learning_rate": 1.5109361329833772e-05,
"loss": 1.6264,
"step": 584
},
{
"epoch": 0.2530482035199654,
"grad_norm": 19.097639083862305,
"learning_rate": 1.510061242344707e-05,
"loss": 1.6059,
"step": 585
},
{
"epoch": 0.25348076455162344,
"grad_norm": 16.674352645874023,
"learning_rate": 1.5091863517060367e-05,
"loss": 1.7933,
"step": 586
},
{
"epoch": 0.2539133255832815,
"grad_norm": 19.233652114868164,
"learning_rate": 1.5083114610673668e-05,
"loss": 1.5857,
"step": 587
},
{
"epoch": 0.25434588661493956,
"grad_norm": 18.98441505432129,
"learning_rate": 1.5074365704286966e-05,
"loss": 1.709,
"step": 588
},
{
"epoch": 0.25477844764659763,
"grad_norm": 18.186471939086914,
"learning_rate": 1.5065616797900265e-05,
"loss": 1.6863,
"step": 589
},
{
"epoch": 0.2552110086782557,
"grad_norm": 18.54801368713379,
"learning_rate": 1.5056867891513562e-05,
"loss": 1.6825,
"step": 590
},
{
"epoch": 0.25564356970991375,
"grad_norm": 16.038410186767578,
"learning_rate": 1.504811898512686e-05,
"loss": 1.6853,
"step": 591
},
{
"epoch": 0.2560761307415718,
"grad_norm": 17.528871536254883,
"learning_rate": 1.5039370078740159e-05,
"loss": 1.7077,
"step": 592
},
{
"epoch": 0.2565086917732299,
"grad_norm": 15.960848808288574,
"learning_rate": 1.5030621172353457e-05,
"loss": 1.5823,
"step": 593
},
{
"epoch": 0.25694125280488794,
"grad_norm": 18.292795181274414,
"learning_rate": 1.5021872265966754e-05,
"loss": 1.5904,
"step": 594
},
{
"epoch": 0.257373813836546,
"grad_norm": 18.43050193786621,
"learning_rate": 1.5013123359580053e-05,
"loss": 1.6279,
"step": 595
},
{
"epoch": 0.25780637486820407,
"grad_norm": 18.099613189697266,
"learning_rate": 1.500437445319335e-05,
"loss": 1.6725,
"step": 596
},
{
"epoch": 0.25823893589986213,
"grad_norm": 17.15315055847168,
"learning_rate": 1.4995625546806652e-05,
"loss": 1.6643,
"step": 597
},
{
"epoch": 0.2586714969315202,
"grad_norm": 15.87186050415039,
"learning_rate": 1.498687664041995e-05,
"loss": 1.6733,
"step": 598
},
{
"epoch": 0.25910405796317826,
"grad_norm": 15.417596817016602,
"learning_rate": 1.4978127734033248e-05,
"loss": 1.743,
"step": 599
},
{
"epoch": 0.2595366189948363,
"grad_norm": 16.56854248046875,
"learning_rate": 1.4969378827646546e-05,
"loss": 1.7008,
"step": 600
},
{
"epoch": 0.2599691800264944,
"grad_norm": 16.416122436523438,
"learning_rate": 1.4960629921259843e-05,
"loss": 1.7122,
"step": 601
},
{
"epoch": 0.26040174105815245,
"grad_norm": 16.724794387817383,
"learning_rate": 1.4951881014873143e-05,
"loss": 1.7211,
"step": 602
},
{
"epoch": 0.2608343020898105,
"grad_norm": 16.47406768798828,
"learning_rate": 1.494313210848644e-05,
"loss": 1.6575,
"step": 603
},
{
"epoch": 0.2612668631214685,
"grad_norm": 18.724267959594727,
"learning_rate": 1.4934383202099738e-05,
"loss": 1.6323,
"step": 604
},
{
"epoch": 0.2616994241531266,
"grad_norm": 17.24701499938965,
"learning_rate": 1.4925634295713037e-05,
"loss": 1.7832,
"step": 605
},
{
"epoch": 0.26213198518478464,
"grad_norm": 18.684057235717773,
"learning_rate": 1.4916885389326334e-05,
"loss": 1.5967,
"step": 606
},
{
"epoch": 0.2625645462164427,
"grad_norm": 16.600284576416016,
"learning_rate": 1.4908136482939635e-05,
"loss": 1.5627,
"step": 607
},
{
"epoch": 0.26299710724810077,
"grad_norm": 17.85321044921875,
"learning_rate": 1.4899387576552933e-05,
"loss": 1.7005,
"step": 608
},
{
"epoch": 0.26342966827975883,
"grad_norm": 16.918190002441406,
"learning_rate": 1.489063867016623e-05,
"loss": 1.6876,
"step": 609
},
{
"epoch": 0.2638622293114169,
"grad_norm": 16.909236907958984,
"learning_rate": 1.488188976377953e-05,
"loss": 1.5946,
"step": 610
},
{
"epoch": 0.26429479034307496,
"grad_norm": 17.064176559448242,
"learning_rate": 1.4873140857392827e-05,
"loss": 1.7075,
"step": 611
},
{
"epoch": 0.264727351374733,
"grad_norm": 19.011877059936523,
"learning_rate": 1.4864391951006125e-05,
"loss": 1.765,
"step": 612
},
{
"epoch": 0.2651599124063911,
"grad_norm": 17.99385643005371,
"learning_rate": 1.4855643044619424e-05,
"loss": 1.6925,
"step": 613
},
{
"epoch": 0.26559247343804915,
"grad_norm": 17.17367172241211,
"learning_rate": 1.4846894138232721e-05,
"loss": 1.5354,
"step": 614
},
{
"epoch": 0.2660250344697072,
"grad_norm": 19.10651206970215,
"learning_rate": 1.4838145231846019e-05,
"loss": 1.5447,
"step": 615
},
{
"epoch": 0.2664575955013653,
"grad_norm": 18.000473022460938,
"learning_rate": 1.482939632545932e-05,
"loss": 1.5983,
"step": 616
},
{
"epoch": 0.26689015653302334,
"grad_norm": 17.769773483276367,
"learning_rate": 1.4820647419072617e-05,
"loss": 1.6972,
"step": 617
},
{
"epoch": 0.2673227175646814,
"grad_norm": 17.360748291015625,
"learning_rate": 1.4811898512685916e-05,
"loss": 1.6898,
"step": 618
},
{
"epoch": 0.26775527859633946,
"grad_norm": 18.90128517150879,
"learning_rate": 1.4803149606299214e-05,
"loss": 1.5293,
"step": 619
},
{
"epoch": 0.2681878396279975,
"grad_norm": 18.458005905151367,
"learning_rate": 1.4794400699912513e-05,
"loss": 1.6708,
"step": 620
},
{
"epoch": 0.2686204006596556,
"grad_norm": 16.414493560791016,
"learning_rate": 1.478565179352581e-05,
"loss": 1.5625,
"step": 621
},
{
"epoch": 0.26905296169131365,
"grad_norm": 17.863723754882812,
"learning_rate": 1.4776902887139108e-05,
"loss": 1.6055,
"step": 622
},
{
"epoch": 0.2694855227229717,
"grad_norm": 19.638256072998047,
"learning_rate": 1.4768153980752407e-05,
"loss": 1.6344,
"step": 623
},
{
"epoch": 0.2699180837546298,
"grad_norm": 19.117496490478516,
"learning_rate": 1.4759405074365705e-05,
"loss": 1.6323,
"step": 624
},
{
"epoch": 0.27035064478628784,
"grad_norm": 17.23756217956543,
"learning_rate": 1.4750656167979002e-05,
"loss": 1.6586,
"step": 625
},
{
"epoch": 0.27078320581794585,
"grad_norm": 17.25014877319336,
"learning_rate": 1.4741907261592303e-05,
"loss": 1.6476,
"step": 626
},
{
"epoch": 0.2712157668496039,
"grad_norm": 22.015825271606445,
"learning_rate": 1.47331583552056e-05,
"loss": 1.5858,
"step": 627
},
{
"epoch": 0.271648327881262,
"grad_norm": 19.36150360107422,
"learning_rate": 1.47244094488189e-05,
"loss": 1.6196,
"step": 628
},
{
"epoch": 0.27208088891292004,
"grad_norm": 17.298595428466797,
"learning_rate": 1.4715660542432198e-05,
"loss": 1.6995,
"step": 629
},
{
"epoch": 0.2725134499445781,
"grad_norm": 17.826095581054688,
"learning_rate": 1.4706911636045495e-05,
"loss": 1.7035,
"step": 630
},
{
"epoch": 0.27294601097623616,
"grad_norm": 17.059179306030273,
"learning_rate": 1.4698162729658794e-05,
"loss": 1.603,
"step": 631
},
{
"epoch": 0.2733785720078942,
"grad_norm": 18.210107803344727,
"learning_rate": 1.4689413823272092e-05,
"loss": 1.7385,
"step": 632
},
{
"epoch": 0.2738111330395523,
"grad_norm": 18.413015365600586,
"learning_rate": 1.468066491688539e-05,
"loss": 1.588,
"step": 633
},
{
"epoch": 0.27424369407121035,
"grad_norm": 17.428800582885742,
"learning_rate": 1.4671916010498688e-05,
"loss": 1.673,
"step": 634
},
{
"epoch": 0.2746762551028684,
"grad_norm": 17.01789665222168,
"learning_rate": 1.4663167104111988e-05,
"loss": 1.6085,
"step": 635
},
{
"epoch": 0.2751088161345265,
"grad_norm": 16.585988998413086,
"learning_rate": 1.4654418197725287e-05,
"loss": 1.7314,
"step": 636
},
{
"epoch": 0.27554137716618454,
"grad_norm": 18.863544464111328,
"learning_rate": 1.4645669291338584e-05,
"loss": 1.685,
"step": 637
},
{
"epoch": 0.2759739381978426,
"grad_norm": 18.15181541442871,
"learning_rate": 1.4636920384951882e-05,
"loss": 1.7304,
"step": 638
},
{
"epoch": 0.27640649922950067,
"grad_norm": 18.61130142211914,
"learning_rate": 1.4628171478565181e-05,
"loss": 1.5327,
"step": 639
},
{
"epoch": 0.27683906026115873,
"grad_norm": 17.54844856262207,
"learning_rate": 1.4619422572178479e-05,
"loss": 1.6265,
"step": 640
},
{
"epoch": 0.2772716212928168,
"grad_norm": 17.166112899780273,
"learning_rate": 1.4610673665791776e-05,
"loss": 1.6124,
"step": 641
},
{
"epoch": 0.27770418232447486,
"grad_norm": 19.315031051635742,
"learning_rate": 1.4601924759405075e-05,
"loss": 1.5918,
"step": 642
},
{
"epoch": 0.2781367433561329,
"grad_norm": 18.814680099487305,
"learning_rate": 1.4593175853018373e-05,
"loss": 1.5706,
"step": 643
},
{
"epoch": 0.278569304387791,
"grad_norm": 17.746023178100586,
"learning_rate": 1.4584426946631672e-05,
"loss": 1.7087,
"step": 644
},
{
"epoch": 0.27900186541944905,
"grad_norm": 21.044361114501953,
"learning_rate": 1.4575678040244971e-05,
"loss": 1.6326,
"step": 645
},
{
"epoch": 0.2794344264511071,
"grad_norm": 20.996479034423828,
"learning_rate": 1.456692913385827e-05,
"loss": 1.5941,
"step": 646
},
{
"epoch": 0.2798669874827652,
"grad_norm": 18.970081329345703,
"learning_rate": 1.4558180227471568e-05,
"loss": 1.6649,
"step": 647
},
{
"epoch": 0.2802995485144232,
"grad_norm": 18.267818450927734,
"learning_rate": 1.4549431321084865e-05,
"loss": 1.6545,
"step": 648
},
{
"epoch": 0.28073210954608124,
"grad_norm": 16.575632095336914,
"learning_rate": 1.4540682414698165e-05,
"loss": 1.6283,
"step": 649
},
{
"epoch": 0.2811646705777393,
"grad_norm": 17.574951171875,
"learning_rate": 1.4531933508311462e-05,
"loss": 1.664,
"step": 650
},
{
"epoch": 0.28159723160939737,
"grad_norm": 18.63271713256836,
"learning_rate": 1.452318460192476e-05,
"loss": 1.6736,
"step": 651
},
{
"epoch": 0.28202979264105543,
"grad_norm": 18.147533416748047,
"learning_rate": 1.4514435695538059e-05,
"loss": 1.6374,
"step": 652
},
{
"epoch": 0.2824623536727135,
"grad_norm": 17.428810119628906,
"learning_rate": 1.4505686789151356e-05,
"loss": 1.7221,
"step": 653
},
{
"epoch": 0.28289491470437156,
"grad_norm": 17.664213180541992,
"learning_rate": 1.4496937882764654e-05,
"loss": 1.6235,
"step": 654
},
{
"epoch": 0.2833274757360296,
"grad_norm": 18.84585952758789,
"learning_rate": 1.4488188976377955e-05,
"loss": 1.7157,
"step": 655
},
{
"epoch": 0.2837600367676877,
"grad_norm": 19.42424774169922,
"learning_rate": 1.4479440069991252e-05,
"loss": 1.6526,
"step": 656
},
{
"epoch": 0.28419259779934575,
"grad_norm": 18.132667541503906,
"learning_rate": 1.4470691163604552e-05,
"loss": 1.7109,
"step": 657
},
{
"epoch": 0.2846251588310038,
"grad_norm": 18.288928985595703,
"learning_rate": 1.4461942257217849e-05,
"loss": 1.6393,
"step": 658
},
{
"epoch": 0.2850577198626619,
"grad_norm": 15.85251522064209,
"learning_rate": 1.4453193350831147e-05,
"loss": 1.6375,
"step": 659
},
{
"epoch": 0.28549028089431994,
"grad_norm": 17.576475143432617,
"learning_rate": 1.4444444444444446e-05,
"loss": 1.6288,
"step": 660
},
{
"epoch": 0.285922841925978,
"grad_norm": 16.68917465209961,
"learning_rate": 1.4435695538057743e-05,
"loss": 1.5772,
"step": 661
},
{
"epoch": 0.28635540295763606,
"grad_norm": 19.292522430419922,
"learning_rate": 1.442694663167104e-05,
"loss": 1.6764,
"step": 662
},
{
"epoch": 0.2867879639892941,
"grad_norm": 19.660804748535156,
"learning_rate": 1.441819772528434e-05,
"loss": 1.7028,
"step": 663
},
{
"epoch": 0.2872205250209522,
"grad_norm": 17.166290283203125,
"learning_rate": 1.440944881889764e-05,
"loss": 1.5549,
"step": 664
},
{
"epoch": 0.28765308605261025,
"grad_norm": 15.878508567810059,
"learning_rate": 1.4400699912510938e-05,
"loss": 1.6366,
"step": 665
},
{
"epoch": 0.2880856470842683,
"grad_norm": 16.062185287475586,
"learning_rate": 1.4391951006124236e-05,
"loss": 1.6623,
"step": 666
},
{
"epoch": 0.2885182081159264,
"grad_norm": 18.430627822875977,
"learning_rate": 1.4383202099737535e-05,
"loss": 1.7102,
"step": 667
},
{
"epoch": 0.28895076914758444,
"grad_norm": 18.56020736694336,
"learning_rate": 1.4374453193350833e-05,
"loss": 1.6928,
"step": 668
},
{
"epoch": 0.2893833301792425,
"grad_norm": 77.00767517089844,
"learning_rate": 1.436570428696413e-05,
"loss": 1.6911,
"step": 669
},
{
"epoch": 0.2898158912109005,
"grad_norm": 18.834728240966797,
"learning_rate": 1.435695538057743e-05,
"loss": 1.6888,
"step": 670
},
{
"epoch": 0.2902484522425586,
"grad_norm": 17.199195861816406,
"learning_rate": 1.4348206474190727e-05,
"loss": 1.691,
"step": 671
},
{
"epoch": 0.29068101327421664,
"grad_norm": 18.331899642944336,
"learning_rate": 1.4339457567804024e-05,
"loss": 1.6973,
"step": 672
},
{
"epoch": 0.2911135743058747,
"grad_norm": 16.407562255859375,
"learning_rate": 1.4330708661417324e-05,
"loss": 1.6564,
"step": 673
},
{
"epoch": 0.29154613533753276,
"grad_norm": 18.343046188354492,
"learning_rate": 1.4321959755030623e-05,
"loss": 1.717,
"step": 674
},
{
"epoch": 0.2919786963691908,
"grad_norm": 19.638093948364258,
"learning_rate": 1.4313210848643922e-05,
"loss": 1.7019,
"step": 675
},
{
"epoch": 0.2924112574008489,
"grad_norm": 31.77922821044922,
"learning_rate": 1.430446194225722e-05,
"loss": 1.6391,
"step": 676
},
{
"epoch": 0.29284381843250695,
"grad_norm": 18.263328552246094,
"learning_rate": 1.4295713035870517e-05,
"loss": 1.5636,
"step": 677
},
{
"epoch": 0.293276379464165,
"grad_norm": 17.560848236083984,
"learning_rate": 1.4286964129483816e-05,
"loss": 1.6535,
"step": 678
},
{
"epoch": 0.2937089404958231,
"grad_norm": 18.677249908447266,
"learning_rate": 1.4278215223097114e-05,
"loss": 1.6181,
"step": 679
},
{
"epoch": 0.29414150152748114,
"grad_norm": 18.061887741088867,
"learning_rate": 1.4269466316710411e-05,
"loss": 1.5453,
"step": 680
},
{
"epoch": 0.2945740625591392,
"grad_norm": 17.87262535095215,
"learning_rate": 1.426071741032371e-05,
"loss": 1.6855,
"step": 681
},
{
"epoch": 0.29500662359079727,
"grad_norm": 18.203630447387695,
"learning_rate": 1.4251968503937008e-05,
"loss": 1.5759,
"step": 682
},
{
"epoch": 0.29543918462245533,
"grad_norm": 17.520793914794922,
"learning_rate": 1.4243219597550309e-05,
"loss": 1.5989,
"step": 683
},
{
"epoch": 0.2958717456541134,
"grad_norm": 19.39581298828125,
"learning_rate": 1.4234470691163606e-05,
"loss": 1.674,
"step": 684
},
{
"epoch": 0.29630430668577146,
"grad_norm": 18.8718318939209,
"learning_rate": 1.4225721784776904e-05,
"loss": 1.5593,
"step": 685
},
{
"epoch": 0.2967368677174295,
"grad_norm": 20.18855094909668,
"learning_rate": 1.4216972878390203e-05,
"loss": 1.5447,
"step": 686
},
{
"epoch": 0.2971694287490876,
"grad_norm": 18.366270065307617,
"learning_rate": 1.42082239720035e-05,
"loss": 1.5408,
"step": 687
},
{
"epoch": 0.29760198978074565,
"grad_norm": 20.94564437866211,
"learning_rate": 1.4199475065616798e-05,
"loss": 1.6166,
"step": 688
},
{
"epoch": 0.2980345508124037,
"grad_norm": 19.897859573364258,
"learning_rate": 1.4190726159230097e-05,
"loss": 1.5997,
"step": 689
},
{
"epoch": 0.29846711184406177,
"grad_norm": 18.334325790405273,
"learning_rate": 1.4181977252843395e-05,
"loss": 1.6338,
"step": 690
},
{
"epoch": 0.29889967287571984,
"grad_norm": 19.647480010986328,
"learning_rate": 1.4173228346456694e-05,
"loss": 1.6537,
"step": 691
},
{
"epoch": 0.29933223390737784,
"grad_norm": 16.615379333496094,
"learning_rate": 1.4164479440069992e-05,
"loss": 1.6717,
"step": 692
},
{
"epoch": 0.2997647949390359,
"grad_norm": 20.802907943725586,
"learning_rate": 1.4155730533683293e-05,
"loss": 1.6626,
"step": 693
},
{
"epoch": 0.30019735597069397,
"grad_norm": 17.72242546081543,
"learning_rate": 1.414698162729659e-05,
"loss": 1.6199,
"step": 694
},
{
"epoch": 0.30062991700235203,
"grad_norm": 17.221155166625977,
"learning_rate": 1.4138232720909888e-05,
"loss": 1.7148,
"step": 695
},
{
"epoch": 0.3010624780340101,
"grad_norm": 18.239665985107422,
"learning_rate": 1.4129483814523187e-05,
"loss": 1.654,
"step": 696
},
{
"epoch": 0.30149503906566816,
"grad_norm": 20.456558227539062,
"learning_rate": 1.4120734908136484e-05,
"loss": 1.6063,
"step": 697
},
{
"epoch": 0.3019276000973262,
"grad_norm": 18.604167938232422,
"learning_rate": 1.4111986001749782e-05,
"loss": 1.6348,
"step": 698
},
{
"epoch": 0.3023601611289843,
"grad_norm": 17.619157791137695,
"learning_rate": 1.4103237095363081e-05,
"loss": 1.5949,
"step": 699
},
{
"epoch": 0.30279272216064235,
"grad_norm": 17.160158157348633,
"learning_rate": 1.4094488188976379e-05,
"loss": 1.608,
"step": 700
},
{
"epoch": 0.3032252831923004,
"grad_norm": 17.992366790771484,
"learning_rate": 1.4085739282589676e-05,
"loss": 1.6816,
"step": 701
},
{
"epoch": 0.3036578442239585,
"grad_norm": 18.279861450195312,
"learning_rate": 1.4076990376202975e-05,
"loss": 1.5903,
"step": 702
},
{
"epoch": 0.30409040525561654,
"grad_norm": 17.513811111450195,
"learning_rate": 1.4068241469816274e-05,
"loss": 1.6141,
"step": 703
},
{
"epoch": 0.3045229662872746,
"grad_norm": 19.576257705688477,
"learning_rate": 1.4059492563429574e-05,
"loss": 1.5614,
"step": 704
},
{
"epoch": 0.30495552731893266,
"grad_norm": 17.985549926757812,
"learning_rate": 1.4050743657042871e-05,
"loss": 1.6339,
"step": 705
},
{
"epoch": 0.3053880883505907,
"grad_norm": 20.642486572265625,
"learning_rate": 1.4041994750656169e-05,
"loss": 1.5741,
"step": 706
},
{
"epoch": 0.3058206493822488,
"grad_norm": 18.464508056640625,
"learning_rate": 1.4033245844269468e-05,
"loss": 1.6104,
"step": 707
},
{
"epoch": 0.30625321041390685,
"grad_norm": 17.332229614257812,
"learning_rate": 1.4024496937882765e-05,
"loss": 1.6038,
"step": 708
},
{
"epoch": 0.3066857714455649,
"grad_norm": 21.05571937561035,
"learning_rate": 1.4015748031496063e-05,
"loss": 1.7033,
"step": 709
},
{
"epoch": 0.307118332477223,
"grad_norm": 18.811159133911133,
"learning_rate": 1.4006999125109362e-05,
"loss": 1.6677,
"step": 710
},
{
"epoch": 0.30755089350888104,
"grad_norm": 17.237346649169922,
"learning_rate": 1.399825021872266e-05,
"loss": 1.6081,
"step": 711
},
{
"epoch": 0.3079834545405391,
"grad_norm": 17.864810943603516,
"learning_rate": 1.398950131233596e-05,
"loss": 1.5892,
"step": 712
},
{
"epoch": 0.30841601557219717,
"grad_norm": 17.532041549682617,
"learning_rate": 1.3980752405949258e-05,
"loss": 1.5575,
"step": 713
},
{
"epoch": 0.3088485766038552,
"grad_norm": 18.569576263427734,
"learning_rate": 1.3972003499562557e-05,
"loss": 1.6348,
"step": 714
},
{
"epoch": 0.30928113763551324,
"grad_norm": 18.27239227294922,
"learning_rate": 1.3963254593175855e-05,
"loss": 1.6954,
"step": 715
},
{
"epoch": 0.3097136986671713,
"grad_norm": 20.6396484375,
"learning_rate": 1.3954505686789152e-05,
"loss": 1.6909,
"step": 716
},
{
"epoch": 0.31014625969882936,
"grad_norm": 19.61980628967285,
"learning_rate": 1.3945756780402451e-05,
"loss": 1.6499,
"step": 717
},
{
"epoch": 0.3105788207304874,
"grad_norm": 19.68534278869629,
"learning_rate": 1.3937007874015749e-05,
"loss": 1.5033,
"step": 718
},
{
"epoch": 0.3110113817621455,
"grad_norm": 22.719221115112305,
"learning_rate": 1.3928258967629047e-05,
"loss": 1.6344,
"step": 719
},
{
"epoch": 0.31144394279380355,
"grad_norm": 18.73038673400879,
"learning_rate": 1.3919510061242346e-05,
"loss": 1.6237,
"step": 720
},
{
"epoch": 0.3118765038254616,
"grad_norm": 17.38966941833496,
"learning_rate": 1.3910761154855643e-05,
"loss": 1.5169,
"step": 721
},
{
"epoch": 0.3123090648571197,
"grad_norm": 23.203266143798828,
"learning_rate": 1.3902012248468944e-05,
"loss": 1.6486,
"step": 722
},
{
"epoch": 0.31274162588877774,
"grad_norm": 20.895124435424805,
"learning_rate": 1.3893263342082242e-05,
"loss": 1.5868,
"step": 723
},
{
"epoch": 0.3131741869204358,
"grad_norm": 18.580778121948242,
"learning_rate": 1.388451443569554e-05,
"loss": 1.5986,
"step": 724
},
{
"epoch": 0.31360674795209387,
"grad_norm": 18.76534652709961,
"learning_rate": 1.3875765529308838e-05,
"loss": 1.5955,
"step": 725
},
{
"epoch": 0.31403930898375193,
"grad_norm": 19.820236206054688,
"learning_rate": 1.3867016622922136e-05,
"loss": 1.6397,
"step": 726
},
{
"epoch": 0.31447187001541,
"grad_norm": 18.899864196777344,
"learning_rate": 1.3858267716535433e-05,
"loss": 1.633,
"step": 727
},
{
"epoch": 0.31490443104706806,
"grad_norm": 19.738550186157227,
"learning_rate": 1.3849518810148733e-05,
"loss": 1.6484,
"step": 728
},
{
"epoch": 0.3153369920787261,
"grad_norm": 20.931293487548828,
"learning_rate": 1.384076990376203e-05,
"loss": 1.6189,
"step": 729
},
{
"epoch": 0.3157695531103842,
"grad_norm": 18.17987060546875,
"learning_rate": 1.3832020997375328e-05,
"loss": 1.6628,
"step": 730
},
{
"epoch": 0.31620211414204225,
"grad_norm": 19.147859573364258,
"learning_rate": 1.3823272090988629e-05,
"loss": 1.5953,
"step": 731
},
{
"epoch": 0.3166346751737003,
"grad_norm": 18.06837272644043,
"learning_rate": 1.3814523184601926e-05,
"loss": 1.5991,
"step": 732
},
{
"epoch": 0.31706723620535837,
"grad_norm": 17.540843963623047,
"learning_rate": 1.3805774278215225e-05,
"loss": 1.6227,
"step": 733
},
{
"epoch": 0.31749979723701643,
"grad_norm": 20.706098556518555,
"learning_rate": 1.3797025371828523e-05,
"loss": 1.5718,
"step": 734
},
{
"epoch": 0.3179323582686745,
"grad_norm": 17.51397132873535,
"learning_rate": 1.378827646544182e-05,
"loss": 1.6043,
"step": 735
},
{
"epoch": 0.3183649193003325,
"grad_norm": 20.64704132080078,
"learning_rate": 1.377952755905512e-05,
"loss": 1.5771,
"step": 736
},
{
"epoch": 0.31879748033199057,
"grad_norm": 20.126808166503906,
"learning_rate": 1.3770778652668417e-05,
"loss": 1.7199,
"step": 737
},
{
"epoch": 0.31923004136364863,
"grad_norm": 17.626785278320312,
"learning_rate": 1.3762029746281716e-05,
"loss": 1.6152,
"step": 738
},
{
"epoch": 0.3196626023953067,
"grad_norm": 19.35613250732422,
"learning_rate": 1.3753280839895014e-05,
"loss": 1.5903,
"step": 739
},
{
"epoch": 0.32009516342696476,
"grad_norm": 19.347549438476562,
"learning_rate": 1.3744531933508311e-05,
"loss": 1.5552,
"step": 740
},
{
"epoch": 0.3205277244586228,
"grad_norm": 22.455907821655273,
"learning_rate": 1.3735783027121612e-05,
"loss": 1.6141,
"step": 741
},
{
"epoch": 0.3209602854902809,
"grad_norm": 18.277362823486328,
"learning_rate": 1.372703412073491e-05,
"loss": 1.6331,
"step": 742
},
{
"epoch": 0.32139284652193895,
"grad_norm": 18.168556213378906,
"learning_rate": 1.3718285214348209e-05,
"loss": 1.7211,
"step": 743
},
{
"epoch": 0.321825407553597,
"grad_norm": 18.769269943237305,
"learning_rate": 1.3709536307961506e-05,
"loss": 1.5252,
"step": 744
},
{
"epoch": 0.3222579685852551,
"grad_norm": 22.99976348876953,
"learning_rate": 1.3700787401574804e-05,
"loss": 1.6573,
"step": 745
},
{
"epoch": 0.32269052961691314,
"grad_norm": 20.37944793701172,
"learning_rate": 1.3692038495188103e-05,
"loss": 1.6173,
"step": 746
},
{
"epoch": 0.3231230906485712,
"grad_norm": 22.109603881835938,
"learning_rate": 1.36832895888014e-05,
"loss": 1.6456,
"step": 747
},
{
"epoch": 0.32355565168022926,
"grad_norm": 17.85908317565918,
"learning_rate": 1.3674540682414698e-05,
"loss": 1.5592,
"step": 748
},
{
"epoch": 0.3239882127118873,
"grad_norm": 17.462928771972656,
"learning_rate": 1.3665791776027997e-05,
"loss": 1.5589,
"step": 749
},
{
"epoch": 0.3244207737435454,
"grad_norm": 17.60029411315918,
"learning_rate": 1.3657042869641295e-05,
"loss": 1.6156,
"step": 750
},
{
"epoch": 0.32485333477520345,
"grad_norm": 18.957937240600586,
"learning_rate": 1.3648293963254596e-05,
"loss": 1.7443,
"step": 751
},
{
"epoch": 0.3252858958068615,
"grad_norm": 22.84882926940918,
"learning_rate": 1.3639545056867893e-05,
"loss": 1.55,
"step": 752
},
{
"epoch": 0.3257184568385196,
"grad_norm": 20.238740921020508,
"learning_rate": 1.363079615048119e-05,
"loss": 1.5943,
"step": 753
},
{
"epoch": 0.32615101787017764,
"grad_norm": 17.576885223388672,
"learning_rate": 1.362204724409449e-05,
"loss": 1.6363,
"step": 754
},
{
"epoch": 0.3265835789018357,
"grad_norm": 20.89413070678711,
"learning_rate": 1.3613298337707787e-05,
"loss": 1.6913,
"step": 755
},
{
"epoch": 0.32701613993349377,
"grad_norm": 18.79582977294922,
"learning_rate": 1.3604549431321085e-05,
"loss": 1.6068,
"step": 756
},
{
"epoch": 0.32744870096515183,
"grad_norm": 16.650836944580078,
"learning_rate": 1.3595800524934384e-05,
"loss": 1.634,
"step": 757
},
{
"epoch": 0.32788126199680984,
"grad_norm": 17.64925765991211,
"learning_rate": 1.3587051618547682e-05,
"loss": 1.6472,
"step": 758
},
{
"epoch": 0.3283138230284679,
"grad_norm": 20.90283966064453,
"learning_rate": 1.357830271216098e-05,
"loss": 1.5816,
"step": 759
},
{
"epoch": 0.32874638406012596,
"grad_norm": 22.616561889648438,
"learning_rate": 1.356955380577428e-05,
"loss": 1.5406,
"step": 760
},
{
"epoch": 0.329178945091784,
"grad_norm": 18.393531799316406,
"learning_rate": 1.356080489938758e-05,
"loss": 1.5632,
"step": 761
},
{
"epoch": 0.3296115061234421,
"grad_norm": 17.92359733581543,
"learning_rate": 1.3552055993000877e-05,
"loss": 1.6438,
"step": 762
},
{
"epoch": 0.33004406715510015,
"grad_norm": 18.8277530670166,
"learning_rate": 1.3543307086614174e-05,
"loss": 1.607,
"step": 763
},
{
"epoch": 0.3304766281867582,
"grad_norm": 19.19496726989746,
"learning_rate": 1.3534558180227474e-05,
"loss": 1.6057,
"step": 764
},
{
"epoch": 0.3309091892184163,
"grad_norm": 19.613494873046875,
"learning_rate": 1.3525809273840771e-05,
"loss": 1.6079,
"step": 765
},
{
"epoch": 0.33134175025007434,
"grad_norm": 19.068227767944336,
"learning_rate": 1.3517060367454069e-05,
"loss": 1.5423,
"step": 766
},
{
"epoch": 0.3317743112817324,
"grad_norm": 20.14394760131836,
"learning_rate": 1.3508311461067368e-05,
"loss": 1.643,
"step": 767
},
{
"epoch": 0.33220687231339047,
"grad_norm": 20.151247024536133,
"learning_rate": 1.3499562554680665e-05,
"loss": 1.6013,
"step": 768
},
{
"epoch": 0.33263943334504853,
"grad_norm": 19.042261123657227,
"learning_rate": 1.3490813648293963e-05,
"loss": 1.5702,
"step": 769
},
{
"epoch": 0.3330719943767066,
"grad_norm": 18.928590774536133,
"learning_rate": 1.3482064741907264e-05,
"loss": 1.5902,
"step": 770
},
{
"epoch": 0.33350455540836466,
"grad_norm": 20.279457092285156,
"learning_rate": 1.3473315835520561e-05,
"loss": 1.6624,
"step": 771
},
{
"epoch": 0.3339371164400227,
"grad_norm": 19.55834197998047,
"learning_rate": 1.346456692913386e-05,
"loss": 1.5899,
"step": 772
},
{
"epoch": 0.3343696774716808,
"grad_norm": 19.354820251464844,
"learning_rate": 1.3455818022747158e-05,
"loss": 1.586,
"step": 773
},
{
"epoch": 0.33480223850333884,
"grad_norm": 21.938444137573242,
"learning_rate": 1.3447069116360455e-05,
"loss": 1.6159,
"step": 774
},
{
"epoch": 0.3352347995349969,
"grad_norm": 17.96108055114746,
"learning_rate": 1.3438320209973755e-05,
"loss": 1.6662,
"step": 775
},
{
"epoch": 0.33566736056665497,
"grad_norm": 18.945526123046875,
"learning_rate": 1.3429571303587052e-05,
"loss": 1.5927,
"step": 776
},
{
"epoch": 0.33609992159831303,
"grad_norm": 19.803407669067383,
"learning_rate": 1.342082239720035e-05,
"loss": 1.6594,
"step": 777
},
{
"epoch": 0.3365324826299711,
"grad_norm": 18.89844512939453,
"learning_rate": 1.3412073490813649e-05,
"loss": 1.6048,
"step": 778
},
{
"epoch": 0.33696504366162916,
"grad_norm": 20.112564086914062,
"learning_rate": 1.3403324584426948e-05,
"loss": 1.6187,
"step": 779
},
{
"epoch": 0.33739760469328717,
"grad_norm": 17.415624618530273,
"learning_rate": 1.3394575678040247e-05,
"loss": 1.5409,
"step": 780
},
{
"epoch": 0.33783016572494523,
"grad_norm": 16.357385635375977,
"learning_rate": 1.3385826771653545e-05,
"loss": 1.6128,
"step": 781
},
{
"epoch": 0.3382627267566033,
"grad_norm": 18.058176040649414,
"learning_rate": 1.3377077865266842e-05,
"loss": 1.5935,
"step": 782
},
{
"epoch": 0.33869528778826136,
"grad_norm": 19.19011688232422,
"learning_rate": 1.3368328958880142e-05,
"loss": 1.5617,
"step": 783
},
{
"epoch": 0.3391278488199194,
"grad_norm": 19.17144203186035,
"learning_rate": 1.3359580052493439e-05,
"loss": 1.6607,
"step": 784
},
{
"epoch": 0.3395604098515775,
"grad_norm": 18.61704444885254,
"learning_rate": 1.3350831146106738e-05,
"loss": 1.612,
"step": 785
},
{
"epoch": 0.33999297088323555,
"grad_norm": 20.21958351135254,
"learning_rate": 1.3342082239720036e-05,
"loss": 1.5809,
"step": 786
},
{
"epoch": 0.3404255319148936,
"grad_norm": 19.189098358154297,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.5807,
"step": 787
},
{
"epoch": 0.34085809294655167,
"grad_norm": 22.04606819152832,
"learning_rate": 1.3324584426946633e-05,
"loss": 1.6273,
"step": 788
},
{
"epoch": 0.34129065397820973,
"grad_norm": 19.73443031311035,
"learning_rate": 1.3315835520559932e-05,
"loss": 1.6006,
"step": 789
},
{
"epoch": 0.3417232150098678,
"grad_norm": 20.105873107910156,
"learning_rate": 1.3307086614173231e-05,
"loss": 1.573,
"step": 790
},
{
"epoch": 0.34215577604152586,
"grad_norm": 20.053709030151367,
"learning_rate": 1.3298337707786528e-05,
"loss": 1.5847,
"step": 791
},
{
"epoch": 0.3425883370731839,
"grad_norm": 17.113309860229492,
"learning_rate": 1.3289588801399826e-05,
"loss": 1.5184,
"step": 792
},
{
"epoch": 0.343020898104842,
"grad_norm": 17.02407455444336,
"learning_rate": 1.3280839895013125e-05,
"loss": 1.5491,
"step": 793
},
{
"epoch": 0.34345345913650005,
"grad_norm": 19.07799530029297,
"learning_rate": 1.3272090988626423e-05,
"loss": 1.6541,
"step": 794
},
{
"epoch": 0.3438860201681581,
"grad_norm": 19.018844604492188,
"learning_rate": 1.326334208223972e-05,
"loss": 1.5478,
"step": 795
},
{
"epoch": 0.3443185811998162,
"grad_norm": 18.546709060668945,
"learning_rate": 1.325459317585302e-05,
"loss": 1.6061,
"step": 796
},
{
"epoch": 0.34475114223147424,
"grad_norm": 20.176528930664062,
"learning_rate": 1.3245844269466317e-05,
"loss": 1.7127,
"step": 797
},
{
"epoch": 0.3451837032631323,
"grad_norm": 20.838176727294922,
"learning_rate": 1.3237095363079614e-05,
"loss": 1.6132,
"step": 798
},
{
"epoch": 0.34561626429479037,
"grad_norm": 17.911251068115234,
"learning_rate": 1.3228346456692915e-05,
"loss": 1.659,
"step": 799
},
{
"epoch": 0.34604882532644843,
"grad_norm": 20.308305740356445,
"learning_rate": 1.3219597550306213e-05,
"loss": 1.5575,
"step": 800
},
{
"epoch": 0.3464813863581065,
"grad_norm": 18.12563133239746,
"learning_rate": 1.3210848643919512e-05,
"loss": 1.7155,
"step": 801
},
{
"epoch": 0.3469139473897645,
"grad_norm": 18.247697830200195,
"learning_rate": 1.320209973753281e-05,
"loss": 1.6309,
"step": 802
},
{
"epoch": 0.34734650842142256,
"grad_norm": 20.070714950561523,
"learning_rate": 1.3193350831146107e-05,
"loss": 1.5833,
"step": 803
},
{
"epoch": 0.3477790694530806,
"grad_norm": 22.178564071655273,
"learning_rate": 1.3184601924759406e-05,
"loss": 1.6604,
"step": 804
},
{
"epoch": 0.3482116304847387,
"grad_norm": 19.786285400390625,
"learning_rate": 1.3175853018372704e-05,
"loss": 1.5634,
"step": 805
},
{
"epoch": 0.34864419151639675,
"grad_norm": 18.314327239990234,
"learning_rate": 1.3167104111986003e-05,
"loss": 1.657,
"step": 806
},
{
"epoch": 0.3490767525480548,
"grad_norm": 20.814380645751953,
"learning_rate": 1.31583552055993e-05,
"loss": 1.6143,
"step": 807
},
{
"epoch": 0.3495093135797129,
"grad_norm": 19.84009552001953,
"learning_rate": 1.3149606299212601e-05,
"loss": 1.6296,
"step": 808
},
{
"epoch": 0.34994187461137094,
"grad_norm": 18.69206428527832,
"learning_rate": 1.3140857392825899e-05,
"loss": 1.6647,
"step": 809
},
{
"epoch": 0.350374435643029,
"grad_norm": 19.76461410522461,
"learning_rate": 1.3132108486439196e-05,
"loss": 1.5763,
"step": 810
},
{
"epoch": 0.35080699667468707,
"grad_norm": 19.293476104736328,
"learning_rate": 1.3123359580052496e-05,
"loss": 1.5856,
"step": 811
},
{
"epoch": 0.35123955770634513,
"grad_norm": 20.5203914642334,
"learning_rate": 1.3114610673665793e-05,
"loss": 1.5266,
"step": 812
},
{
"epoch": 0.3516721187380032,
"grad_norm": 20.248397827148438,
"learning_rate": 1.310586176727909e-05,
"loss": 1.6702,
"step": 813
},
{
"epoch": 0.35210467976966126,
"grad_norm": 16.614885330200195,
"learning_rate": 1.309711286089239e-05,
"loss": 1.6621,
"step": 814
},
{
"epoch": 0.3525372408013193,
"grad_norm": 16.071029663085938,
"learning_rate": 1.3088363954505687e-05,
"loss": 1.6654,
"step": 815
},
{
"epoch": 0.3529698018329774,
"grad_norm": 18.72723960876465,
"learning_rate": 1.3079615048118985e-05,
"loss": 1.6519,
"step": 816
},
{
"epoch": 0.35340236286463544,
"grad_norm": 18.771820068359375,
"learning_rate": 1.3070866141732284e-05,
"loss": 1.688,
"step": 817
},
{
"epoch": 0.3538349238962935,
"grad_norm": 17.976329803466797,
"learning_rate": 1.3062117235345583e-05,
"loss": 1.6781,
"step": 818
},
{
"epoch": 0.35426748492795157,
"grad_norm": 19.907297134399414,
"learning_rate": 1.3053368328958883e-05,
"loss": 1.623,
"step": 819
},
{
"epoch": 0.35470004595960963,
"grad_norm": 19.37649154663086,
"learning_rate": 1.304461942257218e-05,
"loss": 1.6589,
"step": 820
},
{
"epoch": 0.3551326069912677,
"grad_norm": 21.129186630249023,
"learning_rate": 1.3035870516185478e-05,
"loss": 1.5147,
"step": 821
},
{
"epoch": 0.35556516802292576,
"grad_norm": 20.957626342773438,
"learning_rate": 1.3027121609798777e-05,
"loss": 1.6448,
"step": 822
},
{
"epoch": 0.3559977290545838,
"grad_norm": 21.617694854736328,
"learning_rate": 1.3018372703412074e-05,
"loss": 1.5604,
"step": 823
},
{
"epoch": 0.35643029008624183,
"grad_norm": 21.156492233276367,
"learning_rate": 1.3009623797025372e-05,
"loss": 1.6346,
"step": 824
},
{
"epoch": 0.3568628511178999,
"grad_norm": 20.160831451416016,
"learning_rate": 1.3000874890638671e-05,
"loss": 1.5343,
"step": 825
},
{
"epoch": 0.35729541214955796,
"grad_norm": 21.976581573486328,
"learning_rate": 1.2992125984251968e-05,
"loss": 1.615,
"step": 826
},
{
"epoch": 0.357727973181216,
"grad_norm": 21.678958892822266,
"learning_rate": 1.2983377077865266e-05,
"loss": 1.5747,
"step": 827
},
{
"epoch": 0.3581605342128741,
"grad_norm": 22.03537940979004,
"learning_rate": 1.2974628171478567e-05,
"loss": 1.5686,
"step": 828
},
{
"epoch": 0.35859309524453215,
"grad_norm": 18.182485580444336,
"learning_rate": 1.2965879265091864e-05,
"loss": 1.6549,
"step": 829
},
{
"epoch": 0.3590256562761902,
"grad_norm": 18.341358184814453,
"learning_rate": 1.2957130358705164e-05,
"loss": 1.7054,
"step": 830
},
{
"epoch": 0.35945821730784827,
"grad_norm": 20.54411506652832,
"learning_rate": 1.2948381452318461e-05,
"loss": 1.6214,
"step": 831
},
{
"epoch": 0.35989077833950633,
"grad_norm": 17.28512954711914,
"learning_rate": 1.293963254593176e-05,
"loss": 1.6161,
"step": 832
},
{
"epoch": 0.3603233393711644,
"grad_norm": 20.21026039123535,
"learning_rate": 1.2930883639545058e-05,
"loss": 1.5949,
"step": 833
},
{
"epoch": 0.36075590040282246,
"grad_norm": 20.83098030090332,
"learning_rate": 1.2922134733158355e-05,
"loss": 1.556,
"step": 834
},
{
"epoch": 0.3611884614344805,
"grad_norm": 19.548084259033203,
"learning_rate": 1.2913385826771655e-05,
"loss": 1.6087,
"step": 835
},
{
"epoch": 0.3616210224661386,
"grad_norm": 19.833934783935547,
"learning_rate": 1.2904636920384952e-05,
"loss": 1.6511,
"step": 836
},
{
"epoch": 0.36205358349779665,
"grad_norm": 24.85418701171875,
"learning_rate": 1.2895888013998253e-05,
"loss": 1.6686,
"step": 837
},
{
"epoch": 0.3624861445294547,
"grad_norm": 20.29999542236328,
"learning_rate": 1.288713910761155e-05,
"loss": 1.6503,
"step": 838
},
{
"epoch": 0.3629187055611128,
"grad_norm": 18.538761138916016,
"learning_rate": 1.2878390201224848e-05,
"loss": 1.5934,
"step": 839
},
{
"epoch": 0.36335126659277084,
"grad_norm": 20.594356536865234,
"learning_rate": 1.2869641294838147e-05,
"loss": 1.6694,
"step": 840
},
{
"epoch": 0.3637838276244289,
"grad_norm": 18.274131774902344,
"learning_rate": 1.2860892388451445e-05,
"loss": 1.6111,
"step": 841
},
{
"epoch": 0.36421638865608696,
"grad_norm": 19.532379150390625,
"learning_rate": 1.2852143482064742e-05,
"loss": 1.5865,
"step": 842
},
{
"epoch": 0.364648949687745,
"grad_norm": 19.116188049316406,
"learning_rate": 1.2843394575678041e-05,
"loss": 1.5406,
"step": 843
},
{
"epoch": 0.3650815107194031,
"grad_norm": 21.525840759277344,
"learning_rate": 1.2834645669291339e-05,
"loss": 1.5579,
"step": 844
},
{
"epoch": 0.36551407175106115,
"grad_norm": 19.764007568359375,
"learning_rate": 1.2825896762904636e-05,
"loss": 1.5497,
"step": 845
},
{
"epoch": 0.36594663278271916,
"grad_norm": 20.76372528076172,
"learning_rate": 1.2817147856517936e-05,
"loss": 1.5076,
"step": 846
},
{
"epoch": 0.3663791938143772,
"grad_norm": 26.738079071044922,
"learning_rate": 1.2808398950131235e-05,
"loss": 1.5783,
"step": 847
},
{
"epoch": 0.3668117548460353,
"grad_norm": 21.190170288085938,
"learning_rate": 1.2799650043744534e-05,
"loss": 1.4697,
"step": 848
},
{
"epoch": 0.36724431587769335,
"grad_norm": 21.188650131225586,
"learning_rate": 1.2790901137357832e-05,
"loss": 1.6,
"step": 849
},
{
"epoch": 0.3676768769093514,
"grad_norm": 20.70604705810547,
"learning_rate": 1.2782152230971129e-05,
"loss": 1.6808,
"step": 850
},
{
"epoch": 0.3681094379410095,
"grad_norm": 20.689743041992188,
"learning_rate": 1.2773403324584428e-05,
"loss": 1.5621,
"step": 851
},
{
"epoch": 0.36854199897266754,
"grad_norm": 20.194637298583984,
"learning_rate": 1.2764654418197726e-05,
"loss": 1.6808,
"step": 852
},
{
"epoch": 0.3689745600043256,
"grad_norm": 19.224369049072266,
"learning_rate": 1.2755905511811025e-05,
"loss": 1.6184,
"step": 853
},
{
"epoch": 0.36940712103598367,
"grad_norm": 19.203989028930664,
"learning_rate": 1.2747156605424323e-05,
"loss": 1.5306,
"step": 854
},
{
"epoch": 0.36983968206764173,
"grad_norm": 20.381193161010742,
"learning_rate": 1.273840769903762e-05,
"loss": 1.6061,
"step": 855
},
{
"epoch": 0.3702722430992998,
"grad_norm": 19.450544357299805,
"learning_rate": 1.2729658792650921e-05,
"loss": 1.6247,
"step": 856
},
{
"epoch": 0.37070480413095785,
"grad_norm": 21.83024787902832,
"learning_rate": 1.2720909886264218e-05,
"loss": 1.5597,
"step": 857
},
{
"epoch": 0.3711373651626159,
"grad_norm": 21.240888595581055,
"learning_rate": 1.2712160979877518e-05,
"loss": 1.604,
"step": 858
},
{
"epoch": 0.371569926194274,
"grad_norm": 21.858312606811523,
"learning_rate": 1.2703412073490815e-05,
"loss": 1.5974,
"step": 859
},
{
"epoch": 0.37200248722593204,
"grad_norm": 19.26487159729004,
"learning_rate": 1.2694663167104113e-05,
"loss": 1.7254,
"step": 860
},
{
"epoch": 0.3724350482575901,
"grad_norm": 22.703330993652344,
"learning_rate": 1.2685914260717412e-05,
"loss": 1.6511,
"step": 861
},
{
"epoch": 0.37286760928924817,
"grad_norm": 20.9957218170166,
"learning_rate": 1.267716535433071e-05,
"loss": 1.5367,
"step": 862
},
{
"epoch": 0.37330017032090623,
"grad_norm": 19.078935623168945,
"learning_rate": 1.2668416447944007e-05,
"loss": 1.5111,
"step": 863
},
{
"epoch": 0.3737327313525643,
"grad_norm": 20.05775260925293,
"learning_rate": 1.2659667541557306e-05,
"loss": 1.5996,
"step": 864
},
{
"epoch": 0.37416529238422236,
"grad_norm": 18.394546508789062,
"learning_rate": 1.2650918635170604e-05,
"loss": 1.6372,
"step": 865
},
{
"epoch": 0.3745978534158804,
"grad_norm": 19.035602569580078,
"learning_rate": 1.2642169728783905e-05,
"loss": 1.6267,
"step": 866
},
{
"epoch": 0.3750304144475385,
"grad_norm": 19.66758918762207,
"learning_rate": 1.2633420822397202e-05,
"loss": 1.5181,
"step": 867
},
{
"epoch": 0.3754629754791965,
"grad_norm": 19.294878005981445,
"learning_rate": 1.26246719160105e-05,
"loss": 1.6038,
"step": 868
},
{
"epoch": 0.37589553651085456,
"grad_norm": 21.874311447143555,
"learning_rate": 1.2615923009623799e-05,
"loss": 1.5302,
"step": 869
},
{
"epoch": 0.3763280975425126,
"grad_norm": 20.003787994384766,
"learning_rate": 1.2607174103237096e-05,
"loss": 1.5961,
"step": 870
},
{
"epoch": 0.3767606585741707,
"grad_norm": 19.584362030029297,
"learning_rate": 1.2598425196850394e-05,
"loss": 1.6231,
"step": 871
},
{
"epoch": 0.37719321960582874,
"grad_norm": 18.144039154052734,
"learning_rate": 1.2589676290463693e-05,
"loss": 1.4876,
"step": 872
},
{
"epoch": 0.3776257806374868,
"grad_norm": 20.27103042602539,
"learning_rate": 1.258092738407699e-05,
"loss": 1.5783,
"step": 873
},
{
"epoch": 0.37805834166914487,
"grad_norm": 24.131301879882812,
"learning_rate": 1.2572178477690288e-05,
"loss": 1.6346,
"step": 874
},
{
"epoch": 0.37849090270080293,
"grad_norm": 25.75758934020996,
"learning_rate": 1.2563429571303587e-05,
"loss": 1.7327,
"step": 875
},
{
"epoch": 0.378923463732461,
"grad_norm": 22.106565475463867,
"learning_rate": 1.2554680664916886e-05,
"loss": 1.618,
"step": 876
},
{
"epoch": 0.37935602476411906,
"grad_norm": 19.74208641052246,
"learning_rate": 1.2545931758530186e-05,
"loss": 1.5391,
"step": 877
},
{
"epoch": 0.3797885857957771,
"grad_norm": 22.241756439208984,
"learning_rate": 1.2537182852143483e-05,
"loss": 1.597,
"step": 878
},
{
"epoch": 0.3802211468274352,
"grad_norm": 18.778261184692383,
"learning_rate": 1.2528433945756782e-05,
"loss": 1.5784,
"step": 879
},
{
"epoch": 0.38065370785909325,
"grad_norm": 20.45719337463379,
"learning_rate": 1.251968503937008e-05,
"loss": 1.5788,
"step": 880
},
{
"epoch": 0.3810862688907513,
"grad_norm": 23.036163330078125,
"learning_rate": 1.2510936132983377e-05,
"loss": 1.7009,
"step": 881
},
{
"epoch": 0.3815188299224094,
"grad_norm": 20.001171112060547,
"learning_rate": 1.2502187226596677e-05,
"loss": 1.4824,
"step": 882
},
{
"epoch": 0.38195139095406744,
"grad_norm": 19.137182235717773,
"learning_rate": 1.2493438320209974e-05,
"loss": 1.5866,
"step": 883
},
{
"epoch": 0.3823839519857255,
"grad_norm": 21.097896575927734,
"learning_rate": 1.2484689413823272e-05,
"loss": 1.5977,
"step": 884
},
{
"epoch": 0.38281651301738356,
"grad_norm": 20.685922622680664,
"learning_rate": 1.2475940507436573e-05,
"loss": 1.6055,
"step": 885
},
{
"epoch": 0.3832490740490416,
"grad_norm": 19.775070190429688,
"learning_rate": 1.246719160104987e-05,
"loss": 1.6436,
"step": 886
},
{
"epoch": 0.3836816350806997,
"grad_norm": 20.196359634399414,
"learning_rate": 1.245844269466317e-05,
"loss": 1.5265,
"step": 887
},
{
"epoch": 0.38411419611235775,
"grad_norm": 19.682497024536133,
"learning_rate": 1.2449693788276467e-05,
"loss": 1.5624,
"step": 888
},
{
"epoch": 0.3845467571440158,
"grad_norm": 20.493722915649414,
"learning_rate": 1.2440944881889764e-05,
"loss": 1.6223,
"step": 889
},
{
"epoch": 0.3849793181756738,
"grad_norm": 19.67377281188965,
"learning_rate": 1.2432195975503064e-05,
"loss": 1.6164,
"step": 890
},
{
"epoch": 0.3854118792073319,
"grad_norm": 18.345895767211914,
"learning_rate": 1.2423447069116361e-05,
"loss": 1.6155,
"step": 891
},
{
"epoch": 0.38584444023898995,
"grad_norm": 18.30042266845703,
"learning_rate": 1.2414698162729659e-05,
"loss": 1.5257,
"step": 892
},
{
"epoch": 0.386277001270648,
"grad_norm": 19.78774070739746,
"learning_rate": 1.2405949256342958e-05,
"loss": 1.716,
"step": 893
},
{
"epoch": 0.3867095623023061,
"grad_norm": 21.33734130859375,
"learning_rate": 1.2397200349956255e-05,
"loss": 1.5729,
"step": 894
},
{
"epoch": 0.38714212333396414,
"grad_norm": 18.871843338012695,
"learning_rate": 1.2388451443569556e-05,
"loss": 1.6478,
"step": 895
},
{
"epoch": 0.3875746843656222,
"grad_norm": 20.553613662719727,
"learning_rate": 1.2379702537182854e-05,
"loss": 1.6663,
"step": 896
},
{
"epoch": 0.38800724539728026,
"grad_norm": 19.660924911499023,
"learning_rate": 1.2370953630796151e-05,
"loss": 1.6294,
"step": 897
},
{
"epoch": 0.38843980642893833,
"grad_norm": 17.97296714782715,
"learning_rate": 1.236220472440945e-05,
"loss": 1.5321,
"step": 898
},
{
"epoch": 0.3888723674605964,
"grad_norm": 20.643564224243164,
"learning_rate": 1.2353455818022748e-05,
"loss": 1.5242,
"step": 899
},
{
"epoch": 0.38930492849225445,
"grad_norm": 19.187049865722656,
"learning_rate": 1.2344706911636047e-05,
"loss": 1.5819,
"step": 900
},
{
"epoch": 0.3897374895239125,
"grad_norm": 18.788549423217773,
"learning_rate": 1.2335958005249345e-05,
"loss": 1.6182,
"step": 901
},
{
"epoch": 0.3901700505555706,
"grad_norm": 20.54323959350586,
"learning_rate": 1.2327209098862642e-05,
"loss": 1.6235,
"step": 902
},
{
"epoch": 0.39060261158722864,
"grad_norm": 20.15620231628418,
"learning_rate": 1.2318460192475941e-05,
"loss": 1.5536,
"step": 903
},
{
"epoch": 0.3910351726188867,
"grad_norm": 18.571413040161133,
"learning_rate": 1.230971128608924e-05,
"loss": 1.6502,
"step": 904
},
{
"epoch": 0.39146773365054477,
"grad_norm": 19.422834396362305,
"learning_rate": 1.230096237970254e-05,
"loss": 1.5417,
"step": 905
},
{
"epoch": 0.39190029468220283,
"grad_norm": 20.704635620117188,
"learning_rate": 1.2292213473315837e-05,
"loss": 1.6313,
"step": 906
},
{
"epoch": 0.3923328557138609,
"grad_norm": 17.548553466796875,
"learning_rate": 1.2283464566929135e-05,
"loss": 1.6678,
"step": 907
},
{
"epoch": 0.39276541674551896,
"grad_norm": 19.540618896484375,
"learning_rate": 1.2274715660542434e-05,
"loss": 1.5068,
"step": 908
},
{
"epoch": 0.393197977777177,
"grad_norm": 18.6761531829834,
"learning_rate": 1.2265966754155732e-05,
"loss": 1.5907,
"step": 909
},
{
"epoch": 0.3936305388088351,
"grad_norm": 19.963176727294922,
"learning_rate": 1.2257217847769029e-05,
"loss": 1.5861,
"step": 910
},
{
"epoch": 0.39406309984049315,
"grad_norm": 20.782676696777344,
"learning_rate": 1.2248468941382328e-05,
"loss": 1.5452,
"step": 911
},
{
"epoch": 0.39449566087215115,
"grad_norm": 20.450653076171875,
"learning_rate": 1.2239720034995626e-05,
"loss": 1.5404,
"step": 912
},
{
"epoch": 0.3949282219038092,
"grad_norm": 18.47747230529785,
"learning_rate": 1.2230971128608923e-05,
"loss": 1.6799,
"step": 913
},
{
"epoch": 0.3953607829354673,
"grad_norm": 21.109474182128906,
"learning_rate": 1.2222222222222224e-05,
"loss": 1.5815,
"step": 914
},
{
"epoch": 0.39579334396712534,
"grad_norm": 20.80393409729004,
"learning_rate": 1.2213473315835522e-05,
"loss": 1.6559,
"step": 915
},
{
"epoch": 0.3962259049987834,
"grad_norm": 21.14255714416504,
"learning_rate": 1.2204724409448821e-05,
"loss": 1.6067,
"step": 916
},
{
"epoch": 0.39665846603044147,
"grad_norm": 18.84864616394043,
"learning_rate": 1.2195975503062118e-05,
"loss": 1.5728,
"step": 917
},
{
"epoch": 0.39709102706209953,
"grad_norm": 21.547277450561523,
"learning_rate": 1.2187226596675416e-05,
"loss": 1.5919,
"step": 918
},
{
"epoch": 0.3975235880937576,
"grad_norm": 20.30232048034668,
"learning_rate": 1.2178477690288715e-05,
"loss": 1.5967,
"step": 919
},
{
"epoch": 0.39795614912541566,
"grad_norm": 19.81118392944336,
"learning_rate": 1.2169728783902013e-05,
"loss": 1.5498,
"step": 920
},
{
"epoch": 0.3983887101570737,
"grad_norm": 20.070619583129883,
"learning_rate": 1.216097987751531e-05,
"loss": 1.5291,
"step": 921
},
{
"epoch": 0.3988212711887318,
"grad_norm": 18.752845764160156,
"learning_rate": 1.215223097112861e-05,
"loss": 1.595,
"step": 922
},
{
"epoch": 0.39925383222038985,
"grad_norm": 20.85921287536621,
"learning_rate": 1.2143482064741907e-05,
"loss": 1.4615,
"step": 923
},
{
"epoch": 0.3996863932520479,
"grad_norm": 20.928272247314453,
"learning_rate": 1.2134733158355208e-05,
"loss": 1.5868,
"step": 924
},
{
"epoch": 0.400118954283706,
"grad_norm": 21.330936431884766,
"learning_rate": 1.2125984251968505e-05,
"loss": 1.5825,
"step": 925
},
{
"epoch": 0.40055151531536404,
"grad_norm": 19.58514404296875,
"learning_rate": 1.2117235345581804e-05,
"loss": 1.6101,
"step": 926
},
{
"epoch": 0.4009840763470221,
"grad_norm": 19.794700622558594,
"learning_rate": 1.2108486439195102e-05,
"loss": 1.5484,
"step": 927
},
{
"epoch": 0.40141663737868016,
"grad_norm": 20.019779205322266,
"learning_rate": 1.20997375328084e-05,
"loss": 1.6467,
"step": 928
},
{
"epoch": 0.4018491984103382,
"grad_norm": 18.108291625976562,
"learning_rate": 1.2090988626421699e-05,
"loss": 1.568,
"step": 929
},
{
"epoch": 0.4022817594419963,
"grad_norm": 20.311721801757812,
"learning_rate": 1.2082239720034996e-05,
"loss": 1.4256,
"step": 930
},
{
"epoch": 0.40271432047365435,
"grad_norm": 20.440967559814453,
"learning_rate": 1.2073490813648294e-05,
"loss": 1.5777,
"step": 931
},
{
"epoch": 0.4031468815053124,
"grad_norm": 20.755502700805664,
"learning_rate": 1.2064741907261593e-05,
"loss": 1.5757,
"step": 932
},
{
"epoch": 0.4035794425369704,
"grad_norm": 21.64055633544922,
"learning_rate": 1.2055993000874892e-05,
"loss": 1.6168,
"step": 933
},
{
"epoch": 0.4040120035686285,
"grad_norm": 19.491615295410156,
"learning_rate": 1.2047244094488191e-05,
"loss": 1.5655,
"step": 934
},
{
"epoch": 0.40444456460028655,
"grad_norm": 19.454198837280273,
"learning_rate": 1.2038495188101489e-05,
"loss": 1.5896,
"step": 935
},
{
"epoch": 0.4048771256319446,
"grad_norm": 20.64267349243164,
"learning_rate": 1.2029746281714786e-05,
"loss": 1.5161,
"step": 936
},
{
"epoch": 0.4053096866636027,
"grad_norm": 23.738571166992188,
"learning_rate": 1.2020997375328086e-05,
"loss": 1.7056,
"step": 937
},
{
"epoch": 0.40574224769526074,
"grad_norm": 22.766353607177734,
"learning_rate": 1.2012248468941383e-05,
"loss": 1.5408,
"step": 938
},
{
"epoch": 0.4061748087269188,
"grad_norm": 21.712682723999023,
"learning_rate": 1.200349956255468e-05,
"loss": 1.5679,
"step": 939
},
{
"epoch": 0.40660736975857686,
"grad_norm": 20.704023361206055,
"learning_rate": 1.199475065616798e-05,
"loss": 1.5291,
"step": 940
},
{
"epoch": 0.4070399307902349,
"grad_norm": 21.490806579589844,
"learning_rate": 1.1986001749781277e-05,
"loss": 1.5927,
"step": 941
},
{
"epoch": 0.407472491821893,
"grad_norm": 18.206605911254883,
"learning_rate": 1.1977252843394575e-05,
"loss": 1.5576,
"step": 942
},
{
"epoch": 0.40790505285355105,
"grad_norm": 19.448074340820312,
"learning_rate": 1.1968503937007876e-05,
"loss": 1.5126,
"step": 943
},
{
"epoch": 0.4083376138852091,
"grad_norm": 19.33980369567871,
"learning_rate": 1.1959755030621173e-05,
"loss": 1.5412,
"step": 944
},
{
"epoch": 0.4087701749168672,
"grad_norm": 20.316843032836914,
"learning_rate": 1.1951006124234472e-05,
"loss": 1.697,
"step": 945
},
{
"epoch": 0.40920273594852524,
"grad_norm": 19.3831787109375,
"learning_rate": 1.194225721784777e-05,
"loss": 1.5501,
"step": 946
},
{
"epoch": 0.4096352969801833,
"grad_norm": 23.11265754699707,
"learning_rate": 1.193350831146107e-05,
"loss": 1.6124,
"step": 947
},
{
"epoch": 0.41006785801184137,
"grad_norm": 23.18746566772461,
"learning_rate": 1.1924759405074367e-05,
"loss": 1.61,
"step": 948
},
{
"epoch": 0.41050041904349943,
"grad_norm": 20.464345932006836,
"learning_rate": 1.1916010498687664e-05,
"loss": 1.6349,
"step": 949
},
{
"epoch": 0.4109329800751575,
"grad_norm": 18.070871353149414,
"learning_rate": 1.1907261592300963e-05,
"loss": 1.6273,
"step": 950
},
{
"epoch": 0.41136554110681556,
"grad_norm": 19.049097061157227,
"learning_rate": 1.1898512685914261e-05,
"loss": 1.4902,
"step": 951
},
{
"epoch": 0.4117981021384736,
"grad_norm": 20.535675048828125,
"learning_rate": 1.1889763779527562e-05,
"loss": 1.5897,
"step": 952
},
{
"epoch": 0.4122306631701317,
"grad_norm": 19.585886001586914,
"learning_rate": 1.188101487314086e-05,
"loss": 1.6457,
"step": 953
},
{
"epoch": 0.41266322420178975,
"grad_norm": 19.01354217529297,
"learning_rate": 1.1872265966754157e-05,
"loss": 1.6315,
"step": 954
},
{
"epoch": 0.41309578523344775,
"grad_norm": 21.235271453857422,
"learning_rate": 1.1863517060367456e-05,
"loss": 1.4923,
"step": 955
},
{
"epoch": 0.4135283462651058,
"grad_norm": 19.011507034301758,
"learning_rate": 1.1854768153980754e-05,
"loss": 1.5508,
"step": 956
},
{
"epoch": 0.4139609072967639,
"grad_norm": 20.91914176940918,
"learning_rate": 1.1846019247594051e-05,
"loss": 1.5762,
"step": 957
},
{
"epoch": 0.41439346832842194,
"grad_norm": 23.34760284423828,
"learning_rate": 1.183727034120735e-05,
"loss": 1.5603,
"step": 958
},
{
"epoch": 0.41482602936008,
"grad_norm": 23.663442611694336,
"learning_rate": 1.1828521434820648e-05,
"loss": 1.607,
"step": 959
},
{
"epoch": 0.41525859039173807,
"grad_norm": 18.84950065612793,
"learning_rate": 1.1819772528433945e-05,
"loss": 1.5646,
"step": 960
},
{
"epoch": 0.41569115142339613,
"grad_norm": 20.305644989013672,
"learning_rate": 1.1811023622047245e-05,
"loss": 1.5428,
"step": 961
},
{
"epoch": 0.4161237124550542,
"grad_norm": 19.10517692565918,
"learning_rate": 1.1802274715660544e-05,
"loss": 1.5305,
"step": 962
},
{
"epoch": 0.41655627348671226,
"grad_norm": 22.100454330444336,
"learning_rate": 1.1793525809273843e-05,
"loss": 1.5978,
"step": 963
},
{
"epoch": 0.4169888345183703,
"grad_norm": 19.938411712646484,
"learning_rate": 1.178477690288714e-05,
"loss": 1.5133,
"step": 964
},
{
"epoch": 0.4174213955500284,
"grad_norm": 20.516897201538086,
"learning_rate": 1.1776027996500438e-05,
"loss": 1.6113,
"step": 965
},
{
"epoch": 0.41785395658168645,
"grad_norm": 19.226377487182617,
"learning_rate": 1.1767279090113737e-05,
"loss": 1.5843,
"step": 966
},
{
"epoch": 0.4182865176133445,
"grad_norm": 25.08070182800293,
"learning_rate": 1.1758530183727035e-05,
"loss": 1.6075,
"step": 967
},
{
"epoch": 0.4187190786450026,
"grad_norm": 20.30937385559082,
"learning_rate": 1.1749781277340332e-05,
"loss": 1.5732,
"step": 968
},
{
"epoch": 0.41915163967666064,
"grad_norm": 20.013835906982422,
"learning_rate": 1.1741032370953631e-05,
"loss": 1.6688,
"step": 969
},
{
"epoch": 0.4195842007083187,
"grad_norm": 21.13936424255371,
"learning_rate": 1.1732283464566929e-05,
"loss": 1.5732,
"step": 970
},
{
"epoch": 0.42001676173997676,
"grad_norm": 20.149782180786133,
"learning_rate": 1.1723534558180228e-05,
"loss": 1.5247,
"step": 971
},
{
"epoch": 0.4204493227716348,
"grad_norm": 22.26946449279785,
"learning_rate": 1.1714785651793527e-05,
"loss": 1.5106,
"step": 972
},
{
"epoch": 0.4208818838032929,
"grad_norm": 20.233417510986328,
"learning_rate": 1.1706036745406827e-05,
"loss": 1.5702,
"step": 973
},
{
"epoch": 0.42131444483495095,
"grad_norm": 20.490550994873047,
"learning_rate": 1.1697287839020124e-05,
"loss": 1.6082,
"step": 974
},
{
"epoch": 0.421747005866609,
"grad_norm": 19.094743728637695,
"learning_rate": 1.1688538932633422e-05,
"loss": 1.5653,
"step": 975
},
{
"epoch": 0.4221795668982671,
"grad_norm": 18.951610565185547,
"learning_rate": 1.167979002624672e-05,
"loss": 1.5294,
"step": 976
},
{
"epoch": 0.4226121279299251,
"grad_norm": 19.141788482666016,
"learning_rate": 1.1671041119860018e-05,
"loss": 1.5549,
"step": 977
},
{
"epoch": 0.42304468896158315,
"grad_norm": 22.37894630432129,
"learning_rate": 1.1662292213473316e-05,
"loss": 1.5739,
"step": 978
},
{
"epoch": 0.4234772499932412,
"grad_norm": 21.99265480041504,
"learning_rate": 1.1653543307086615e-05,
"loss": 1.5732,
"step": 979
},
{
"epoch": 0.4239098110248993,
"grad_norm": 21.509435653686523,
"learning_rate": 1.1644794400699913e-05,
"loss": 1.6226,
"step": 980
},
{
"epoch": 0.42434237205655734,
"grad_norm": 21.833322525024414,
"learning_rate": 1.1636045494313213e-05,
"loss": 1.561,
"step": 981
},
{
"epoch": 0.4247749330882154,
"grad_norm": 20.158653259277344,
"learning_rate": 1.1627296587926511e-05,
"loss": 1.5053,
"step": 982
},
{
"epoch": 0.42520749411987346,
"grad_norm": 23.664594650268555,
"learning_rate": 1.1618547681539808e-05,
"loss": 1.6409,
"step": 983
},
{
"epoch": 0.4256400551515315,
"grad_norm": 20.181825637817383,
"learning_rate": 1.1609798775153108e-05,
"loss": 1.615,
"step": 984
},
{
"epoch": 0.4260726161831896,
"grad_norm": 23.865310668945312,
"learning_rate": 1.1601049868766405e-05,
"loss": 1.6681,
"step": 985
},
{
"epoch": 0.42650517721484765,
"grad_norm": 22.003345489501953,
"learning_rate": 1.1592300962379703e-05,
"loss": 1.594,
"step": 986
},
{
"epoch": 0.4269377382465057,
"grad_norm": 21.85173797607422,
"learning_rate": 1.1583552055993002e-05,
"loss": 1.5251,
"step": 987
},
{
"epoch": 0.4273702992781638,
"grad_norm": 22.519914627075195,
"learning_rate": 1.15748031496063e-05,
"loss": 1.6031,
"step": 988
},
{
"epoch": 0.42780286030982184,
"grad_norm": 18.701091766357422,
"learning_rate": 1.1566054243219597e-05,
"loss": 1.5812,
"step": 989
},
{
"epoch": 0.4282354213414799,
"grad_norm": 18.49114418029785,
"learning_rate": 1.1557305336832896e-05,
"loss": 1.573,
"step": 990
},
{
"epoch": 0.42866798237313797,
"grad_norm": 21.027111053466797,
"learning_rate": 1.1548556430446195e-05,
"loss": 1.5618,
"step": 991
},
{
"epoch": 0.42910054340479603,
"grad_norm": 22.11174964904785,
"learning_rate": 1.1539807524059495e-05,
"loss": 1.5999,
"step": 992
},
{
"epoch": 0.4295331044364541,
"grad_norm": 23.42853355407715,
"learning_rate": 1.1531058617672792e-05,
"loss": 1.5065,
"step": 993
},
{
"epoch": 0.42996566546811216,
"grad_norm": 20.36092185974121,
"learning_rate": 1.1522309711286091e-05,
"loss": 1.5454,
"step": 994
},
{
"epoch": 0.4303982264997702,
"grad_norm": 20.617124557495117,
"learning_rate": 1.1513560804899389e-05,
"loss": 1.6106,
"step": 995
},
{
"epoch": 0.4308307875314283,
"grad_norm": 21.008230209350586,
"learning_rate": 1.1504811898512686e-05,
"loss": 1.6075,
"step": 996
},
{
"epoch": 0.43126334856308635,
"grad_norm": 23.843276977539062,
"learning_rate": 1.1496062992125985e-05,
"loss": 1.5715,
"step": 997
},
{
"epoch": 0.4316959095947444,
"grad_norm": 23.46642303466797,
"learning_rate": 1.1487314085739283e-05,
"loss": 1.5887,
"step": 998
},
{
"epoch": 0.4321284706264024,
"grad_norm": 32.012420654296875,
"learning_rate": 1.147856517935258e-05,
"loss": 1.5999,
"step": 999
},
{
"epoch": 0.4325610316580605,
"grad_norm": 22.296770095825195,
"learning_rate": 1.1469816272965881e-05,
"loss": 1.5889,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 2311,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2407280868764634e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}