{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.08740303725554463,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 8.740303725554463e-05,
"grad_norm": 7.065422058105469,
"learning_rate": 0.0001,
"loss": 1.8318,
"step": 1
},
{
"epoch": 0.00017480607451108925,
"grad_norm": 12.618020057678223,
"learning_rate": 0.0002,
"loss": 2.6514,
"step": 2
},
{
"epoch": 0.0002622091117666339,
"grad_norm": 5.291403770446777,
"learning_rate": 0.0003,
"loss": 1.1527,
"step": 3
},
{
"epoch": 0.0003496121490221785,
"grad_norm": 0.6172698736190796,
"learning_rate": 0.0004,
"loss": 0.9539,
"step": 4
},
{
"epoch": 0.00043701518627772313,
"grad_norm": 2.0148203372955322,
"learning_rate": 0.0005,
"loss": 1.4452,
"step": 5
},
{
"epoch": 0.0005244182235332678,
"grad_norm": 8.47681999206543,
"learning_rate": 0.0004999562784190276,
"loss": 1.8725,
"step": 6
},
{
"epoch": 0.0006118212607888124,
"grad_norm": 1.3222665786743164,
"learning_rate": 0.0004999125568380553,
"loss": 1.4179,
"step": 7
},
{
"epoch": 0.000699224298044357,
"grad_norm": 2.153110980987549,
"learning_rate": 0.0004998688352570829,
"loss": 1.1031,
"step": 8
},
{
"epoch": 0.0007866273352999017,
"grad_norm": 1.160366415977478,
"learning_rate": 0.0004998251136761106,
"loss": 0.9552,
"step": 9
},
{
"epoch": 0.0008740303725554463,
"grad_norm": 0.7029749155044556,
"learning_rate": 0.0004997813920951382,
"loss": 1.0771,
"step": 10
},
{
"epoch": 0.000961433409810991,
"grad_norm": 0.7599214315414429,
"learning_rate": 0.0004997376705141658,
"loss": 1.0371,
"step": 11
},
{
"epoch": 0.0010488364470665355,
"grad_norm": 1.3291207551956177,
"learning_rate": 0.0004996939489331935,
"loss": 0.7945,
"step": 12
},
{
"epoch": 0.00113623948432208,
"grad_norm": 0.6687347888946533,
"learning_rate": 0.0004996502273522211,
"loss": 0.9751,
"step": 13
},
{
"epoch": 0.0012236425215776249,
"grad_norm": 0.5787840485572815,
"learning_rate": 0.0004996065057712488,
"loss": 1.234,
"step": 14
},
{
"epoch": 0.0013110455588331695,
"grad_norm": 0.8155117034912109,
"learning_rate": 0.0004995627841902764,
"loss": 1.2566,
"step": 15
},
{
"epoch": 0.001398448596088714,
"grad_norm": 0.5109673142433167,
"learning_rate": 0.0004995190626093039,
"loss": 0.8717,
"step": 16
},
{
"epoch": 0.0014858516333442588,
"grad_norm": 0.4625360667705536,
"learning_rate": 0.0004994753410283315,
"loss": 0.8922,
"step": 17
},
{
"epoch": 0.0015732546705998034,
"grad_norm": 0.714952826499939,
"learning_rate": 0.0004994316194473592,
"loss": 0.921,
"step": 18
},
{
"epoch": 0.001660657707855348,
"grad_norm": 0.48220372200012207,
"learning_rate": 0.0004993878978663869,
"loss": 1.0207,
"step": 19
},
{
"epoch": 0.0017480607451108925,
"grad_norm": 5.717684745788574,
"learning_rate": 0.0004993441762854145,
"loss": 1.3551,
"step": 20
},
{
"epoch": 0.0018354637823664373,
"grad_norm": 0.5429579615592957,
"learning_rate": 0.0004993004547044421,
"loss": 0.8929,
"step": 21
},
{
"epoch": 0.001922866819621982,
"grad_norm": 6.894193172454834,
"learning_rate": 0.0004992567331234697,
"loss": 1.2508,
"step": 22
},
{
"epoch": 0.0020102698568775267,
"grad_norm": 0.4427785277366638,
"learning_rate": 0.0004992130115424974,
"loss": 0.9662,
"step": 23
},
{
"epoch": 0.002097672894133071,
"grad_norm": 0.5576323866844177,
"learning_rate": 0.000499169289961525,
"loss": 1.0545,
"step": 24
},
{
"epoch": 0.002185075931388616,
"grad_norm": 1.3581053018569946,
"learning_rate": 0.0004991255683805527,
"loss": 1.1777,
"step": 25
},
{
"epoch": 0.00227247896864416,
"grad_norm": 0.609951376914978,
"learning_rate": 0.0004990818467995803,
"loss": 1.5921,
"step": 26
},
{
"epoch": 0.002359882005899705,
"grad_norm": 1.3641082048416138,
"learning_rate": 0.0004990381252186079,
"loss": 0.8309,
"step": 27
},
{
"epoch": 0.0024472850431552498,
"grad_norm": 0.5976356267929077,
"learning_rate": 0.0004989944036376356,
"loss": 0.828,
"step": 28
},
{
"epoch": 0.002534688080410794,
"grad_norm": 0.6889556646347046,
"learning_rate": 0.0004989506820566632,
"loss": 1.4536,
"step": 29
},
{
"epoch": 0.002622091117666339,
"grad_norm": 0.5091891884803772,
"learning_rate": 0.0004989069604756908,
"loss": 1.054,
"step": 30
},
{
"epoch": 0.0027094941549218837,
"grad_norm": 1.0312514305114746,
"learning_rate": 0.0004988632388947185,
"loss": 0.8454,
"step": 31
},
{
"epoch": 0.002796897192177428,
"grad_norm": 1.136455774307251,
"learning_rate": 0.000498819517313746,
"loss": 0.9365,
"step": 32
},
{
"epoch": 0.002884300229432973,
"grad_norm": 0.5671233534812927,
"learning_rate": 0.0004987757957327737,
"loss": 0.9139,
"step": 33
},
{
"epoch": 0.0029717032666885176,
"grad_norm": 0.38321638107299805,
"learning_rate": 0.0004987320741518013,
"loss": 0.9383,
"step": 34
},
{
"epoch": 0.003059106303944062,
"grad_norm": 0.49962496757507324,
"learning_rate": 0.0004986883525708289,
"loss": 1.1371,
"step": 35
},
{
"epoch": 0.003146509341199607,
"grad_norm": 0.4470585584640503,
"learning_rate": 0.0004986446309898566,
"loss": 1.2636,
"step": 36
},
{
"epoch": 0.003233912378455151,
"grad_norm": 0.4494791626930237,
"learning_rate": 0.0004986009094088842,
"loss": 0.8846,
"step": 37
},
{
"epoch": 0.003321315415710696,
"grad_norm": 1.8432437181472778,
"learning_rate": 0.0004985571878279119,
"loss": 1.0042,
"step": 38
},
{
"epoch": 0.0034087184529662407,
"grad_norm": 0.512199878692627,
"learning_rate": 0.0004985134662469395,
"loss": 0.9648,
"step": 39
},
{
"epoch": 0.003496121490221785,
"grad_norm": 0.7086130380630493,
"learning_rate": 0.0004984697446659671,
"loss": 0.8634,
"step": 40
},
{
"epoch": 0.00358352452747733,
"grad_norm": 0.34971296787261963,
"learning_rate": 0.0004984260230849947,
"loss": 1.1422,
"step": 41
},
{
"epoch": 0.0036709275647328747,
"grad_norm": 0.5125827193260193,
"learning_rate": 0.0004983823015040224,
"loss": 0.9885,
"step": 42
},
{
"epoch": 0.003758330601988419,
"grad_norm": 0.363505482673645,
"learning_rate": 0.0004983385799230501,
"loss": 0.9047,
"step": 43
},
{
"epoch": 0.003845733639243964,
"grad_norm": 0.36858850717544556,
"learning_rate": 0.0004982948583420777,
"loss": 0.8149,
"step": 44
},
{
"epoch": 0.003933136676499509,
"grad_norm": 0.3395627439022064,
"learning_rate": 0.0004982511367611053,
"loss": 0.6765,
"step": 45
},
{
"epoch": 0.004020539713755053,
"grad_norm": 0.8366663455963135,
"learning_rate": 0.0004982074151801329,
"loss": 1.4199,
"step": 46
},
{
"epoch": 0.004107942751010597,
"grad_norm": 0.4986715614795685,
"learning_rate": 0.0004981636935991606,
"loss": 1.0475,
"step": 47
},
{
"epoch": 0.004195345788266142,
"grad_norm": 0.39106953144073486,
"learning_rate": 0.0004981199720181882,
"loss": 0.8671,
"step": 48
},
{
"epoch": 0.004282748825521687,
"grad_norm": 1.129980206489563,
"learning_rate": 0.0004980762504372159,
"loss": 0.6251,
"step": 49
},
{
"epoch": 0.004370151862777232,
"grad_norm": 1.9613661766052246,
"learning_rate": 0.0004980325288562434,
"loss": 1.5782,
"step": 50
},
{
"epoch": 0.0044575549000327765,
"grad_norm": 0.3839377164840698,
"learning_rate": 0.000497988807275271,
"loss": 0.8171,
"step": 51
},
{
"epoch": 0.00454495793728832,
"grad_norm": 1.2072890996932983,
"learning_rate": 0.0004979450856942987,
"loss": 1.3112,
"step": 52
},
{
"epoch": 0.004632360974543865,
"grad_norm": 0.4228273630142212,
"learning_rate": 0.0004979013641133263,
"loss": 0.8507,
"step": 53
},
{
"epoch": 0.00471976401179941,
"grad_norm": 0.3379599452018738,
"learning_rate": 0.000497857642532354,
"loss": 0.9112,
"step": 54
},
{
"epoch": 0.004807167049054955,
"grad_norm": 0.4163492023944855,
"learning_rate": 0.0004978139209513816,
"loss": 0.9839,
"step": 55
},
{
"epoch": 0.0048945700863104995,
"grad_norm": 1.4194269180297852,
"learning_rate": 0.0004977701993704092,
"loss": 1.194,
"step": 56
},
{
"epoch": 0.004981973123566044,
"grad_norm": 0.8857583999633789,
"learning_rate": 0.0004977264777894369,
"loss": 0.9047,
"step": 57
},
{
"epoch": 0.005069376160821588,
"grad_norm": 0.8493141531944275,
"learning_rate": 0.0004976827562084645,
"loss": 0.921,
"step": 58
},
{
"epoch": 0.005156779198077133,
"grad_norm": 0.6385464668273926,
"learning_rate": 0.0004976390346274922,
"loss": 0.9945,
"step": 59
},
{
"epoch": 0.005244182235332678,
"grad_norm": 0.6642935872077942,
"learning_rate": 0.0004975953130465198,
"loss": 0.8654,
"step": 60
},
{
"epoch": 0.005331585272588223,
"grad_norm": 0.5619232654571533,
"learning_rate": 0.0004975515914655474,
"loss": 0.9012,
"step": 61
},
{
"epoch": 0.005418988309843767,
"grad_norm": 0.37755316495895386,
"learning_rate": 0.0004975078698845751,
"loss": 0.7285,
"step": 62
},
{
"epoch": 0.005506391347099311,
"grad_norm": 1.3131452798843384,
"learning_rate": 0.0004974641483036027,
"loss": 1.5863,
"step": 63
},
{
"epoch": 0.005593794384354856,
"grad_norm": 0.48203301429748535,
"learning_rate": 0.0004974204267226304,
"loss": 0.932,
"step": 64
},
{
"epoch": 0.005681197421610401,
"grad_norm": 1.7584421634674072,
"learning_rate": 0.000497376705141658,
"loss": 1.3908,
"step": 65
},
{
"epoch": 0.005768600458865946,
"grad_norm": 0.5197044610977173,
"learning_rate": 0.0004973329835606855,
"loss": 0.8429,
"step": 66
},
{
"epoch": 0.0058560034961214905,
"grad_norm": 1.9259709119796753,
"learning_rate": 0.0004972892619797131,
"loss": 0.9317,
"step": 67
},
{
"epoch": 0.005943406533377035,
"grad_norm": 1.0053375959396362,
"learning_rate": 0.0004972455403987408,
"loss": 0.9276,
"step": 68
},
{
"epoch": 0.006030809570632579,
"grad_norm": 85.76437377929688,
"learning_rate": 0.0004972018188177684,
"loss": 5.3967,
"step": 69
},
{
"epoch": 0.006118212607888124,
"grad_norm": 1.9150564670562744,
"learning_rate": 0.0004971580972367961,
"loss": 1.2467,
"step": 70
},
{
"epoch": 0.006205615645143669,
"grad_norm": 1.286971092224121,
"learning_rate": 0.0004971143756558237,
"loss": 1.055,
"step": 71
},
{
"epoch": 0.006293018682399214,
"grad_norm": 3.5728204250335693,
"learning_rate": 0.0004970706540748513,
"loss": 0.9154,
"step": 72
},
{
"epoch": 0.006380421719654758,
"grad_norm": 3.2489278316497803,
"learning_rate": 0.000497026932493879,
"loss": 1.0816,
"step": 73
},
{
"epoch": 0.006467824756910302,
"grad_norm": 0.7258114218711853,
"learning_rate": 0.0004969832109129066,
"loss": 0.8656,
"step": 74
},
{
"epoch": 0.006555227794165847,
"grad_norm": 1.0952316522598267,
"learning_rate": 0.0004969394893319343,
"loss": 0.9195,
"step": 75
},
{
"epoch": 0.006642630831421392,
"grad_norm": 5.054478645324707,
"learning_rate": 0.0004968957677509619,
"loss": 1.2343,
"step": 76
},
{
"epoch": 0.006730033868676937,
"grad_norm": 2.0239686965942383,
"learning_rate": 0.0004968520461699895,
"loss": 1.6315,
"step": 77
},
{
"epoch": 0.0068174369059324814,
"grad_norm": 1.3708548545837402,
"learning_rate": 0.0004968083245890172,
"loss": 0.8507,
"step": 78
},
{
"epoch": 0.006904839943188025,
"grad_norm": 0.6372014284133911,
"learning_rate": 0.0004967646030080448,
"loss": 0.9235,
"step": 79
},
{
"epoch": 0.00699224298044357,
"grad_norm": 1.0243886709213257,
"learning_rate": 0.0004967208814270724,
"loss": 1.0295,
"step": 80
},
{
"epoch": 0.007079646017699115,
"grad_norm": 0.6127680540084839,
"learning_rate": 0.0004966771598461001,
"loss": 0.8469,
"step": 81
},
{
"epoch": 0.00716704905495466,
"grad_norm": 0.7449392080307007,
"learning_rate": 0.0004966334382651277,
"loss": 1.5825,
"step": 82
},
{
"epoch": 0.0072544520922102045,
"grad_norm": 0.6267126798629761,
"learning_rate": 0.0004965897166841554,
"loss": 1.0257,
"step": 83
},
{
"epoch": 0.007341855129465749,
"grad_norm": 5.416685104370117,
"learning_rate": 0.0004965459951031829,
"loss": 1.0654,
"step": 84
},
{
"epoch": 0.007429258166721293,
"grad_norm": 1.0485210418701172,
"learning_rate": 0.0004965022735222105,
"loss": 0.8979,
"step": 85
},
{
"epoch": 0.007516661203976838,
"grad_norm": 1.0192244052886963,
"learning_rate": 0.0004964585519412381,
"loss": 1.1117,
"step": 86
},
{
"epoch": 0.007604064241232383,
"grad_norm": 0.7042039632797241,
"learning_rate": 0.0004964148303602658,
"loss": 0.9955,
"step": 87
},
{
"epoch": 0.007691467278487928,
"grad_norm": 0.649395227432251,
"learning_rate": 0.0004963711087792935,
"loss": 0.7092,
"step": 88
},
{
"epoch": 0.007778870315743472,
"grad_norm": 0.8017964959144592,
"learning_rate": 0.0004963273871983211,
"loss": 0.8941,
"step": 89
},
{
"epoch": 0.007866273352999017,
"grad_norm": 0.4518626630306244,
"learning_rate": 0.0004962836656173487,
"loss": 0.9088,
"step": 90
},
{
"epoch": 0.007953676390254561,
"grad_norm": 0.4033469259738922,
"learning_rate": 0.0004962399440363763,
"loss": 0.9251,
"step": 91
},
{
"epoch": 0.008041079427510107,
"grad_norm": 0.8128958940505981,
"learning_rate": 0.000496196222455404,
"loss": 0.975,
"step": 92
},
{
"epoch": 0.00812848246476565,
"grad_norm": 3.1504242420196533,
"learning_rate": 0.0004961525008744317,
"loss": 1.5942,
"step": 93
},
{
"epoch": 0.008215885502021195,
"grad_norm": 3.9139645099639893,
"learning_rate": 0.0004961087792934593,
"loss": 1.071,
"step": 94
},
{
"epoch": 0.00830328853927674,
"grad_norm": 0.7689482569694519,
"learning_rate": 0.0004960650577124869,
"loss": 1.038,
"step": 95
},
{
"epoch": 0.008390691576532284,
"grad_norm": 0.5784656405448914,
"learning_rate": 0.0004960213361315145,
"loss": 1.0943,
"step": 96
},
{
"epoch": 0.00847809461378783,
"grad_norm": 0.5716943144798279,
"learning_rate": 0.0004959776145505422,
"loss": 0.8874,
"step": 97
},
{
"epoch": 0.008565497651043374,
"grad_norm": 0.5122077465057373,
"learning_rate": 0.0004959338929695698,
"loss": 0.951,
"step": 98
},
{
"epoch": 0.008652900688298918,
"grad_norm": 0.8700870871543884,
"learning_rate": 0.0004958901713885975,
"loss": 0.9632,
"step": 99
},
{
"epoch": 0.008740303725554463,
"grad_norm": 0.5623646974563599,
"learning_rate": 0.000495846449807625,
"loss": 1.0711,
"step": 100
},
{
"epoch": 0.008827706762810007,
"grad_norm": 0.589887261390686,
"learning_rate": 0.0004958027282266526,
"loss": 0.781,
"step": 101
},
{
"epoch": 0.008915109800065553,
"grad_norm": 1.63577401638031,
"learning_rate": 0.0004957590066456803,
"loss": 0.9118,
"step": 102
},
{
"epoch": 0.009002512837321097,
"grad_norm": 0.7755091786384583,
"learning_rate": 0.0004957152850647079,
"loss": 1.192,
"step": 103
},
{
"epoch": 0.00908991587457664,
"grad_norm": 0.5463851094245911,
"learning_rate": 0.0004956715634837356,
"loss": 0.894,
"step": 104
},
{
"epoch": 0.009177318911832186,
"grad_norm": 0.5253966450691223,
"learning_rate": 0.0004956278419027632,
"loss": 0.9432,
"step": 105
},
{
"epoch": 0.00926472194908773,
"grad_norm": 0.4377374053001404,
"learning_rate": 0.0004955841203217908,
"loss": 1.09,
"step": 106
},
{
"epoch": 0.009352124986343276,
"grad_norm": 0.5025166869163513,
"learning_rate": 0.0004955403987408185,
"loss": 0.9262,
"step": 107
},
{
"epoch": 0.00943952802359882,
"grad_norm": 0.45846027135849,
"learning_rate": 0.0004954966771598461,
"loss": 0.9428,
"step": 108
},
{
"epoch": 0.009526931060854364,
"grad_norm": 0.4219333529472351,
"learning_rate": 0.0004954529555788738,
"loss": 1.026,
"step": 109
},
{
"epoch": 0.00961433409810991,
"grad_norm": 0.5737212896347046,
"learning_rate": 0.0004954092339979014,
"loss": 1.1012,
"step": 110
},
{
"epoch": 0.009701737135365453,
"grad_norm": 0.887387752532959,
"learning_rate": 0.000495365512416929,
"loss": 1.3495,
"step": 111
},
{
"epoch": 0.009789140172620999,
"grad_norm": 0.5145196914672852,
"learning_rate": 0.0004953217908359567,
"loss": 1.0266,
"step": 112
},
{
"epoch": 0.009876543209876543,
"grad_norm": 1.5954936742782593,
"learning_rate": 0.0004952780692549843,
"loss": 1.254,
"step": 113
},
{
"epoch": 0.009963946247132089,
"grad_norm": 0.9585819840431213,
"learning_rate": 0.0004952343476740119,
"loss": 1.4545,
"step": 114
},
{
"epoch": 0.010051349284387633,
"grad_norm": 0.8477827310562134,
"learning_rate": 0.0004951906260930396,
"loss": 0.9454,
"step": 115
},
{
"epoch": 0.010138752321643177,
"grad_norm": 1.2712616920471191,
"learning_rate": 0.0004951469045120672,
"loss": 0.9497,
"step": 116
},
{
"epoch": 0.010226155358898722,
"grad_norm": 0.5731809139251709,
"learning_rate": 0.0004951031829310947,
"loss": 1.0611,
"step": 117
},
{
"epoch": 0.010313558396154266,
"grad_norm": 2.106234550476074,
"learning_rate": 0.0004950594613501224,
"loss": 1.0015,
"step": 118
},
{
"epoch": 0.010400961433409812,
"grad_norm": 0.7425693273544312,
"learning_rate": 0.00049501573976915,
"loss": 1.0588,
"step": 119
},
{
"epoch": 0.010488364470665356,
"grad_norm": 0.5987507700920105,
"learning_rate": 0.0004949720181881777,
"loss": 1.0016,
"step": 120
},
{
"epoch": 0.0105757675079209,
"grad_norm": 0.3802410364151001,
"learning_rate": 0.0004949282966072053,
"loss": 0.9133,
"step": 121
},
{
"epoch": 0.010663170545176445,
"grad_norm": 0.42108240723609924,
"learning_rate": 0.0004948845750262329,
"loss": 0.8675,
"step": 122
},
{
"epoch": 0.01075057358243199,
"grad_norm": 0.6281617879867554,
"learning_rate": 0.0004948408534452606,
"loss": 0.8294,
"step": 123
},
{
"epoch": 0.010837976619687535,
"grad_norm": 0.8346467614173889,
"learning_rate": 0.0004947971318642882,
"loss": 0.8333,
"step": 124
},
{
"epoch": 0.010925379656943079,
"grad_norm": 0.5090304613113403,
"learning_rate": 0.0004947534102833158,
"loss": 1.0423,
"step": 125
},
{
"epoch": 0.011012782694198623,
"grad_norm": 0.39572426676750183,
"learning_rate": 0.0004947096887023435,
"loss": 0.8565,
"step": 126
},
{
"epoch": 0.011100185731454168,
"grad_norm": 1.1466861963272095,
"learning_rate": 0.0004946659671213711,
"loss": 1.4358,
"step": 127
},
{
"epoch": 0.011187588768709712,
"grad_norm": 0.36562782526016235,
"learning_rate": 0.0004946222455403988,
"loss": 0.8373,
"step": 128
},
{
"epoch": 0.011274991805965258,
"grad_norm": 0.49587374925613403,
"learning_rate": 0.0004945785239594264,
"loss": 1.3961,
"step": 129
},
{
"epoch": 0.011362394843220802,
"grad_norm": 0.4852742850780487,
"learning_rate": 0.000494534802378454,
"loss": 1.0804,
"step": 130
},
{
"epoch": 0.011449797880476346,
"grad_norm": 0.4050949215888977,
"learning_rate": 0.0004944910807974817,
"loss": 1.0482,
"step": 131
},
{
"epoch": 0.011537200917731891,
"grad_norm": 0.35284534096717834,
"learning_rate": 0.0004944473592165093,
"loss": 0.9467,
"step": 132
},
{
"epoch": 0.011624603954987435,
"grad_norm": 1.6482305526733398,
"learning_rate": 0.000494403637635537,
"loss": 1.0678,
"step": 133
},
{
"epoch": 0.011712006992242981,
"grad_norm": 1.103427767753601,
"learning_rate": 0.0004943599160545645,
"loss": 0.9495,
"step": 134
},
{
"epoch": 0.011799410029498525,
"grad_norm": 0.45183080434799194,
"learning_rate": 0.0004943161944735921,
"loss": 0.9117,
"step": 135
},
{
"epoch": 0.01188681306675407,
"grad_norm": 0.3565897047519684,
"learning_rate": 0.0004942724728926198,
"loss": 0.8209,
"step": 136
},
{
"epoch": 0.011974216104009614,
"grad_norm": 0.6118256449699402,
"learning_rate": 0.0004942287513116474,
"loss": 1.0973,
"step": 137
},
{
"epoch": 0.012061619141265158,
"grad_norm": 0.40304186940193176,
"learning_rate": 0.0004941850297306751,
"loss": 1.1167,
"step": 138
},
{
"epoch": 0.012149022178520704,
"grad_norm": 0.46548163890838623,
"learning_rate": 0.0004941413081497027,
"loss": 0.9813,
"step": 139
},
{
"epoch": 0.012236425215776248,
"grad_norm": 0.4140109121799469,
"learning_rate": 0.0004940975865687303,
"loss": 0.9859,
"step": 140
},
{
"epoch": 0.012323828253031794,
"grad_norm": 0.7219896912574768,
"learning_rate": 0.0004940538649877579,
"loss": 0.9464,
"step": 141
},
{
"epoch": 0.012411231290287338,
"grad_norm": 1.1531212329864502,
"learning_rate": 0.0004940101434067856,
"loss": 0.9439,
"step": 142
},
{
"epoch": 0.012498634327542881,
"grad_norm": 0.5690356492996216,
"learning_rate": 0.0004939664218258133,
"loss": 0.897,
"step": 143
},
{
"epoch": 0.012586037364798427,
"grad_norm": 4.290929317474365,
"learning_rate": 0.0004939227002448409,
"loss": 0.9462,
"step": 144
},
{
"epoch": 0.012673440402053971,
"grad_norm": 0.8283594250679016,
"learning_rate": 0.0004938789786638685,
"loss": 0.8452,
"step": 145
},
{
"epoch": 0.012760843439309517,
"grad_norm": 0.7647207975387573,
"learning_rate": 0.0004938352570828961,
"loss": 0.8869,
"step": 146
},
{
"epoch": 0.01284824647656506,
"grad_norm": 0.4244186580181122,
"learning_rate": 0.0004937915355019238,
"loss": 1.0727,
"step": 147
},
{
"epoch": 0.012935649513820605,
"grad_norm": 0.6509714722633362,
"learning_rate": 0.0004937478139209514,
"loss": 1.3135,
"step": 148
},
{
"epoch": 0.01302305255107615,
"grad_norm": 0.5276227593421936,
"learning_rate": 0.0004937040923399791,
"loss": 0.9124,
"step": 149
},
{
"epoch": 0.013110455588331694,
"grad_norm": 0.6556555032730103,
"learning_rate": 0.0004936603707590067,
"loss": 1.0882,
"step": 150
},
{
"epoch": 0.01319785862558724,
"grad_norm": 0.5422887802124023,
"learning_rate": 0.0004936166491780342,
"loss": 0.787,
"step": 151
},
{
"epoch": 0.013285261662842784,
"grad_norm": 0.4304672181606293,
"learning_rate": 0.0004935729275970619,
"loss": 0.9496,
"step": 152
},
{
"epoch": 0.013372664700098328,
"grad_norm": 1.1699761152267456,
"learning_rate": 0.0004935292060160895,
"loss": 2.1129,
"step": 153
},
{
"epoch": 0.013460067737353873,
"grad_norm": 2.376859664916992,
"learning_rate": 0.0004934854844351172,
"loss": 1.0353,
"step": 154
},
{
"epoch": 0.013547470774609417,
"grad_norm": 0.6845773458480835,
"learning_rate": 0.0004934417628541448,
"loss": 0.739,
"step": 155
},
{
"epoch": 0.013634873811864963,
"grad_norm": 1.45736563205719,
"learning_rate": 0.0004933980412731724,
"loss": 0.9946,
"step": 156
},
{
"epoch": 0.013722276849120507,
"grad_norm": 0.8025717735290527,
"learning_rate": 0.0004933543196922001,
"loss": 0.7987,
"step": 157
},
{
"epoch": 0.01380967988637605,
"grad_norm": 0.4995729625225067,
"learning_rate": 0.0004933105981112277,
"loss": 0.8258,
"step": 158
},
{
"epoch": 0.013897082923631596,
"grad_norm": 0.3529548645019531,
"learning_rate": 0.0004932668765302554,
"loss": 0.7891,
"step": 159
},
{
"epoch": 0.01398448596088714,
"grad_norm": 0.3970806300640106,
"learning_rate": 0.000493223154949283,
"loss": 0.8748,
"step": 160
},
{
"epoch": 0.014071888998142686,
"grad_norm": 0.46492478251457214,
"learning_rate": 0.0004931794333683106,
"loss": 0.83,
"step": 161
},
{
"epoch": 0.01415929203539823,
"grad_norm": 0.39829567074775696,
"learning_rate": 0.0004931357117873383,
"loss": 0.8678,
"step": 162
},
{
"epoch": 0.014246695072653776,
"grad_norm": 0.44665223360061646,
"learning_rate": 0.0004930919902063659,
"loss": 0.8311,
"step": 163
},
{
"epoch": 0.01433409810990932,
"grad_norm": 0.3569469451904297,
"learning_rate": 0.0004930482686253935,
"loss": 0.7291,
"step": 164
},
{
"epoch": 0.014421501147164863,
"grad_norm": 0.5544111132621765,
"learning_rate": 0.0004930045470444212,
"loss": 0.7815,
"step": 165
},
{
"epoch": 0.014508904184420409,
"grad_norm": 0.350799024105072,
"learning_rate": 0.0004929608254634488,
"loss": 0.7029,
"step": 166
},
{
"epoch": 0.014596307221675953,
"grad_norm": 0.8473671078681946,
"learning_rate": 0.0004929171038824765,
"loss": 0.929,
"step": 167
},
{
"epoch": 0.014683710258931499,
"grad_norm": 0.46682775020599365,
"learning_rate": 0.000492873382301504,
"loss": 0.9511,
"step": 168
},
{
"epoch": 0.014771113296187043,
"grad_norm": 0.40774253010749817,
"learning_rate": 0.0004928296607205316,
"loss": 0.9113,
"step": 169
},
{
"epoch": 0.014858516333442586,
"grad_norm": 0.38683247566223145,
"learning_rate": 0.0004927859391395592,
"loss": 0.8733,
"step": 170
},
{
"epoch": 0.014945919370698132,
"grad_norm": 0.3632119297981262,
"learning_rate": 0.0004927422175585869,
"loss": 0.802,
"step": 171
},
{
"epoch": 0.015033322407953676,
"grad_norm": 0.43275561928749084,
"learning_rate": 0.0004926984959776145,
"loss": 0.869,
"step": 172
},
{
"epoch": 0.015120725445209222,
"grad_norm": 0.34049132466316223,
"learning_rate": 0.0004926547743966422,
"loss": 0.9312,
"step": 173
},
{
"epoch": 0.015208128482464766,
"grad_norm": 0.3519800901412964,
"learning_rate": 0.0004926110528156698,
"loss": 0.9362,
"step": 174
},
{
"epoch": 0.01529553151972031,
"grad_norm": 0.47325399518013,
"learning_rate": 0.0004925673312346974,
"loss": 0.9907,
"step": 175
},
{
"epoch": 0.015382934556975855,
"grad_norm": 0.3297930359840393,
"learning_rate": 0.0004925236096537251,
"loss": 0.9065,
"step": 176
},
{
"epoch": 0.0154703375942314,
"grad_norm": 0.3259631097316742,
"learning_rate": 0.0004924798880727527,
"loss": 0.76,
"step": 177
},
{
"epoch": 0.015557740631486945,
"grad_norm": 0.3202175498008728,
"learning_rate": 0.0004924361664917804,
"loss": 0.8182,
"step": 178
},
{
"epoch": 0.01564514366874249,
"grad_norm": 1.7625497579574585,
"learning_rate": 0.000492392444910808,
"loss": 1.0324,
"step": 179
},
{
"epoch": 0.015732546705998034,
"grad_norm": 0.31030330061912537,
"learning_rate": 0.0004923487233298356,
"loss": 0.7945,
"step": 180
},
{
"epoch": 0.015819949743253577,
"grad_norm": 0.416181743144989,
"learning_rate": 0.0004923050017488633,
"loss": 0.829,
"step": 181
},
{
"epoch": 0.015907352780509122,
"grad_norm": 0.42921754717826843,
"learning_rate": 0.0004922612801678909,
"loss": 0.7401,
"step": 182
},
{
"epoch": 0.015994755817764668,
"grad_norm": 0.2919391989707947,
"learning_rate": 0.0004922175585869186,
"loss": 0.8488,
"step": 183
},
{
"epoch": 0.016082158855020214,
"grad_norm": 0.314208447933197,
"learning_rate": 0.0004921738370059462,
"loss": 0.7946,
"step": 184
},
{
"epoch": 0.016169561892275756,
"grad_norm": 0.503778338432312,
"learning_rate": 0.0004921301154249737,
"loss": 0.8052,
"step": 185
},
{
"epoch": 0.0162569649295313,
"grad_norm": 0.36193403601646423,
"learning_rate": 0.0004920863938440014,
"loss": 0.8649,
"step": 186
},
{
"epoch": 0.016344367966786847,
"grad_norm": 0.631439208984375,
"learning_rate": 0.000492042672263029,
"loss": 0.7121,
"step": 187
},
{
"epoch": 0.01643177100404239,
"grad_norm": 0.3578779399394989,
"learning_rate": 0.0004919989506820567,
"loss": 0.9566,
"step": 188
},
{
"epoch": 0.016519174041297935,
"grad_norm": 0.3394636809825897,
"learning_rate": 0.0004919552291010843,
"loss": 0.7892,
"step": 189
},
{
"epoch": 0.01660657707855348,
"grad_norm": 0.3014313876628876,
"learning_rate": 0.0004919115075201119,
"loss": 0.9773,
"step": 190
},
{
"epoch": 0.016693980115809023,
"grad_norm": 0.464288592338562,
"learning_rate": 0.0004918677859391395,
"loss": 0.8351,
"step": 191
},
{
"epoch": 0.01678138315306457,
"grad_norm": 0.3988270163536072,
"learning_rate": 0.0004918240643581672,
"loss": 0.9227,
"step": 192
},
{
"epoch": 0.016868786190320114,
"grad_norm": 0.3190634250640869,
"learning_rate": 0.0004917803427771949,
"loss": 1.0606,
"step": 193
},
{
"epoch": 0.01695618922757566,
"grad_norm": 0.6769363880157471,
"learning_rate": 0.0004917366211962225,
"loss": 1.0602,
"step": 194
},
{
"epoch": 0.017043592264831202,
"grad_norm": 0.3352043330669403,
"learning_rate": 0.0004916928996152501,
"loss": 0.9759,
"step": 195
},
{
"epoch": 0.017130995302086748,
"grad_norm": 0.32745465636253357,
"learning_rate": 0.0004916491780342777,
"loss": 0.7544,
"step": 196
},
{
"epoch": 0.017218398339342293,
"grad_norm": 0.6321395635604858,
"learning_rate": 0.0004916054564533054,
"loss": 0.6861,
"step": 197
},
{
"epoch": 0.017305801376597835,
"grad_norm": 0.32094526290893555,
"learning_rate": 0.000491561734872333,
"loss": 0.8258,
"step": 198
},
{
"epoch": 0.01739320441385338,
"grad_norm": 0.3911696970462799,
"learning_rate": 0.0004915180132913607,
"loss": 0.9963,
"step": 199
},
{
"epoch": 0.017480607451108927,
"grad_norm": 0.2953476905822754,
"learning_rate": 0.0004914742917103883,
"loss": 0.8456,
"step": 200
},
{
"epoch": 0.017568010488364472,
"grad_norm": 0.3092620372772217,
"learning_rate": 0.0004914305701294158,
"loss": 0.8644,
"step": 201
},
{
"epoch": 0.017655413525620015,
"grad_norm": 0.6630509495735168,
"learning_rate": 0.0004913868485484435,
"loss": 0.9363,
"step": 202
},
{
"epoch": 0.01774281656287556,
"grad_norm": 0.3516843616962433,
"learning_rate": 0.0004913431269674711,
"loss": 1.1422,
"step": 203
},
{
"epoch": 0.017830219600131106,
"grad_norm": 0.43253111839294434,
"learning_rate": 0.0004912994053864988,
"loss": 0.852,
"step": 204
},
{
"epoch": 0.017917622637386648,
"grad_norm": 0.324238657951355,
"learning_rate": 0.0004912556838055264,
"loss": 0.8587,
"step": 205
},
{
"epoch": 0.018005025674642194,
"grad_norm": 0.28279510140419006,
"learning_rate": 0.000491211962224554,
"loss": 1.0088,
"step": 206
},
{
"epoch": 0.01809242871189774,
"grad_norm": 1.4974584579467773,
"learning_rate": 0.0004911682406435817,
"loss": 1.0296,
"step": 207
},
{
"epoch": 0.01817983174915328,
"grad_norm": 0.3786958158016205,
"learning_rate": 0.0004911245190626093,
"loss": 1.0741,
"step": 208
},
{
"epoch": 0.018267234786408827,
"grad_norm": 0.294880747795105,
"learning_rate": 0.0004910807974816369,
"loss": 1.021,
"step": 209
},
{
"epoch": 0.018354637823664373,
"grad_norm": 0.36885932087898254,
"learning_rate": 0.0004910370759006646,
"loss": 0.9023,
"step": 210
},
{
"epoch": 0.01844204086091992,
"grad_norm": 0.37099695205688477,
"learning_rate": 0.0004909933543196922,
"loss": 0.961,
"step": 211
},
{
"epoch": 0.01852944389817546,
"grad_norm": 0.3451802432537079,
"learning_rate": 0.0004909496327387199,
"loss": 0.8744,
"step": 212
},
{
"epoch": 0.018616846935431006,
"grad_norm": 0.34541890025138855,
"learning_rate": 0.0004909059111577475,
"loss": 0.9766,
"step": 213
},
{
"epoch": 0.018704249972686552,
"grad_norm": 0.2827027440071106,
"learning_rate": 0.0004908621895767751,
"loss": 0.8569,
"step": 214
},
{
"epoch": 0.018791653009942094,
"grad_norm": 0.3254356384277344,
"learning_rate": 0.0004908184679958028,
"loss": 0.9091,
"step": 215
},
{
"epoch": 0.01887905604719764,
"grad_norm": 0.29408493638038635,
"learning_rate": 0.0004907747464148304,
"loss": 0.823,
"step": 216
},
{
"epoch": 0.018966459084453186,
"grad_norm": 0.3414423167705536,
"learning_rate": 0.0004907310248338581,
"loss": 0.8197,
"step": 217
},
{
"epoch": 0.019053862121708728,
"grad_norm": 0.33818957209587097,
"learning_rate": 0.0004906873032528857,
"loss": 0.8553,
"step": 218
},
{
"epoch": 0.019141265158964273,
"grad_norm": 0.28477659821510315,
"learning_rate": 0.0004906435816719132,
"loss": 0.9008,
"step": 219
},
{
"epoch": 0.01922866819621982,
"grad_norm": 0.30363160371780396,
"learning_rate": 0.0004905998600909408,
"loss": 0.8077,
"step": 220
},
{
"epoch": 0.019316071233475365,
"grad_norm": 0.5011153221130371,
"learning_rate": 0.0004905561385099685,
"loss": 0.8938,
"step": 221
},
{
"epoch": 0.019403474270730907,
"grad_norm": 0.33721473813056946,
"learning_rate": 0.0004905124169289961,
"loss": 0.7798,
"step": 222
},
{
"epoch": 0.019490877307986453,
"grad_norm": 0.3752390742301941,
"learning_rate": 0.0004904686953480238,
"loss": 0.9064,
"step": 223
},
{
"epoch": 0.019578280345241998,
"grad_norm": 0.32278257608413696,
"learning_rate": 0.0004904249737670514,
"loss": 1.0019,
"step": 224
},
{
"epoch": 0.01966568338249754,
"grad_norm": 0.5604023933410645,
"learning_rate": 0.000490381252186079,
"loss": 0.9579,
"step": 225
},
{
"epoch": 0.019753086419753086,
"grad_norm": 0.26056113839149475,
"learning_rate": 0.0004903375306051067,
"loss": 0.7596,
"step": 226
},
{
"epoch": 0.01984048945700863,
"grad_norm": 0.3333994448184967,
"learning_rate": 0.0004902938090241343,
"loss": 1.0804,
"step": 227
},
{
"epoch": 0.019927892494264177,
"grad_norm": 0.3021886944770813,
"learning_rate": 0.000490250087443162,
"loss": 0.959,
"step": 228
},
{
"epoch": 0.02001529553151972,
"grad_norm": 0.2865878641605377,
"learning_rate": 0.0004902063658621896,
"loss": 0.9816,
"step": 229
},
{
"epoch": 0.020102698568775265,
"grad_norm": 0.2981945276260376,
"learning_rate": 0.0004901626442812172,
"loss": 0.8672,
"step": 230
},
{
"epoch": 0.02019010160603081,
"grad_norm": 0.34836679697036743,
"learning_rate": 0.0004901189227002449,
"loss": 0.9012,
"step": 231
},
{
"epoch": 0.020277504643286353,
"grad_norm": 0.7560614347457886,
"learning_rate": 0.0004900752011192725,
"loss": 1.2521,
"step": 232
},
{
"epoch": 0.0203649076805419,
"grad_norm": 0.2899073362350464,
"learning_rate": 0.0004900314795383002,
"loss": 0.9376,
"step": 233
},
{
"epoch": 0.020452310717797444,
"grad_norm": 0.2944093644618988,
"learning_rate": 0.0004899877579573278,
"loss": 0.9158,
"step": 234
},
{
"epoch": 0.020539713755052987,
"grad_norm": 0.2837924361228943,
"learning_rate": 0.0004899440363763553,
"loss": 0.9397,
"step": 235
},
{
"epoch": 0.020627116792308532,
"grad_norm": 0.3069987893104553,
"learning_rate": 0.000489900314795383,
"loss": 0.9635,
"step": 236
},
{
"epoch": 0.020714519829564078,
"grad_norm": 0.29966363310813904,
"learning_rate": 0.0004898565932144106,
"loss": 0.9103,
"step": 237
},
{
"epoch": 0.020801922866819623,
"grad_norm": 0.3086193799972534,
"learning_rate": 0.0004898128716334383,
"loss": 0.9797,
"step": 238
},
{
"epoch": 0.020889325904075166,
"grad_norm": 0.28495675325393677,
"learning_rate": 0.0004897691500524659,
"loss": 0.8221,
"step": 239
},
{
"epoch": 0.02097672894133071,
"grad_norm": 0.27056995034217834,
"learning_rate": 0.0004897254284714935,
"loss": 0.9584,
"step": 240
},
{
"epoch": 0.021064131978586257,
"grad_norm": 0.2837945818901062,
"learning_rate": 0.0004896817068905211,
"loss": 1.0047,
"step": 241
},
{
"epoch": 0.0211515350158418,
"grad_norm": 0.4288729429244995,
"learning_rate": 0.0004896379853095488,
"loss": 1.3211,
"step": 242
},
{
"epoch": 0.021238938053097345,
"grad_norm": 1.1985094547271729,
"learning_rate": 0.0004895942637285765,
"loss": 1.4015,
"step": 243
},
{
"epoch": 0.02132634109035289,
"grad_norm": 0.3171183466911316,
"learning_rate": 0.0004895505421476041,
"loss": 0.7096,
"step": 244
},
{
"epoch": 0.021413744127608433,
"grad_norm": 3.1765527725219727,
"learning_rate": 0.0004895068205666317,
"loss": 1.5594,
"step": 245
},
{
"epoch": 0.02150114716486398,
"grad_norm": 0.35891321301460266,
"learning_rate": 0.0004894630989856593,
"loss": 1.0663,
"step": 246
},
{
"epoch": 0.021588550202119524,
"grad_norm": 0.7044485807418823,
"learning_rate": 0.000489419377404687,
"loss": 1.4146,
"step": 247
},
{
"epoch": 0.02167595323937507,
"grad_norm": 0.361392617225647,
"learning_rate": 0.0004893756558237146,
"loss": 0.7964,
"step": 248
},
{
"epoch": 0.021763356276630612,
"grad_norm": 0.31394776701927185,
"learning_rate": 0.0004893319342427423,
"loss": 0.8608,
"step": 249
},
{
"epoch": 0.021850759313886157,
"grad_norm": 0.2853809893131256,
"learning_rate": 0.0004892882126617699,
"loss": 0.8628,
"step": 250
},
{
"epoch": 0.021938162351141703,
"grad_norm": 0.3122541904449463,
"learning_rate": 0.0004892444910807975,
"loss": 0.7246,
"step": 251
},
{
"epoch": 0.022025565388397245,
"grad_norm": 12.120355606079102,
"learning_rate": 0.0004892007694998252,
"loss": 1.3082,
"step": 252
},
{
"epoch": 0.02211296842565279,
"grad_norm": 0.3758118450641632,
"learning_rate": 0.0004891570479188527,
"loss": 1.0478,
"step": 253
},
{
"epoch": 0.022200371462908337,
"grad_norm": 1.1910297870635986,
"learning_rate": 0.0004891133263378804,
"loss": 1.2477,
"step": 254
},
{
"epoch": 0.022287774500163882,
"grad_norm": 0.8632226586341858,
"learning_rate": 0.000489069604756908,
"loss": 1.0988,
"step": 255
},
{
"epoch": 0.022375177537419425,
"grad_norm": 0.381533145904541,
"learning_rate": 0.0004890258831759356,
"loss": 0.8892,
"step": 256
},
{
"epoch": 0.02246258057467497,
"grad_norm": 0.43683141469955444,
"learning_rate": 0.0004889821615949633,
"loss": 0.8526,
"step": 257
},
{
"epoch": 0.022549983611930516,
"grad_norm": 0.6212348341941833,
"learning_rate": 0.0004889384400139909,
"loss": 0.9791,
"step": 258
},
{
"epoch": 0.022637386649186058,
"grad_norm": 0.44247013330459595,
"learning_rate": 0.0004888947184330185,
"loss": 1.0408,
"step": 259
},
{
"epoch": 0.022724789686441604,
"grad_norm": 0.5239019989967346,
"learning_rate": 0.0004888509968520462,
"loss": 0.8948,
"step": 260
},
{
"epoch": 0.02281219272369715,
"grad_norm": 0.7413169145584106,
"learning_rate": 0.0004888072752710738,
"loss": 0.7135,
"step": 261
},
{
"epoch": 0.02289959576095269,
"grad_norm": 0.39856553077697754,
"learning_rate": 0.0004887635536901015,
"loss": 0.8587,
"step": 262
},
{
"epoch": 0.022986998798208237,
"grad_norm": 0.534248411655426,
"learning_rate": 0.0004887198321091291,
"loss": 0.9006,
"step": 263
},
{
"epoch": 0.023074401835463783,
"grad_norm": 0.4782329499721527,
"learning_rate": 0.0004886761105281567,
"loss": 0.9292,
"step": 264
},
{
"epoch": 0.02316180487271933,
"grad_norm": 2.2424156665802,
"learning_rate": 0.0004886323889471843,
"loss": 1.1921,
"step": 265
},
{
"epoch": 0.02324920790997487,
"grad_norm": 0.5274596810340881,
"learning_rate": 0.000488588667366212,
"loss": 0.9732,
"step": 266
},
{
"epoch": 0.023336610947230416,
"grad_norm": 1.5465450286865234,
"learning_rate": 0.0004885449457852397,
"loss": 0.9304,
"step": 267
},
{
"epoch": 0.023424013984485962,
"grad_norm": 0.5691818594932556,
"learning_rate": 0.0004885012242042673,
"loss": 0.9713,
"step": 268
},
{
"epoch": 0.023511417021741504,
"grad_norm": 0.7849003672599792,
"learning_rate": 0.0004884575026232948,
"loss": 0.957,
"step": 269
},
{
"epoch": 0.02359882005899705,
"grad_norm": 0.5940591096878052,
"learning_rate": 0.0004884137810423224,
"loss": 0.8786,
"step": 270
},
{
"epoch": 0.023686223096252595,
"grad_norm": 0.592288076877594,
"learning_rate": 0.0004883700594613501,
"loss": 0.8695,
"step": 271
},
{
"epoch": 0.02377362613350814,
"grad_norm": 0.3618888556957245,
"learning_rate": 0.0004883263378803777,
"loss": 0.9204,
"step": 272
},
{
"epoch": 0.023861029170763683,
"grad_norm": 0.5957768559455872,
"learning_rate": 0.0004882826162994054,
"loss": 0.9289,
"step": 273
},
{
"epoch": 0.02394843220801923,
"grad_norm": 2.2828385829925537,
"learning_rate": 0.000488238894718433,
"loss": 0.9809,
"step": 274
},
{
"epoch": 0.024035835245274775,
"grad_norm": 0.5379523634910583,
"learning_rate": 0.00048819517313746066,
"loss": 0.934,
"step": 275
},
{
"epoch": 0.024123238282530317,
"grad_norm": 1.698805809020996,
"learning_rate": 0.00048815145155648826,
"loss": 0.9954,
"step": 276
},
{
"epoch": 0.024210641319785862,
"grad_norm": 4.479689121246338,
"learning_rate": 0.00048810772997551595,
"loss": 1.3687,
"step": 277
},
{
"epoch": 0.024298044357041408,
"grad_norm": 2.58227276802063,
"learning_rate": 0.00048806400839454355,
"loss": 0.9305,
"step": 278
},
{
"epoch": 0.02438544739429695,
"grad_norm": 0.8035925030708313,
"learning_rate": 0.0004880202868135712,
"loss": 1.1649,
"step": 279
},
{
"epoch": 0.024472850431552496,
"grad_norm": 0.560945451259613,
"learning_rate": 0.00048797656523259884,
"loss": 0.7542,
"step": 280
},
{
"epoch": 0.02456025346880804,
"grad_norm": 1.6739729642868042,
"learning_rate": 0.0004879328436516264,
"loss": 1.5675,
"step": 281
},
{
"epoch": 0.024647656506063587,
"grad_norm": 1.0051480531692505,
"learning_rate": 0.0004878891220706541,
"loss": 0.9312,
"step": 282
},
{
"epoch": 0.02473505954331913,
"grad_norm": 0.43883591890335083,
"learning_rate": 0.0004878454004896817,
"loss": 0.9779,
"step": 283
},
{
"epoch": 0.024822462580574675,
"grad_norm": 0.668854832649231,
"learning_rate": 0.00048780167890870936,
"loss": 0.9906,
"step": 284
},
{
"epoch": 0.02490986561783022,
"grad_norm": 2.1563730239868164,
"learning_rate": 0.00048775795732773695,
"loss": 0.9536,
"step": 285
},
{
"epoch": 0.024997268655085763,
"grad_norm": 1.1613394021987915,
"learning_rate": 0.0004877142357467646,
"loss": 0.9793,
"step": 286
},
{
"epoch": 0.02508467169234131,
"grad_norm": 0.5452724695205688,
"learning_rate": 0.00048767051416579224,
"loss": 1.11,
"step": 287
},
{
"epoch": 0.025172074729596854,
"grad_norm": 1.7393804788589478,
"learning_rate": 0.0004876267925848199,
"loss": 1.249,
"step": 288
},
{
"epoch": 0.025259477766852396,
"grad_norm": 15.148497581481934,
"learning_rate": 0.00048758307100384753,
"loss": 1.4897,
"step": 289
},
{
"epoch": 0.025346880804107942,
"grad_norm": 0.8102678060531616,
"learning_rate": 0.0004875393494228751,
"loss": 1.0192,
"step": 290
},
{
"epoch": 0.025434283841363488,
"grad_norm": 3.7395308017730713,
"learning_rate": 0.00048749562784190277,
"loss": 1.05,
"step": 291
},
{
"epoch": 0.025521686878619033,
"grad_norm": 0.6473442316055298,
"learning_rate": 0.0004874519062609304,
"loss": 0.9341,
"step": 292
},
{
"epoch": 0.025609089915874576,
"grad_norm": 1.2162256240844727,
"learning_rate": 0.000487408184679958,
"loss": 0.9426,
"step": 293
},
{
"epoch": 0.02569649295313012,
"grad_norm": 0.7783584594726562,
"learning_rate": 0.0004873644630989857,
"loss": 0.9343,
"step": 294
},
{
"epoch": 0.025783895990385667,
"grad_norm": 0.7198899388313293,
"learning_rate": 0.0004873207415180133,
"loss": 0.89,
"step": 295
},
{
"epoch": 0.02587129902764121,
"grad_norm": 0.6314525604248047,
"learning_rate": 0.00048727701993704094,
"loss": 0.9523,
"step": 296
},
{
"epoch": 0.025958702064896755,
"grad_norm": 3.2664554119110107,
"learning_rate": 0.00048723329835606853,
"loss": 1.3729,
"step": 297
},
{
"epoch": 0.0260461051021523,
"grad_norm": 0.9869332909584045,
"learning_rate": 0.0004871895767750962,
"loss": 0.978,
"step": 298
},
{
"epoch": 0.026133508139407846,
"grad_norm": 0.9169254302978516,
"learning_rate": 0.0004871458551941239,
"loss": 0.7641,
"step": 299
},
{
"epoch": 0.02622091117666339,
"grad_norm": 2.386565685272217,
"learning_rate": 0.00048710213361315147,
"loss": 0.9728,
"step": 300
},
{
"epoch": 0.026308314213918934,
"grad_norm": 2.5879757404327393,
"learning_rate": 0.0004870584120321791,
"loss": 1.0264,
"step": 301
},
{
"epoch": 0.02639571725117448,
"grad_norm": 1.059586763381958,
"learning_rate": 0.0004870146904512067,
"loss": 0.9235,
"step": 302
},
{
"epoch": 0.026483120288430022,
"grad_norm": 1.9793821573257446,
"learning_rate": 0.00048697096887023435,
"loss": 1.5626,
"step": 303
},
{
"epoch": 0.026570523325685567,
"grad_norm": 1.2389543056488037,
"learning_rate": 0.00048692724728926194,
"loss": 0.9666,
"step": 304
},
{
"epoch": 0.026657926362941113,
"grad_norm": 1.1373975276947021,
"learning_rate": 0.00048688352570828964,
"loss": 0.993,
"step": 305
},
{
"epoch": 0.026745329400196655,
"grad_norm": 5.966507434844971,
"learning_rate": 0.0004868398041273173,
"loss": 1.0113,
"step": 306
},
{
"epoch": 0.0268327324374522,
"grad_norm": 1.2714189291000366,
"learning_rate": 0.0004867960825463449,
"loss": 0.9462,
"step": 307
},
{
"epoch": 0.026920135474707747,
"grad_norm": 1.397048830986023,
"learning_rate": 0.0004867523609653725,
"loss": 0.9511,
"step": 308
},
{
"epoch": 0.027007538511963292,
"grad_norm": 1.2888479232788086,
"learning_rate": 0.0004867086393844001,
"loss": 1.014,
"step": 309
},
{
"epoch": 0.027094941549218834,
"grad_norm": 3.5597853660583496,
"learning_rate": 0.0004866649178034278,
"loss": 1.2336,
"step": 310
},
{
"epoch": 0.02718234458647438,
"grad_norm": 1.4104827642440796,
"learning_rate": 0.00048662119622245545,
"loss": 1.0148,
"step": 311
},
{
"epoch": 0.027269747623729926,
"grad_norm": 1.064355492591858,
"learning_rate": 0.00048657747464148305,
"loss": 1.0645,
"step": 312
},
{
"epoch": 0.027357150660985468,
"grad_norm": 0.819186806678772,
"learning_rate": 0.0004865337530605107,
"loss": 0.8948,
"step": 313
},
{
"epoch": 0.027444553698241014,
"grad_norm": 3.036085605621338,
"learning_rate": 0.0004864900314795383,
"loss": 1.1567,
"step": 314
},
{
"epoch": 0.02753195673549656,
"grad_norm": 1.4990466833114624,
"learning_rate": 0.0004864463098985659,
"loss": 0.9445,
"step": 315
},
{
"epoch": 0.0276193597727521,
"grad_norm": 1.889307975769043,
"learning_rate": 0.00048640258831759357,
"loss": 1.1844,
"step": 316
},
{
"epoch": 0.027706762810007647,
"grad_norm": 2.072758913040161,
"learning_rate": 0.0004863588667366212,
"loss": 1.0734,
"step": 317
},
{
"epoch": 0.027794165847263193,
"grad_norm": 2.2393903732299805,
"learning_rate": 0.00048631514515564886,
"loss": 1.1427,
"step": 318
},
{
"epoch": 0.02788156888451874,
"grad_norm": 4.34975528717041,
"learning_rate": 0.00048627142357467645,
"loss": 1.2473,
"step": 319
},
{
"epoch": 0.02796897192177428,
"grad_norm": 2.8603451251983643,
"learning_rate": 0.0004862277019937041,
"loss": 1.1657,
"step": 320
},
{
"epoch": 0.028056374959029826,
"grad_norm": 3.665041923522949,
"learning_rate": 0.0004861839804127317,
"loss": 1.6031,
"step": 321
},
{
"epoch": 0.028143777996285372,
"grad_norm": 3.366703748703003,
"learning_rate": 0.0004861402588317594,
"loss": 1.0769,
"step": 322
},
{
"epoch": 0.028231181033540914,
"grad_norm": 1.470408320426941,
"learning_rate": 0.00048609653725078703,
"loss": 1.2034,
"step": 323
},
{
"epoch": 0.02831858407079646,
"grad_norm": 1.0659921169281006,
"learning_rate": 0.0004860528156698146,
"loss": 0.984,
"step": 324
},
{
"epoch": 0.028405987108052005,
"grad_norm": 4.098123550415039,
"learning_rate": 0.00048600909408884227,
"loss": 1.2241,
"step": 325
},
{
"epoch": 0.02849339014530755,
"grad_norm": 11.896109580993652,
"learning_rate": 0.00048596537250786986,
"loss": 2.0891,
"step": 326
},
{
"epoch": 0.028580793182563093,
"grad_norm": 3.2453126907348633,
"learning_rate": 0.00048592165092689756,
"loss": 1.1273,
"step": 327
},
{
"epoch": 0.02866819621981864,
"grad_norm": 2.6395857334136963,
"learning_rate": 0.00048587792934592515,
"loss": 1.6087,
"step": 328
},
{
"epoch": 0.028755599257074185,
"grad_norm": 2.1530113220214844,
"learning_rate": 0.0004858342077649528,
"loss": 1.2749,
"step": 329
},
{
"epoch": 0.028843002294329727,
"grad_norm": 4.572982311248779,
"learning_rate": 0.00048579048618398044,
"loss": 1.4111,
"step": 330
},
{
"epoch": 0.028930405331585272,
"grad_norm": 3.029306173324585,
"learning_rate": 0.00048574676460300803,
"loss": 1.2926,
"step": 331
},
{
"epoch": 0.029017808368840818,
"grad_norm": 1.7193225622177124,
"learning_rate": 0.0004857030430220357,
"loss": 1.1767,
"step": 332
},
{
"epoch": 0.02910521140609636,
"grad_norm": 10.779121398925781,
"learning_rate": 0.0004856593214410633,
"loss": 1.3369,
"step": 333
},
{
"epoch": 0.029192614443351906,
"grad_norm": 2.478919744491577,
"learning_rate": 0.00048561559986009097,
"loss": 1.093,
"step": 334
},
{
"epoch": 0.02928001748060745,
"grad_norm": 2.2353742122650146,
"learning_rate": 0.00048557187827911856,
"loss": 1.1168,
"step": 335
},
{
"epoch": 0.029367420517862997,
"grad_norm": 2.8225460052490234,
"learning_rate": 0.0004855281566981462,
"loss": 1.3248,
"step": 336
},
{
"epoch": 0.02945482355511854,
"grad_norm": 2.1292366981506348,
"learning_rate": 0.00048548443511717385,
"loss": 1.344,
"step": 337
},
{
"epoch": 0.029542226592374085,
"grad_norm": 7.299522399902344,
"learning_rate": 0.0004854407135362015,
"loss": 1.8145,
"step": 338
},
{
"epoch": 0.02962962962962963,
"grad_norm": 1.5046287775039673,
"learning_rate": 0.00048539699195522914,
"loss": 1.388,
"step": 339
},
{
"epoch": 0.029717032666885173,
"grad_norm": 3.0877699851989746,
"learning_rate": 0.00048535327037425673,
"loss": 1.3291,
"step": 340
},
{
"epoch": 0.02980443570414072,
"grad_norm": 3.4899399280548096,
"learning_rate": 0.0004853095487932844,
"loss": 2.0677,
"step": 341
},
{
"epoch": 0.029891838741396264,
"grad_norm": 11.234345436096191,
"learning_rate": 0.000485265827212312,
"loss": 1.625,
"step": 342
},
{
"epoch": 0.029979241778651806,
"grad_norm": 2.1975765228271484,
"learning_rate": 0.0004852221056313396,
"loss": 1.4517,
"step": 343
},
{
"epoch": 0.030066644815907352,
"grad_norm": 8.629820823669434,
"learning_rate": 0.0004851783840503673,
"loss": 1.5853,
"step": 344
},
{
"epoch": 0.030154047853162898,
"grad_norm": 2.3949103355407715,
"learning_rate": 0.0004851346624693949,
"loss": 1.2549,
"step": 345
},
{
"epoch": 0.030241450890418443,
"grad_norm": 159.31179809570312,
"learning_rate": 0.00048509094088842255,
"loss": 1.5771,
"step": 346
},
{
"epoch": 0.030328853927673986,
"grad_norm": 11.36462688446045,
"learning_rate": 0.00048504721930745014,
"loss": 1.9178,
"step": 347
},
{
"epoch": 0.03041625696492953,
"grad_norm": 7.807027816772461,
"learning_rate": 0.0004850034977264778,
"loss": 1.9789,
"step": 348
},
{
"epoch": 0.030503660002185077,
"grad_norm": 8.663688659667969,
"learning_rate": 0.0004849597761455054,
"loss": 2.0506,
"step": 349
},
{
"epoch": 0.03059106303944062,
"grad_norm": 2.205583095550537,
"learning_rate": 0.00048491605456453307,
"loss": 1.8671,
"step": 350
},
{
"epoch": 0.030678466076696165,
"grad_norm": 3.150911808013916,
"learning_rate": 0.0004848723329835607,
"loss": 1.333,
"step": 351
},
{
"epoch": 0.03076586911395171,
"grad_norm": 4.053075790405273,
"learning_rate": 0.0004848286114025883,
"loss": 1.5273,
"step": 352
},
{
"epoch": 0.030853272151207256,
"grad_norm": 2.823411703109741,
"learning_rate": 0.00048478488982161595,
"loss": 1.4247,
"step": 353
},
{
"epoch": 0.0309406751884628,
"grad_norm": 3.0909945964813232,
"learning_rate": 0.0004847411682406436,
"loss": 1.2206,
"step": 354
},
{
"epoch": 0.031028078225718344,
"grad_norm": 3.38694167137146,
"learning_rate": 0.00048469744665967124,
"loss": 1.3954,
"step": 355
},
{
"epoch": 0.03111548126297389,
"grad_norm": 1.5531120300292969,
"learning_rate": 0.0004846537250786989,
"loss": 1.4665,
"step": 356
},
{
"epoch": 0.031202884300229432,
"grad_norm": 2.2059831619262695,
"learning_rate": 0.0004846100034977265,
"loss": 1.6022,
"step": 357
},
{
"epoch": 0.03129028733748498,
"grad_norm": 5.113000869750977,
"learning_rate": 0.0004845662819167541,
"loss": 1.5966,
"step": 358
},
{
"epoch": 0.03137769037474052,
"grad_norm": 8.374882698059082,
"learning_rate": 0.0004845225603357817,
"loss": 1.7198,
"step": 359
},
{
"epoch": 0.03146509341199607,
"grad_norm": 6.680134296417236,
"learning_rate": 0.00048447883875480936,
"loss": 1.4896,
"step": 360
},
{
"epoch": 0.031552496449251614,
"grad_norm": 4.67073392868042,
"learning_rate": 0.00048443511717383706,
"loss": 1.8682,
"step": 361
},
{
"epoch": 0.03163989948650715,
"grad_norm": 4.780435562133789,
"learning_rate": 0.00048439139559286465,
"loss": 1.7389,
"step": 362
},
{
"epoch": 0.0317273025237627,
"grad_norm": 3.4517061710357666,
"learning_rate": 0.0004843476740118923,
"loss": 1.8797,
"step": 363
},
{
"epoch": 0.031814705561018244,
"grad_norm": 2.4916350841522217,
"learning_rate": 0.0004843039524309199,
"loss": 1.4436,
"step": 364
},
{
"epoch": 0.03190210859827379,
"grad_norm": 3.9899487495422363,
"learning_rate": 0.00048426023084994753,
"loss": 1.5546,
"step": 365
},
{
"epoch": 0.031989511635529336,
"grad_norm": 8.799160957336426,
"learning_rate": 0.0004842165092689752,
"loss": 1.6344,
"step": 366
},
{
"epoch": 0.03207691467278488,
"grad_norm": 2.636903762817383,
"learning_rate": 0.0004841727876880028,
"loss": 1.5937,
"step": 367
},
{
"epoch": 0.03216431771004043,
"grad_norm": 2.600330352783203,
"learning_rate": 0.00048412906610703047,
"loss": 1.5617,
"step": 368
},
{
"epoch": 0.032251720747295966,
"grad_norm": 2.9146833419799805,
"learning_rate": 0.00048408534452605806,
"loss": 2.2708,
"step": 369
},
{
"epoch": 0.03233912378455151,
"grad_norm": 1.6746532917022705,
"learning_rate": 0.0004840416229450857,
"loss": 1.3178,
"step": 370
},
{
"epoch": 0.03242652682180706,
"grad_norm": 2.1965625286102295,
"learning_rate": 0.0004839979013641133,
"loss": 1.2351,
"step": 371
},
{
"epoch": 0.0325139298590626,
"grad_norm": 4.235499858856201,
"learning_rate": 0.000483954179783141,
"loss": 1.8627,
"step": 372
},
{
"epoch": 0.03260133289631815,
"grad_norm": 1.5351746082305908,
"learning_rate": 0.00048391045820216864,
"loss": 1.2413,
"step": 373
},
{
"epoch": 0.032688735933573694,
"grad_norm": 1.5462607145309448,
"learning_rate": 0.00048386673662119623,
"loss": 1.3282,
"step": 374
},
{
"epoch": 0.03277613897082924,
"grad_norm": 2.4433155059814453,
"learning_rate": 0.0004838230150402239,
"loss": 1.3913,
"step": 375
},
{
"epoch": 0.03286354200808478,
"grad_norm": 2.431323528289795,
"learning_rate": 0.00048377929345925146,
"loss": 1.4269,
"step": 376
},
{
"epoch": 0.032950945045340324,
"grad_norm": 1.4146811962127686,
"learning_rate": 0.0004837355718782791,
"loss": 1.225,
"step": 377
},
{
"epoch": 0.03303834808259587,
"grad_norm": 1.0660099983215332,
"learning_rate": 0.00048369185029730675,
"loss": 1.2465,
"step": 378
},
{
"epoch": 0.033125751119851415,
"grad_norm": 16.820344924926758,
"learning_rate": 0.0004836481287163344,
"loss": 1.2228,
"step": 379
},
{
"epoch": 0.03321315415710696,
"grad_norm": 1.6520887613296509,
"learning_rate": 0.00048360440713536204,
"loss": 1.0955,
"step": 380
},
{
"epoch": 0.03330055719436251,
"grad_norm": 3.057648181915283,
"learning_rate": 0.00048356068555438964,
"loss": 1.3929,
"step": 381
},
{
"epoch": 0.033387960231618045,
"grad_norm": 5.74190092086792,
"learning_rate": 0.0004835169639734173,
"loss": 1.3873,
"step": 382
},
{
"epoch": 0.03347536326887359,
"grad_norm": 2.451111078262329,
"learning_rate": 0.0004834732423924449,
"loss": 1.2411,
"step": 383
},
{
"epoch": 0.03356276630612914,
"grad_norm": 7.096491813659668,
"learning_rate": 0.00048342952081147257,
"loss": 1.1512,
"step": 384
},
{
"epoch": 0.03365016934338468,
"grad_norm": 1.7510989904403687,
"learning_rate": 0.0004833857992305002,
"loss": 1.7508,
"step": 385
},
{
"epoch": 0.03373757238064023,
"grad_norm": 1.9392039775848389,
"learning_rate": 0.0004833420776495278,
"loss": 1.3254,
"step": 386
},
{
"epoch": 0.033824975417895774,
"grad_norm": 1.3087763786315918,
"learning_rate": 0.00048329835606855545,
"loss": 1.167,
"step": 387
},
{
"epoch": 0.03391237845515132,
"grad_norm": 1.0963687896728516,
"learning_rate": 0.00048325463448758304,
"loss": 1.193,
"step": 388
},
{
"epoch": 0.03399978149240686,
"grad_norm": 0.7981585264205933,
"learning_rate": 0.00048321091290661074,
"loss": 1.1383,
"step": 389
},
{
"epoch": 0.034087184529662404,
"grad_norm": 0.9217828512191772,
"learning_rate": 0.00048316719132563833,
"loss": 1.0119,
"step": 390
},
{
"epoch": 0.03417458756691795,
"grad_norm": 1.242906093597412,
"learning_rate": 0.000483123469744666,
"loss": 1.1663,
"step": 391
},
{
"epoch": 0.034261990604173495,
"grad_norm": 0.9021317362785339,
"learning_rate": 0.0004830797481636936,
"loss": 1.1384,
"step": 392
},
{
"epoch": 0.03434939364142904,
"grad_norm": 0.9118911623954773,
"learning_rate": 0.0004830360265827212,
"loss": 1.3087,
"step": 393
},
{
"epoch": 0.034436796678684586,
"grad_norm": 1.754934549331665,
"learning_rate": 0.0004829923050017489,
"loss": 1.3614,
"step": 394
},
{
"epoch": 0.03452419971594013,
"grad_norm": 0.8837860822677612,
"learning_rate": 0.0004829485834207765,
"loss": 1.1244,
"step": 395
},
{
"epoch": 0.03461160275319567,
"grad_norm": 2.6078360080718994,
"learning_rate": 0.00048290486183980415,
"loss": 1.0216,
"step": 396
},
{
"epoch": 0.034699005790451216,
"grad_norm": 5.406350135803223,
"learning_rate": 0.00048286114025883174,
"loss": 1.0928,
"step": 397
},
{
"epoch": 0.03478640882770676,
"grad_norm": 2.1140406131744385,
"learning_rate": 0.0004828174186778594,
"loss": 1.1857,
"step": 398
},
{
"epoch": 0.03487381186496231,
"grad_norm": 7.267689228057861,
"learning_rate": 0.00048277369709688703,
"loss": 1.7055,
"step": 399
},
{
"epoch": 0.03496121490221785,
"grad_norm": 1.1019072532653809,
"learning_rate": 0.0004827299755159147,
"loss": 2.0105,
"step": 400
},
{
"epoch": 0.0350486179394734,
"grad_norm": 7.888851165771484,
"learning_rate": 0.0004826862539349423,
"loss": 1.9483,
"step": 401
},
{
"epoch": 0.035136020976728945,
"grad_norm": 1.299735188484192,
"learning_rate": 0.0004826425323539699,
"loss": 1.2644,
"step": 402
},
{
"epoch": 0.03522342401398448,
"grad_norm": 1.5624737739562988,
"learning_rate": 0.00048259881077299756,
"loss": 1.0429,
"step": 403
},
{
"epoch": 0.03531082705124003,
"grad_norm": 1.350966453552246,
"learning_rate": 0.0004825550891920252,
"loss": 1.1749,
"step": 404
},
{
"epoch": 0.035398230088495575,
"grad_norm": 1.5936487913131714,
"learning_rate": 0.0004825113676110528,
"loss": 1.1733,
"step": 405
},
{
"epoch": 0.03548563312575112,
"grad_norm": 1.0757735967636108,
"learning_rate": 0.0004824676460300805,
"loss": 0.9944,
"step": 406
},
{
"epoch": 0.035573036163006666,
"grad_norm": 0.7153262495994568,
"learning_rate": 0.0004824239244491081,
"loss": 1.1921,
"step": 407
},
{
"epoch": 0.03566043920026221,
"grad_norm": 1.0734481811523438,
"learning_rate": 0.00048238020286813573,
"loss": 1.1752,
"step": 408
},
{
"epoch": 0.03574784223751775,
"grad_norm": 0.8831942081451416,
"learning_rate": 0.0004823364812871633,
"loss": 1.1402,
"step": 409
},
{
"epoch": 0.035835245274773296,
"grad_norm": 0.6179252862930298,
"learning_rate": 0.00048229275970619096,
"loss": 1.2101,
"step": 410
},
{
"epoch": 0.03592264831202884,
"grad_norm": 1.091264009475708,
"learning_rate": 0.00048224903812521866,
"loss": 1.1421,
"step": 411
},
{
"epoch": 0.03601005134928439,
"grad_norm": 0.8162115216255188,
"learning_rate": 0.00048220531654424625,
"loss": 1.2952,
"step": 412
},
{
"epoch": 0.03609745438653993,
"grad_norm": 1.0148085355758667,
"learning_rate": 0.0004821615949632739,
"loss": 0.9862,
"step": 413
},
{
"epoch": 0.03618485742379548,
"grad_norm": 0.9712663888931274,
"learning_rate": 0.0004821178733823015,
"loss": 1.1402,
"step": 414
},
{
"epoch": 0.036272260461051024,
"grad_norm": 0.9177207350730896,
"learning_rate": 0.00048207415180132914,
"loss": 1.2027,
"step": 415
},
{
"epoch": 0.03635966349830656,
"grad_norm": 3.5026392936706543,
"learning_rate": 0.0004820304302203567,
"loss": 1.4284,
"step": 416
},
{
"epoch": 0.03644706653556211,
"grad_norm": 1.7483121156692505,
"learning_rate": 0.0004819867086393844,
"loss": 1.2328,
"step": 417
},
{
"epoch": 0.036534469572817654,
"grad_norm": 1.423335075378418,
"learning_rate": 0.00048194298705841207,
"loss": 1.1085,
"step": 418
},
{
"epoch": 0.0366218726100732,
"grad_norm": 13.332382202148438,
"learning_rate": 0.00048189926547743966,
"loss": 1.2456,
"step": 419
},
{
"epoch": 0.036709275647328746,
"grad_norm": 1.2808276414871216,
"learning_rate": 0.0004818555438964673,
"loss": 1.1165,
"step": 420
},
{
"epoch": 0.03679667868458429,
"grad_norm": 1.293886661529541,
"learning_rate": 0.0004818118223154949,
"loss": 1.2171,
"step": 421
},
{
"epoch": 0.03688408172183984,
"grad_norm": 1.1845675706863403,
"learning_rate": 0.0004817681007345226,
"loss": 2.0462,
"step": 422
},
{
"epoch": 0.036971484759095376,
"grad_norm": 0.9728288054466248,
"learning_rate": 0.00048172437915355024,
"loss": 1.143,
"step": 423
},
{
"epoch": 0.03705888779635092,
"grad_norm": 0.816474437713623,
"learning_rate": 0.00048168065757257783,
"loss": 1.2092,
"step": 424
},
{
"epoch": 0.03714629083360647,
"grad_norm": 0.6224190592765808,
"learning_rate": 0.0004816369359916055,
"loss": 1.0575,
"step": 425
},
{
"epoch": 0.03723369387086201,
"grad_norm": 0.6718823313713074,
"learning_rate": 0.00048159321441063307,
"loss": 1.0947,
"step": 426
},
{
"epoch": 0.03732109690811756,
"grad_norm": 0.6595826148986816,
"learning_rate": 0.0004815494928296607,
"loss": 1.4427,
"step": 427
},
{
"epoch": 0.037408499945373104,
"grad_norm": 11.761706352233887,
"learning_rate": 0.00048150577124868836,
"loss": 1.0676,
"step": 428
},
{
"epoch": 0.03749590298262865,
"grad_norm": 0.8342620134353638,
"learning_rate": 0.000481462049667716,
"loss": 1.9127,
"step": 429
},
{
"epoch": 0.03758330601988419,
"grad_norm": 1.1234923601150513,
"learning_rate": 0.00048141832808674365,
"loss": 1.1633,
"step": 430
},
{
"epoch": 0.037670709057139734,
"grad_norm": 1.9076615571975708,
"learning_rate": 0.00048137460650577124,
"loss": 1.0639,
"step": 431
},
{
"epoch": 0.03775811209439528,
"grad_norm": 0.6750392913818359,
"learning_rate": 0.0004813308849247989,
"loss": 0.9955,
"step": 432
},
{
"epoch": 0.037845515131650825,
"grad_norm": 0.6759085655212402,
"learning_rate": 0.0004812871633438265,
"loss": 1.131,
"step": 433
},
{
"epoch": 0.03793291816890637,
"grad_norm": 1.4919787645339966,
"learning_rate": 0.0004812434417628542,
"loss": 1.6338,
"step": 434
},
{
"epoch": 0.03802032120616192,
"grad_norm": 0.8407806754112244,
"learning_rate": 0.0004811997201818818,
"loss": 1.6765,
"step": 435
},
{
"epoch": 0.038107724243417455,
"grad_norm": 0.5378815531730652,
"learning_rate": 0.0004811559986009094,
"loss": 1.1115,
"step": 436
},
{
"epoch": 0.038195127280673,
"grad_norm": 0.705746054649353,
"learning_rate": 0.00048111227701993706,
"loss": 0.8717,
"step": 437
},
{
"epoch": 0.03828253031792855,
"grad_norm": 0.6170596480369568,
"learning_rate": 0.00048106855543896465,
"loss": 1.113,
"step": 438
},
{
"epoch": 0.03836993335518409,
"grad_norm": 0.7694591283798218,
"learning_rate": 0.00048102483385799235,
"loss": 0.9803,
"step": 439
},
{
"epoch": 0.03845733639243964,
"grad_norm": 0.44214290380477905,
"learning_rate": 0.00048098111227701994,
"loss": 1.0997,
"step": 440
},
{
"epoch": 0.038544739429695184,
"grad_norm": 1.67384934425354,
"learning_rate": 0.0004809373906960476,
"loss": 1.473,
"step": 441
},
{
"epoch": 0.03863214246695073,
"grad_norm": 0.906971275806427,
"learning_rate": 0.00048089366911507523,
"loss": 1.4701,
"step": 442
},
{
"epoch": 0.03871954550420627,
"grad_norm": 1.0720627307891846,
"learning_rate": 0.0004808499475341028,
"loss": 1.2818,
"step": 443
},
{
"epoch": 0.038806948541461814,
"grad_norm": 0.9048315286636353,
"learning_rate": 0.00048080622595313046,
"loss": 1.0395,
"step": 444
},
{
"epoch": 0.03889435157871736,
"grad_norm": 0.6810390949249268,
"learning_rate": 0.0004807625043721581,
"loss": 0.9983,
"step": 445
},
{
"epoch": 0.038981754615972905,
"grad_norm": 2.8892154693603516,
"learning_rate": 0.00048071878279118575,
"loss": 1.4023,
"step": 446
},
{
"epoch": 0.03906915765322845,
"grad_norm": 2.2658865451812744,
"learning_rate": 0.00048067506121021335,
"loss": 1.2289,
"step": 447
},
{
"epoch": 0.039156560690483996,
"grad_norm": 0.6239084005355835,
"learning_rate": 0.000480631339629241,
"loss": 1.012,
"step": 448
},
{
"epoch": 0.03924396372773954,
"grad_norm": 1.147459864616394,
"learning_rate": 0.00048058761804826864,
"loss": 1.0538,
"step": 449
},
{
"epoch": 0.03933136676499508,
"grad_norm": 0.8646839261054993,
"learning_rate": 0.0004805438964672963,
"loss": 0.965,
"step": 450
},
{
"epoch": 0.039418769802250626,
"grad_norm": 0.9366894960403442,
"learning_rate": 0.0004805001748863239,
"loss": 0.8447,
"step": 451
},
{
"epoch": 0.03950617283950617,
"grad_norm": 0.6512202024459839,
"learning_rate": 0.0004804564533053515,
"loss": 1.0594,
"step": 452
},
{
"epoch": 0.03959357587676172,
"grad_norm": 0.5651702284812927,
"learning_rate": 0.00048041273172437916,
"loss": 1.1249,
"step": 453
},
{
"epoch": 0.03968097891401726,
"grad_norm": 1.0038714408874512,
"learning_rate": 0.0004803690101434068,
"loss": 1.1198,
"step": 454
},
{
"epoch": 0.03976838195127281,
"grad_norm": 1.0579853057861328,
"learning_rate": 0.0004803252885624344,
"loss": 1.0889,
"step": 455
},
{
"epoch": 0.039855784988528355,
"grad_norm": 0.4361538887023926,
"learning_rate": 0.0004802815669814621,
"loss": 0.876,
"step": 456
},
{
"epoch": 0.03994318802578389,
"grad_norm": 0.8685644865036011,
"learning_rate": 0.0004802378454004897,
"loss": 0.8344,
"step": 457
},
{
"epoch": 0.04003059106303944,
"grad_norm": 0.5350561141967773,
"learning_rate": 0.00048019412381951733,
"loss": 1.0352,
"step": 458
},
{
"epoch": 0.040117994100294985,
"grad_norm": 0.7722122669219971,
"learning_rate": 0.0004801504022385449,
"loss": 0.9144,
"step": 459
},
{
"epoch": 0.04020539713755053,
"grad_norm": 0.5645512938499451,
"learning_rate": 0.00048010668065757257,
"loss": 0.9014,
"step": 460
},
{
"epoch": 0.040292800174806076,
"grad_norm": 0.5366953015327454,
"learning_rate": 0.00048006295907660027,
"loss": 1.005,
"step": 461
},
{
"epoch": 0.04038020321206162,
"grad_norm": 0.5673419237136841,
"learning_rate": 0.00048001923749562786,
"loss": 0.9666,
"step": 462
},
{
"epoch": 0.04046760624931716,
"grad_norm": 0.5309872031211853,
"learning_rate": 0.0004799755159146555,
"loss": 1.017,
"step": 463
},
{
"epoch": 0.040555009286572706,
"grad_norm": 0.567584753036499,
"learning_rate": 0.0004799317943336831,
"loss": 0.9212,
"step": 464
},
{
"epoch": 0.04064241232382825,
"grad_norm": 0.5049634575843811,
"learning_rate": 0.00047988807275271074,
"loss": 1.0515,
"step": 465
},
{
"epoch": 0.0407298153610838,
"grad_norm": 0.5385315418243408,
"learning_rate": 0.00047984435117173833,
"loss": 1.1727,
"step": 466
},
{
"epoch": 0.04081721839833934,
"grad_norm": 0.4884001910686493,
"learning_rate": 0.00047980062959076603,
"loss": 1.1159,
"step": 467
},
{
"epoch": 0.04090462143559489,
"grad_norm": 0.7112920880317688,
"learning_rate": 0.0004797569080097937,
"loss": 1.235,
"step": 468
},
{
"epoch": 0.040992024472850434,
"grad_norm": 0.4838173985481262,
"learning_rate": 0.00047971318642882127,
"loss": 0.9681,
"step": 469
},
{
"epoch": 0.04107942751010597,
"grad_norm": 0.45457422733306885,
"learning_rate": 0.0004796694648478489,
"loss": 1.1104,
"step": 470
},
{
"epoch": 0.04116683054736152,
"grad_norm": 0.5703690648078918,
"learning_rate": 0.0004796257432668765,
"loss": 1.1248,
"step": 471
},
{
"epoch": 0.041254233584617064,
"grad_norm": 0.450735479593277,
"learning_rate": 0.00047958202168590415,
"loss": 0.8925,
"step": 472
},
{
"epoch": 0.04134163662187261,
"grad_norm": 0.5150513052940369,
"learning_rate": 0.00047953830010493185,
"loss": 1.3525,
"step": 473
},
{
"epoch": 0.041429039659128156,
"grad_norm": 0.3937002718448639,
"learning_rate": 0.00047949457852395944,
"loss": 0.9275,
"step": 474
},
{
"epoch": 0.0415164426963837,
"grad_norm": 0.3689919114112854,
"learning_rate": 0.0004794508569429871,
"loss": 1.0588,
"step": 475
},
{
"epoch": 0.04160384573363925,
"grad_norm": 0.34137895703315735,
"learning_rate": 0.0004794071353620147,
"loss": 1.0148,
"step": 476
},
{
"epoch": 0.041691248770894786,
"grad_norm": 0.33478084206581116,
"learning_rate": 0.0004793634137810423,
"loss": 1.1783,
"step": 477
},
{
"epoch": 0.04177865180815033,
"grad_norm": 0.36996185779571533,
"learning_rate": 0.00047931969220006996,
"loss": 0.9166,
"step": 478
},
{
"epoch": 0.04186605484540588,
"grad_norm": 0.40458017587661743,
"learning_rate": 0.0004792759706190976,
"loss": 1.039,
"step": 479
},
{
"epoch": 0.04195345788266142,
"grad_norm": 0.5270059704780579,
"learning_rate": 0.00047923224903812525,
"loss": 0.9331,
"step": 480
},
{
"epoch": 0.04204086091991697,
"grad_norm": 0.38086146116256714,
"learning_rate": 0.00047918852745715285,
"loss": 1.2488,
"step": 481
},
{
"epoch": 0.042128263957172514,
"grad_norm": 0.4206714332103729,
"learning_rate": 0.0004791448058761805,
"loss": 0.9509,
"step": 482
},
{
"epoch": 0.04221566699442806,
"grad_norm": 0.45416519045829773,
"learning_rate": 0.0004791010842952081,
"loss": 1.0384,
"step": 483
},
{
"epoch": 0.0423030700316836,
"grad_norm": 0.312229722738266,
"learning_rate": 0.0004790573627142358,
"loss": 1.0349,
"step": 484
},
{
"epoch": 0.042390473068939144,
"grad_norm": 0.4084686040878296,
"learning_rate": 0.0004790136411332634,
"loss": 0.9074,
"step": 485
},
{
"epoch": 0.04247787610619469,
"grad_norm": 12.558296203613281,
"learning_rate": 0.000478969919552291,
"loss": 1.4943,
"step": 486
},
{
"epoch": 0.042565279143450235,
"grad_norm": 0.5897109508514404,
"learning_rate": 0.00047892619797131866,
"loss": 1.0668,
"step": 487
},
{
"epoch": 0.04265268218070578,
"grad_norm": 0.6350471377372742,
"learning_rate": 0.00047888247639034625,
"loss": 0.9479,
"step": 488
},
{
"epoch": 0.04274008521796133,
"grad_norm": 0.4891508221626282,
"learning_rate": 0.00047883875480937395,
"loss": 1.1157,
"step": 489
},
{
"epoch": 0.042827488255216865,
"grad_norm": 0.3619961142539978,
"learning_rate": 0.00047879503322840154,
"loss": 0.9912,
"step": 490
},
{
"epoch": 0.04291489129247241,
"grad_norm": 0.3376581072807312,
"learning_rate": 0.0004787513116474292,
"loss": 0.8494,
"step": 491
},
{
"epoch": 0.04300229432972796,
"grad_norm": 0.6040793061256409,
"learning_rate": 0.00047870759006645683,
"loss": 1.3237,
"step": 492
},
{
"epoch": 0.0430896973669835,
"grad_norm": 2.6606392860412598,
"learning_rate": 0.0004786638684854844,
"loss": 1.7359,
"step": 493
},
{
"epoch": 0.04317710040423905,
"grad_norm": 0.5396057367324829,
"learning_rate": 0.00047862014690451207,
"loss": 1.552,
"step": 494
},
{
"epoch": 0.043264503441494594,
"grad_norm": 0.42991939187049866,
"learning_rate": 0.0004785764253235397,
"loss": 0.99,
"step": 495
},
{
"epoch": 0.04335190647875014,
"grad_norm": 0.40487632155418396,
"learning_rate": 0.00047853270374256736,
"loss": 1.0104,
"step": 496
},
{
"epoch": 0.04343930951600568,
"grad_norm": 0.9767838716506958,
"learning_rate": 0.00047848898216159495,
"loss": 1.0582,
"step": 497
},
{
"epoch": 0.043526712553261224,
"grad_norm": 0.3633114695549011,
"learning_rate": 0.0004784452605806226,
"loss": 0.92,
"step": 498
},
{
"epoch": 0.04361411559051677,
"grad_norm": 0.6365157961845398,
"learning_rate": 0.00047840153899965024,
"loss": 0.9564,
"step": 499
},
{
"epoch": 0.043701518627772315,
"grad_norm": 0.4060046076774597,
"learning_rate": 0.00047835781741867783,
"loss": 1.046,
"step": 500
},
{
"epoch": 0.04378892166502786,
"grad_norm": 0.3747900128364563,
"learning_rate": 0.00047831409583770553,
"loss": 1.0201,
"step": 501
},
{
"epoch": 0.043876324702283406,
"grad_norm": 0.3672393262386322,
"learning_rate": 0.0004782703742567331,
"loss": 1.0021,
"step": 502
},
{
"epoch": 0.04396372773953895,
"grad_norm": 0.3505338132381439,
"learning_rate": 0.00047822665267576077,
"loss": 1.0002,
"step": 503
},
{
"epoch": 0.04405113077679449,
"grad_norm": 5.722542762756348,
"learning_rate": 0.0004781829310947884,
"loss": 2.5431,
"step": 504
},
{
"epoch": 0.044138533814050036,
"grad_norm": 0.5349693298339844,
"learning_rate": 0.000478139209513816,
"loss": 1.151,
"step": 505
},
{
"epoch": 0.04422593685130558,
"grad_norm": 0.4468895494937897,
"learning_rate": 0.0004780954879328437,
"loss": 0.9958,
"step": 506
},
{
"epoch": 0.04431333988856113,
"grad_norm": 0.47205036878585815,
"learning_rate": 0.0004780517663518713,
"loss": 0.9401,
"step": 507
},
{
"epoch": 0.04440074292581667,
"grad_norm": 0.35336941480636597,
"learning_rate": 0.00047800804477089894,
"loss": 1.0982,
"step": 508
},
{
"epoch": 0.04448814596307222,
"grad_norm": 1.8884743452072144,
"learning_rate": 0.00047796432318992653,
"loss": 0.9199,
"step": 509
},
{
"epoch": 0.044575549000327765,
"grad_norm": 0.4091229736804962,
"learning_rate": 0.0004779206016089542,
"loss": 0.8953,
"step": 510
},
{
"epoch": 0.0446629520375833,
"grad_norm": 0.4730583131313324,
"learning_rate": 0.0004778768800279818,
"loss": 0.8085,
"step": 511
},
{
"epoch": 0.04475035507483885,
"grad_norm": 0.3801075220108032,
"learning_rate": 0.00047783315844700946,
"loss": 0.9914,
"step": 512
},
{
"epoch": 0.044837758112094395,
"grad_norm": 0.3660631477832794,
"learning_rate": 0.0004777894368660371,
"loss": 0.9804,
"step": 513
},
{
"epoch": 0.04492516114934994,
"grad_norm": 0.8466418981552124,
"learning_rate": 0.0004777457152850647,
"loss": 1.1207,
"step": 514
},
{
"epoch": 0.045012564186605486,
"grad_norm": 0.3560774624347687,
"learning_rate": 0.00047770199370409234,
"loss": 0.8773,
"step": 515
},
{
"epoch": 0.04509996722386103,
"grad_norm": 0.49633318185806274,
"learning_rate": 0.00047765827212312,
"loss": 1.0111,
"step": 516
},
{
"epoch": 0.04518737026111657,
"grad_norm": 0.6001185178756714,
"learning_rate": 0.00047761455054214764,
"loss": 1.2566,
"step": 517
},
{
"epoch": 0.045274773298372116,
"grad_norm": 0.7423095703125,
"learning_rate": 0.0004775708289611753,
"loss": 1.1431,
"step": 518
},
{
"epoch": 0.04536217633562766,
"grad_norm": 0.34218892455101013,
"learning_rate": 0.00047752710738020287,
"loss": 0.9254,
"step": 519
},
{
"epoch": 0.04544957937288321,
"grad_norm": 0.336230605840683,
"learning_rate": 0.0004774833857992305,
"loss": 1.0015,
"step": 520
},
{
"epoch": 0.04553698241013875,
"grad_norm": 0.39158111810684204,
"learning_rate": 0.0004774396642182581,
"loss": 0.8319,
"step": 521
},
{
"epoch": 0.0456243854473943,
"grad_norm": 0.4045357406139374,
"learning_rate": 0.00047739594263728575,
"loss": 0.8531,
"step": 522
},
{
"epoch": 0.045711788484649844,
"grad_norm": 0.5861966013908386,
"learning_rate": 0.00047735222105631345,
"loss": 0.9975,
"step": 523
},
{
"epoch": 0.04579919152190538,
"grad_norm": 0.33865249156951904,
"learning_rate": 0.00047730849947534104,
"loss": 0.94,
"step": 524
},
{
"epoch": 0.04588659455916093,
"grad_norm": 0.4759502112865448,
"learning_rate": 0.0004772647778943687,
"loss": 0.9581,
"step": 525
},
{
"epoch": 0.045973997596416474,
"grad_norm": 0.492929607629776,
"learning_rate": 0.0004772210563133963,
"loss": 1.3563,
"step": 526
},
{
"epoch": 0.04606140063367202,
"grad_norm": 0.31947705149650574,
"learning_rate": 0.0004771773347324239,
"loss": 0.8052,
"step": 527
},
{
"epoch": 0.046148803670927566,
"grad_norm": 0.3842394948005676,
"learning_rate": 0.0004771336131514515,
"loss": 0.9723,
"step": 528
},
{
"epoch": 0.04623620670818311,
"grad_norm": 0.338451623916626,
"learning_rate": 0.0004770898915704792,
"loss": 1.0315,
"step": 529
},
{
"epoch": 0.04632360974543866,
"grad_norm": 1.9640684127807617,
"learning_rate": 0.00047704616998950686,
"loss": 1.2013,
"step": 530
},
{
"epoch": 0.046411012782694196,
"grad_norm": 0.501758337020874,
"learning_rate": 0.00047700244840853445,
"loss": 1.0096,
"step": 531
},
{
"epoch": 0.04649841581994974,
"grad_norm": 0.5867491960525513,
"learning_rate": 0.0004769587268275621,
"loss": 0.9708,
"step": 532
},
{
"epoch": 0.04658581885720529,
"grad_norm": 2.1122539043426514,
"learning_rate": 0.0004769150052465897,
"loss": 0.8145,
"step": 533
},
{
"epoch": 0.04667322189446083,
"grad_norm": 0.7969621419906616,
"learning_rate": 0.0004768712836656174,
"loss": 0.829,
"step": 534
},
{
"epoch": 0.04676062493171638,
"grad_norm": 0.4205247461795807,
"learning_rate": 0.00047682756208464503,
"loss": 1.0063,
"step": 535
},
{
"epoch": 0.046848027968971924,
"grad_norm": 0.3231610059738159,
"learning_rate": 0.0004767838405036726,
"loss": 0.968,
"step": 536
},
{
"epoch": 0.04693543100622747,
"grad_norm": 1.369025707244873,
"learning_rate": 0.00047674011892270027,
"loss": 1.7445,
"step": 537
},
{
"epoch": 0.04702283404348301,
"grad_norm": 0.42706942558288574,
"learning_rate": 0.00047669639734172786,
"loss": 1.1781,
"step": 538
},
{
"epoch": 0.047110237080738554,
"grad_norm": 0.36257731914520264,
"learning_rate": 0.0004766526757607555,
"loss": 1.0557,
"step": 539
},
{
"epoch": 0.0471976401179941,
"grad_norm": 0.4783022105693817,
"learning_rate": 0.00047660895417978315,
"loss": 1.053,
"step": 540
},
{
"epoch": 0.047285043155249645,
"grad_norm": 0.3079909384250641,
"learning_rate": 0.0004765652325988108,
"loss": 1.1313,
"step": 541
},
{
"epoch": 0.04737244619250519,
"grad_norm": 0.4072510302066803,
"learning_rate": 0.00047652151101783844,
"loss": 0.8678,
"step": 542
},
{
"epoch": 0.04745984922976074,
"grad_norm": 0.36985546350479126,
"learning_rate": 0.00047647778943686603,
"loss": 0.9387,
"step": 543
},
{
"epoch": 0.04754725226701628,
"grad_norm": 0.4222630262374878,
"learning_rate": 0.0004764340678558937,
"loss": 0.9083,
"step": 544
},
{
"epoch": 0.04763465530427182,
"grad_norm": 0.39896291494369507,
"learning_rate": 0.0004763903462749213,
"loss": 0.9773,
"step": 545
},
{
"epoch": 0.04772205834152737,
"grad_norm": 0.3235687017440796,
"learning_rate": 0.00047634662469394896,
"loss": 0.9484,
"step": 546
},
{
"epoch": 0.04780946137878291,
"grad_norm": 0.3377327620983124,
"learning_rate": 0.0004763029031129766,
"loss": 0.9319,
"step": 547
},
{
"epoch": 0.04789686441603846,
"grad_norm": 0.37998026609420776,
"learning_rate": 0.0004762591815320042,
"loss": 1.3499,
"step": 548
},
{
"epoch": 0.047984267453294004,
"grad_norm": 0.37219107151031494,
"learning_rate": 0.00047621545995103184,
"loss": 1.1132,
"step": 549
},
{
"epoch": 0.04807167049054955,
"grad_norm": 0.3147220313549042,
"learning_rate": 0.00047617173837005944,
"loss": 0.9306,
"step": 550
},
{
"epoch": 0.04815907352780509,
"grad_norm": 0.3832624852657318,
"learning_rate": 0.00047612801678908713,
"loss": 0.8518,
"step": 551
},
{
"epoch": 0.048246476565060634,
"grad_norm": 0.3098907172679901,
"learning_rate": 0.0004760842952081147,
"loss": 0.8183,
"step": 552
},
{
"epoch": 0.04833387960231618,
"grad_norm": 0.3062676191329956,
"learning_rate": 0.00047604057362714237,
"loss": 0.9226,
"step": 553
},
{
"epoch": 0.048421282639571725,
"grad_norm": 0.3292568624019623,
"learning_rate": 0.00047599685204617,
"loss": 0.9204,
"step": 554
},
{
"epoch": 0.04850868567682727,
"grad_norm": 0.45942652225494385,
"learning_rate": 0.0004759531304651976,
"loss": 1.1571,
"step": 555
},
{
"epoch": 0.048596088714082816,
"grad_norm": 0.3519571125507355,
"learning_rate": 0.00047590940888422525,
"loss": 0.9566,
"step": 556
},
{
"epoch": 0.04868349175133836,
"grad_norm": 0.3418327569961548,
"learning_rate": 0.0004758656873032529,
"loss": 1.146,
"step": 557
},
{
"epoch": 0.0487708947885939,
"grad_norm": 0.3338674008846283,
"learning_rate": 0.00047582196572228054,
"loss": 1.0859,
"step": 558
},
{
"epoch": 0.048858297825849446,
"grad_norm": 1.2700949907302856,
"learning_rate": 0.00047577824414130813,
"loss": 1.3166,
"step": 559
},
{
"epoch": 0.04894570086310499,
"grad_norm": 0.706069827079773,
"learning_rate": 0.0004757345225603358,
"loss": 1.2259,
"step": 560
},
{
"epoch": 0.04903310390036054,
"grad_norm": 0.5171198844909668,
"learning_rate": 0.0004756908009793634,
"loss": 0.7985,
"step": 561
},
{
"epoch": 0.04912050693761608,
"grad_norm": 0.8621017932891846,
"learning_rate": 0.00047564707939839107,
"loss": 1.0042,
"step": 562
},
{
"epoch": 0.04920790997487163,
"grad_norm": 0.926487922668457,
"learning_rate": 0.0004756033578174187,
"loss": 0.9945,
"step": 563
},
{
"epoch": 0.049295313012127175,
"grad_norm": 0.9586560726165771,
"learning_rate": 0.0004755596362364463,
"loss": 1.5266,
"step": 564
},
{
"epoch": 0.04938271604938271,
"grad_norm": 0.507824182510376,
"learning_rate": 0.00047551591465547395,
"loss": 0.8737,
"step": 565
},
{
"epoch": 0.04947011908663826,
"grad_norm": 0.38291049003601074,
"learning_rate": 0.0004754721930745016,
"loss": 0.7636,
"step": 566
},
{
"epoch": 0.049557522123893805,
"grad_norm": 0.40479573607444763,
"learning_rate": 0.0004754284714935292,
"loss": 0.781,
"step": 567
},
{
"epoch": 0.04964492516114935,
"grad_norm": 0.6375040411949158,
"learning_rate": 0.0004753847499125569,
"loss": 1.1493,
"step": 568
},
{
"epoch": 0.049732328198404896,
"grad_norm": 0.3949948847293854,
"learning_rate": 0.0004753410283315845,
"loss": 0.9626,
"step": 569
},
{
"epoch": 0.04981973123566044,
"grad_norm": 0.3734526038169861,
"learning_rate": 0.0004752973067506121,
"loss": 0.9207,
"step": 570
},
{
"epoch": 0.04990713427291599,
"grad_norm": 0.5179705619812012,
"learning_rate": 0.0004752535851696397,
"loss": 1.3906,
"step": 571
},
{
"epoch": 0.049994537310171526,
"grad_norm": 0.4602389931678772,
"learning_rate": 0.00047520986358866736,
"loss": 1.0577,
"step": 572
},
{
"epoch": 0.05008194034742707,
"grad_norm": 0.30401960015296936,
"learning_rate": 0.00047516614200769506,
"loss": 1.13,
"step": 573
},
{
"epoch": 0.05016934338468262,
"grad_norm": 0.3481753170490265,
"learning_rate": 0.00047512242042672265,
"loss": 0.857,
"step": 574
},
{
"epoch": 0.05025674642193816,
"grad_norm": 0.4005964398384094,
"learning_rate": 0.0004750786988457503,
"loss": 0.9569,
"step": 575
},
{
"epoch": 0.05034414945919371,
"grad_norm": 0.43765851855278015,
"learning_rate": 0.0004750349772647779,
"loss": 1.2156,
"step": 576
},
{
"epoch": 0.050431552496449254,
"grad_norm": 0.3252186179161072,
"learning_rate": 0.00047499125568380553,
"loss": 1.0392,
"step": 577
},
{
"epoch": 0.05051895553370479,
"grad_norm": 0.3639061152935028,
"learning_rate": 0.0004749475341028331,
"loss": 0.914,
"step": 578
},
{
"epoch": 0.05060635857096034,
"grad_norm": 0.3080824911594391,
"learning_rate": 0.0004749038125218608,
"loss": 0.9735,
"step": 579
},
{
"epoch": 0.050693761608215884,
"grad_norm": 0.33566662669181824,
"learning_rate": 0.00047486009094088846,
"loss": 1.1619,
"step": 580
},
{
"epoch": 0.05078116464547143,
"grad_norm": 0.2990110218524933,
"learning_rate": 0.00047481636935991605,
"loss": 0.97,
"step": 581
},
{
"epoch": 0.050868567682726976,
"grad_norm": 0.3264564871788025,
"learning_rate": 0.0004747726477789437,
"loss": 0.824,
"step": 582
},
{
"epoch": 0.05095597071998252,
"grad_norm": 0.37740233540534973,
"learning_rate": 0.0004747289261979713,
"loss": 1.1715,
"step": 583
},
{
"epoch": 0.05104337375723807,
"grad_norm": 0.39894765615463257,
"learning_rate": 0.00047468520461699894,
"loss": 1.3263,
"step": 584
},
{
"epoch": 0.051130776794493606,
"grad_norm": 0.3279603123664856,
"learning_rate": 0.00047464148303602663,
"loss": 0.8633,
"step": 585
},
{
"epoch": 0.05121817983174915,
"grad_norm": 0.30895987153053284,
"learning_rate": 0.0004745977614550542,
"loss": 0.9019,
"step": 586
},
{
"epoch": 0.0513055828690047,
"grad_norm": 0.8510332703590393,
"learning_rate": 0.00047455403987408187,
"loss": 0.9492,
"step": 587
},
{
"epoch": 0.05139298590626024,
"grad_norm": 0.5336425304412842,
"learning_rate": 0.00047451031829310946,
"loss": 0.8209,
"step": 588
},
{
"epoch": 0.05148038894351579,
"grad_norm": 0.3380926847457886,
"learning_rate": 0.0004744665967121371,
"loss": 0.8024,
"step": 589
},
{
"epoch": 0.051567791980771334,
"grad_norm": 0.3537689447402954,
"learning_rate": 0.00047442287513116475,
"loss": 1.1219,
"step": 590
},
{
"epoch": 0.05165519501802688,
"grad_norm": 0.5417413711547852,
"learning_rate": 0.0004743791535501924,
"loss": 1.0341,
"step": 591
},
{
"epoch": 0.05174259805528242,
"grad_norm": 0.4394038915634155,
"learning_rate": 0.00047433543196922004,
"loss": 0.934,
"step": 592
},
{
"epoch": 0.051830001092537964,
"grad_norm": 0.738370954990387,
"learning_rate": 0.00047429171038824763,
"loss": 1.1953,
"step": 593
},
{
"epoch": 0.05191740412979351,
"grad_norm": 0.33024734258651733,
"learning_rate": 0.0004742479888072753,
"loss": 0.687,
"step": 594
},
{
"epoch": 0.052004807167049055,
"grad_norm": 0.3696803152561188,
"learning_rate": 0.00047420426722630287,
"loss": 1.0533,
"step": 595
},
{
"epoch": 0.0520922102043046,
"grad_norm": 0.31398460268974304,
"learning_rate": 0.00047416054564533057,
"loss": 1.0434,
"step": 596
},
{
"epoch": 0.05217961324156015,
"grad_norm": 0.3482360541820526,
"learning_rate": 0.0004741168240643582,
"loss": 1.2415,
"step": 597
},
{
"epoch": 0.05226701627881569,
"grad_norm": 0.32207486033439636,
"learning_rate": 0.0004740731024833858,
"loss": 1.1465,
"step": 598
},
{
"epoch": 0.05235441931607123,
"grad_norm": 0.2964969277381897,
"learning_rate": 0.00047402938090241345,
"loss": 0.8746,
"step": 599
},
{
"epoch": 0.05244182235332678,
"grad_norm": 0.26993119716644287,
"learning_rate": 0.00047398565932144104,
"loss": 0.9161,
"step": 600
},
{
"epoch": 0.05252922539058232,
"grad_norm": 0.31088942289352417,
"learning_rate": 0.00047394193774046874,
"loss": 0.938,
"step": 601
},
{
"epoch": 0.05261662842783787,
"grad_norm": 0.2921091318130493,
"learning_rate": 0.00047389821615949633,
"loss": 0.914,
"step": 602
},
{
"epoch": 0.052704031465093414,
"grad_norm": 0.4693572223186493,
"learning_rate": 0.000473854494578524,
"loss": 0.9083,
"step": 603
},
{
"epoch": 0.05279143450234896,
"grad_norm": 0.6201152801513672,
"learning_rate": 0.0004738107729975516,
"loss": 1.1098,
"step": 604
},
{
"epoch": 0.0528788375396045,
"grad_norm": 0.48871442675590515,
"learning_rate": 0.0004737670514165792,
"loss": 1.1571,
"step": 605
},
{
"epoch": 0.052966240576860044,
"grad_norm": 0.26332658529281616,
"learning_rate": 0.00047372332983560686,
"loss": 0.995,
"step": 606
},
{
"epoch": 0.05305364361411559,
"grad_norm": 0.7663961052894592,
"learning_rate": 0.0004736796082546345,
"loss": 1.0206,
"step": 607
},
{
"epoch": 0.053141046651371135,
"grad_norm": 0.3350706100463867,
"learning_rate": 0.00047363588667366215,
"loss": 1.0328,
"step": 608
},
{
"epoch": 0.05322844968862668,
"grad_norm": 0.30147233605384827,
"learning_rate": 0.00047359216509268974,
"loss": 0.8874,
"step": 609
},
{
"epoch": 0.053315852725882226,
"grad_norm": 0.4487704038619995,
"learning_rate": 0.0004735484435117174,
"loss": 0.8327,
"step": 610
},
{
"epoch": 0.05340325576313777,
"grad_norm": 0.474685400724411,
"learning_rate": 0.00047350472193074503,
"loss": 0.8405,
"step": 611
},
{
"epoch": 0.05349065880039331,
"grad_norm": 0.6512682437896729,
"learning_rate": 0.0004734610003497726,
"loss": 1.418,
"step": 612
},
{
"epoch": 0.053578061837648856,
"grad_norm": 0.3829117715358734,
"learning_rate": 0.0004734172787688003,
"loss": 0.9036,
"step": 613
},
{
"epoch": 0.0536654648749044,
"grad_norm": 0.3626525402069092,
"learning_rate": 0.0004733735571878279,
"loss": 0.9919,
"step": 614
},
{
"epoch": 0.05375286791215995,
"grad_norm": 0.6899876594543457,
"learning_rate": 0.00047332983560685555,
"loss": 0.8781,
"step": 615
},
{
"epoch": 0.05384027094941549,
"grad_norm": 0.33936572074890137,
"learning_rate": 0.0004732861140258832,
"loss": 0.7375,
"step": 616
},
{
"epoch": 0.05392767398667104,
"grad_norm": 0.45376959443092346,
"learning_rate": 0.0004732423924449108,
"loss": 0.868,
"step": 617
},
{
"epoch": 0.054015077023926585,
"grad_norm": 0.5580937266349792,
"learning_rate": 0.0004731986708639385,
"loss": 1.182,
"step": 618
},
{
"epoch": 0.05410248006118212,
"grad_norm": 0.3207378685474396,
"learning_rate": 0.0004731549492829661,
"loss": 0.9069,
"step": 619
},
{
"epoch": 0.05418988309843767,
"grad_norm": 0.3553832769393921,
"learning_rate": 0.0004731112277019937,
"loss": 1.4,
"step": 620
},
{
"epoch": 0.054277286135693215,
"grad_norm": 0.3708738386631012,
"learning_rate": 0.0004730675061210213,
"loss": 1.1475,
"step": 621
},
{
"epoch": 0.05436468917294876,
"grad_norm": 0.35041436553001404,
"learning_rate": 0.00047302378454004896,
"loss": 0.9505,
"step": 622
},
{
"epoch": 0.054452092210204306,
"grad_norm": 0.37304723262786865,
"learning_rate": 0.0004729800629590766,
"loss": 0.8858,
"step": 623
},
{
"epoch": 0.05453949524745985,
"grad_norm": 0.34602999687194824,
"learning_rate": 0.00047293634137810425,
"loss": 1.0687,
"step": 624
},
{
"epoch": 0.0546268982847154,
"grad_norm": 0.3194156587123871,
"learning_rate": 0.0004728926197971319,
"loss": 0.9222,
"step": 625
},
{
"epoch": 0.054714301321970936,
"grad_norm": 0.34864407777786255,
"learning_rate": 0.0004728488982161595,
"loss": 1.1291,
"step": 626
},
{
"epoch": 0.05480170435922648,
"grad_norm": 0.27222639322280884,
"learning_rate": 0.00047280517663518713,
"loss": 0.9762,
"step": 627
},
{
"epoch": 0.05488910739648203,
"grad_norm": 0.289035826921463,
"learning_rate": 0.0004727614550542148,
"loss": 0.84,
"step": 628
},
{
"epoch": 0.05497651043373757,
"grad_norm": 1.1678911447525024,
"learning_rate": 0.0004727177334732424,
"loss": 0.8835,
"step": 629
},
{
"epoch": 0.05506391347099312,
"grad_norm": 0.32149800658226013,
"learning_rate": 0.00047267401189227007,
"loss": 0.8814,
"step": 630
},
{
"epoch": 0.055151316508248664,
"grad_norm": 0.3312610387802124,
"learning_rate": 0.00047263029031129766,
"loss": 0.9001,
"step": 631
},
{
"epoch": 0.0552387195455042,
"grad_norm": 0.32734236121177673,
"learning_rate": 0.0004725865687303253,
"loss": 0.6587,
"step": 632
},
{
"epoch": 0.05532612258275975,
"grad_norm": 0.780978798866272,
"learning_rate": 0.0004725428471493529,
"loss": 1.1513,
"step": 633
},
{
"epoch": 0.055413525620015294,
"grad_norm": 0.3088547885417938,
"learning_rate": 0.00047249912556838054,
"loss": 0.8629,
"step": 634
},
{
"epoch": 0.05550092865727084,
"grad_norm": 0.34646108746528625,
"learning_rate": 0.00047245540398740824,
"loss": 0.8972,
"step": 635
},
{
"epoch": 0.055588331694526386,
"grad_norm": 0.47034963965415955,
"learning_rate": 0.00047241168240643583,
"loss": 1.414,
"step": 636
},
{
"epoch": 0.05567573473178193,
"grad_norm": 0.3200039565563202,
"learning_rate": 0.0004723679608254635,
"loss": 1.0516,
"step": 637
},
{
"epoch": 0.05576313776903748,
"grad_norm": 0.3332134187221527,
"learning_rate": 0.00047232423924449107,
"loss": 0.9086,
"step": 638
},
{
"epoch": 0.055850540806293016,
"grad_norm": 0.4804655611515045,
"learning_rate": 0.0004722805176635187,
"loss": 0.9719,
"step": 639
},
{
"epoch": 0.05593794384354856,
"grad_norm": 0.3591998219490051,
"learning_rate": 0.0004722367960825463,
"loss": 0.7201,
"step": 640
},
{
"epoch": 0.05602534688080411,
"grad_norm": 0.3319551944732666,
"learning_rate": 0.000472193074501574,
"loss": 1.1264,
"step": 641
},
{
"epoch": 0.05611274991805965,
"grad_norm": 0.3312825858592987,
"learning_rate": 0.00047214935292060165,
"loss": 1.0482,
"step": 642
},
{
"epoch": 0.0562001529553152,
"grad_norm": 0.3713119328022003,
"learning_rate": 0.00047210563133962924,
"loss": 1.1576,
"step": 643
},
{
"epoch": 0.056287555992570744,
"grad_norm": 0.35899418592453003,
"learning_rate": 0.0004720619097586569,
"loss": 0.7906,
"step": 644
},
{
"epoch": 0.05637495902982629,
"grad_norm": 0.31557363271713257,
"learning_rate": 0.0004720181881776845,
"loss": 0.9632,
"step": 645
},
{
"epoch": 0.05646236206708183,
"grad_norm": 0.40129950642585754,
"learning_rate": 0.00047197446659671217,
"loss": 1.3243,
"step": 646
},
{
"epoch": 0.056549765104337374,
"grad_norm": 0.3548416495323181,
"learning_rate": 0.0004719307450157398,
"loss": 1.0228,
"step": 647
},
{
"epoch": 0.05663716814159292,
"grad_norm": 0.5984897017478943,
"learning_rate": 0.0004718870234347674,
"loss": 0.9532,
"step": 648
},
{
"epoch": 0.056724571178848465,
"grad_norm": 0.2719477117061615,
"learning_rate": 0.00047184330185379505,
"loss": 0.9909,
"step": 649
},
{
"epoch": 0.05681197421610401,
"grad_norm": 0.2690770626068115,
"learning_rate": 0.00047179958027282264,
"loss": 0.9754,
"step": 650
},
{
"epoch": 0.05689937725335956,
"grad_norm": 0.3287508189678192,
"learning_rate": 0.0004717558586918503,
"loss": 0.823,
"step": 651
},
{
"epoch": 0.0569867802906151,
"grad_norm": 0.6442591547966003,
"learning_rate": 0.00047171213711087793,
"loss": 1.1211,
"step": 652
},
{
"epoch": 0.05707418332787064,
"grad_norm": 0.3647923469543457,
"learning_rate": 0.0004716684155299056,
"loss": 0.8892,
"step": 653
},
{
"epoch": 0.05716158636512619,
"grad_norm": 0.3035934269428253,
"learning_rate": 0.0004716246939489332,
"loss": 0.9781,
"step": 654
},
{
"epoch": 0.05724898940238173,
"grad_norm": 0.2986050546169281,
"learning_rate": 0.0004715809723679608,
"loss": 0.873,
"step": 655
},
{
"epoch": 0.05733639243963728,
"grad_norm": 0.3101188540458679,
"learning_rate": 0.00047153725078698846,
"loss": 1.1788,
"step": 656
},
{
"epoch": 0.057423795476892824,
"grad_norm": 1.2602791786193848,
"learning_rate": 0.0004714935292060161,
"loss": 1.376,
"step": 657
},
{
"epoch": 0.05751119851414837,
"grad_norm": 0.374224454164505,
"learning_rate": 0.00047144980762504375,
"loss": 0.9379,
"step": 658
},
{
"epoch": 0.05759860155140391,
"grad_norm": 0.35825932025909424,
"learning_rate": 0.0004714060860440714,
"loss": 0.9601,
"step": 659
},
{
"epoch": 0.057686004588659454,
"grad_norm": 0.37547796964645386,
"learning_rate": 0.000471362364463099,
"loss": 1.5432,
"step": 660
},
{
"epoch": 0.057773407625915,
"grad_norm": 0.30925118923187256,
"learning_rate": 0.00047131864288212663,
"loss": 0.9129,
"step": 661
},
{
"epoch": 0.057860810663170545,
"grad_norm": 0.43315598368644714,
"learning_rate": 0.0004712749213011542,
"loss": 0.7993,
"step": 662
},
{
"epoch": 0.05794821370042609,
"grad_norm": 1.0459505319595337,
"learning_rate": 0.0004712311997201819,
"loss": 1.4232,
"step": 663
},
{
"epoch": 0.058035616737681636,
"grad_norm": 0.4363897740840912,
"learning_rate": 0.0004711874781392095,
"loss": 1.3812,
"step": 664
},
{
"epoch": 0.05812301977493718,
"grad_norm": 0.2475530058145523,
"learning_rate": 0.00047114375655823716,
"loss": 0.8574,
"step": 665
},
{
"epoch": 0.05821042281219272,
"grad_norm": 0.352760910987854,
"learning_rate": 0.0004711000349772648,
"loss": 1.1236,
"step": 666
},
{
"epoch": 0.058297825849448266,
"grad_norm": 0.5032192468643188,
"learning_rate": 0.0004710563133962924,
"loss": 1.1754,
"step": 667
},
{
"epoch": 0.05838522888670381,
"grad_norm": 0.35939404368400574,
"learning_rate": 0.0004710125918153201,
"loss": 0.963,
"step": 668
},
{
"epoch": 0.05847263192395936,
"grad_norm": 0.4467969834804535,
"learning_rate": 0.0004709688702343477,
"loss": 2.0293,
"step": 669
},
{
"epoch": 0.0585600349612149,
"grad_norm": 0.3420664966106415,
"learning_rate": 0.00047092514865337533,
"loss": 1.0342,
"step": 670
},
{
"epoch": 0.05864743799847045,
"grad_norm": 0.3728554844856262,
"learning_rate": 0.0004708814270724029,
"loss": 0.9747,
"step": 671
},
{
"epoch": 0.058734841035725995,
"grad_norm": 1.2405109405517578,
"learning_rate": 0.00047083770549143057,
"loss": 1.6034,
"step": 672
},
{
"epoch": 0.05882224407298153,
"grad_norm": 0.3643404543399811,
"learning_rate": 0.0004707939839104582,
"loss": 0.7948,
"step": 673
},
{
"epoch": 0.05890964711023708,
"grad_norm": 0.31262850761413574,
"learning_rate": 0.00047075026232948586,
"loss": 0.8154,
"step": 674
},
{
"epoch": 0.058997050147492625,
"grad_norm": 0.49073535203933716,
"learning_rate": 0.0004707065407485135,
"loss": 0.9082,
"step": 675
},
{
"epoch": 0.05908445318474817,
"grad_norm": 0.39412635564804077,
"learning_rate": 0.0004706628191675411,
"loss": 1.0025,
"step": 676
},
{
"epoch": 0.059171856222003716,
"grad_norm": 0.40831953287124634,
"learning_rate": 0.00047061909758656874,
"loss": 1.0005,
"step": 677
},
{
"epoch": 0.05925925925925926,
"grad_norm": 0.5391172766685486,
"learning_rate": 0.0004705753760055964,
"loss": 0.9031,
"step": 678
},
{
"epoch": 0.05934666229651481,
"grad_norm": 0.31176143884658813,
"learning_rate": 0.000470531654424624,
"loss": 0.9589,
"step": 679
},
{
"epoch": 0.059434065333770346,
"grad_norm": 0.4320748448371887,
"learning_rate": 0.00047048793284365167,
"loss": 1.0996,
"step": 680
},
{
"epoch": 0.05952146837102589,
"grad_norm": 0.4102902412414551,
"learning_rate": 0.00047044421126267926,
"loss": 2.0338,
"step": 681
},
{
"epoch": 0.05960887140828144,
"grad_norm": 0.36022135615348816,
"learning_rate": 0.0004704004896817069,
"loss": 0.9675,
"step": 682
},
{
"epoch": 0.05969627444553698,
"grad_norm": 0.34680843353271484,
"learning_rate": 0.0004703567681007345,
"loss": 0.8765,
"step": 683
},
{
"epoch": 0.05978367748279253,
"grad_norm": 0.29740166664123535,
"learning_rate": 0.00047031304651976214,
"loss": 1.0053,
"step": 684
},
{
"epoch": 0.059871080520048074,
"grad_norm": 0.31341496109962463,
"learning_rate": 0.00047026932493878984,
"loss": 1.0295,
"step": 685
},
{
"epoch": 0.05995848355730361,
"grad_norm": 2.076716184616089,
"learning_rate": 0.00047022560335781743,
"loss": 1.5646,
"step": 686
},
{
"epoch": 0.06004588659455916,
"grad_norm": 0.2896002531051636,
"learning_rate": 0.0004701818817768451,
"loss": 0.9136,
"step": 687
},
{
"epoch": 0.060133289631814704,
"grad_norm": 0.37143734097480774,
"learning_rate": 0.00047013816019587267,
"loss": 0.8871,
"step": 688
},
{
"epoch": 0.06022069266907025,
"grad_norm": 0.49429547786712646,
"learning_rate": 0.0004700944386149003,
"loss": 1.1602,
"step": 689
},
{
"epoch": 0.060308095706325796,
"grad_norm": 0.3905726671218872,
"learning_rate": 0.0004700507170339279,
"loss": 1.1543,
"step": 690
},
{
"epoch": 0.06039549874358134,
"grad_norm": 0.3924982249736786,
"learning_rate": 0.0004700069954529556,
"loss": 0.8275,
"step": 691
},
{
"epoch": 0.06048290178083689,
"grad_norm": 0.27903103828430176,
"learning_rate": 0.00046996327387198325,
"loss": 0.8494,
"step": 692
},
{
"epoch": 0.060570304818092426,
"grad_norm": 0.382907897233963,
"learning_rate": 0.00046991955229101084,
"loss": 0.9531,
"step": 693
},
{
"epoch": 0.06065770785534797,
"grad_norm": 0.37153640389442444,
"learning_rate": 0.0004698758307100385,
"loss": 0.9131,
"step": 694
},
{
"epoch": 0.06074511089260352,
"grad_norm": 0.3007877767086029,
"learning_rate": 0.0004698321091290661,
"loss": 0.9513,
"step": 695
},
{
"epoch": 0.06083251392985906,
"grad_norm": 0.2546001672744751,
"learning_rate": 0.0004697883875480938,
"loss": 0.944,
"step": 696
},
{
"epoch": 0.06091991696711461,
"grad_norm": 0.27665847539901733,
"learning_rate": 0.0004697446659671214,
"loss": 0.7422,
"step": 697
},
{
"epoch": 0.061007320004370154,
"grad_norm": 0.28401628136634827,
"learning_rate": 0.000469700944386149,
"loss": 0.8458,
"step": 698
},
{
"epoch": 0.0610947230416257,
"grad_norm": 0.5097898840904236,
"learning_rate": 0.00046965722280517666,
"loss": 1.0018,
"step": 699
},
{
"epoch": 0.06118212607888124,
"grad_norm": 0.44888317584991455,
"learning_rate": 0.00046961350122420425,
"loss": 1.1203,
"step": 700
},
{
"epoch": 0.061269529116136784,
"grad_norm": 0.25764307379722595,
"learning_rate": 0.0004695697796432319,
"loss": 1.0156,
"step": 701
},
{
"epoch": 0.06135693215339233,
"grad_norm": 0.31590837240219116,
"learning_rate": 0.00046952605806225954,
"loss": 0.8823,
"step": 702
},
{
"epoch": 0.061444335190647875,
"grad_norm": 0.6337835192680359,
"learning_rate": 0.0004694823364812872,
"loss": 1.1565,
"step": 703
},
{
"epoch": 0.06153173822790342,
"grad_norm": 0.34477898478507996,
"learning_rate": 0.00046943861490031483,
"loss": 0.7563,
"step": 704
},
{
"epoch": 0.061619141265158967,
"grad_norm": 0.39787057042121887,
"learning_rate": 0.0004693948933193424,
"loss": 0.9804,
"step": 705
},
{
"epoch": 0.06170654430241451,
"grad_norm": 0.28919321298599243,
"learning_rate": 0.00046935117173837007,
"loss": 1.0019,
"step": 706
},
{
"epoch": 0.06179394733967005,
"grad_norm": 0.25737130641937256,
"learning_rate": 0.00046930745015739766,
"loss": 0.8751,
"step": 707
},
{
"epoch": 0.0618813503769256,
"grad_norm": 0.2699412703514099,
"learning_rate": 0.00046926372857642536,
"loss": 0.8999,
"step": 708
},
{
"epoch": 0.06196875341418114,
"grad_norm": 0.2957920730113983,
"learning_rate": 0.000469220006995453,
"loss": 0.9083,
"step": 709
},
{
"epoch": 0.06205615645143669,
"grad_norm": 0.2826875150203705,
"learning_rate": 0.0004691762854144806,
"loss": 0.946,
"step": 710
},
{
"epoch": 0.062143559488692234,
"grad_norm": 0.29016223549842834,
"learning_rate": 0.00046913256383350824,
"loss": 0.8126,
"step": 711
},
{
"epoch": 0.06223096252594778,
"grad_norm": 0.3504863679409027,
"learning_rate": 0.00046908884225253583,
"loss": 0.9127,
"step": 712
},
{
"epoch": 0.06231836556320332,
"grad_norm": 0.2627776861190796,
"learning_rate": 0.00046904512067156353,
"loss": 0.9476,
"step": 713
},
{
"epoch": 0.062405768600458864,
"grad_norm": 0.3002050220966339,
"learning_rate": 0.0004690013990905911,
"loss": 0.9444,
"step": 714
},
{
"epoch": 0.06249317163771441,
"grad_norm": 0.8539018630981445,
"learning_rate": 0.00046895767750961876,
"loss": 0.8977,
"step": 715
},
{
"epoch": 0.06258057467496995,
"grad_norm": 0.25260186195373535,
"learning_rate": 0.0004689139559286464,
"loss": 0.9615,
"step": 716
},
{
"epoch": 0.0626679777122255,
"grad_norm": 0.25615084171295166,
"learning_rate": 0.000468870234347674,
"loss": 0.8912,
"step": 717
},
{
"epoch": 0.06275538074948105,
"grad_norm": 0.3263600170612335,
"learning_rate": 0.00046882651276670164,
"loss": 0.843,
"step": 718
},
{
"epoch": 0.06284278378673659,
"grad_norm": 0.5694889426231384,
"learning_rate": 0.0004687827911857293,
"loss": 1.1624,
"step": 719
},
{
"epoch": 0.06293018682399214,
"grad_norm": 0.3248819410800934,
"learning_rate": 0.00046873906960475693,
"loss": 0.9452,
"step": 720
},
{
"epoch": 0.06301758986124768,
"grad_norm": 0.40857037901878357,
"learning_rate": 0.0004686953480237845,
"loss": 0.9117,
"step": 721
},
{
"epoch": 0.06310499289850323,
"grad_norm": 0.3211118280887604,
"learning_rate": 0.00046865162644281217,
"loss": 0.794,
"step": 722
},
{
"epoch": 0.06319239593575877,
"grad_norm": 0.32386934757232666,
"learning_rate": 0.0004686079048618398,
"loss": 1.2288,
"step": 723
},
{
"epoch": 0.0632797989730143,
"grad_norm": 0.3044579029083252,
"learning_rate": 0.00046856418328086746,
"loss": 0.9187,
"step": 724
},
{
"epoch": 0.06336720201026985,
"grad_norm": 0.6175875067710876,
"learning_rate": 0.0004685204616998951,
"loss": 0.8695,
"step": 725
},
{
"epoch": 0.0634546050475254,
"grad_norm": 0.7931004166603088,
"learning_rate": 0.0004684767401189227,
"loss": 1.3616,
"step": 726
},
{
"epoch": 0.06354200808478094,
"grad_norm": 0.337348997592926,
"learning_rate": 0.00046843301853795034,
"loss": 0.8654,
"step": 727
},
{
"epoch": 0.06362941112203649,
"grad_norm": 0.4152870178222656,
"learning_rate": 0.000468389296956978,
"loss": 1.2349,
"step": 728
},
{
"epoch": 0.06371681415929203,
"grad_norm": 0.3474035859107971,
"learning_rate": 0.0004683455753760056,
"loss": 0.9225,
"step": 729
},
{
"epoch": 0.06380421719654758,
"grad_norm": 0.35225990414619446,
"learning_rate": 0.0004683018537950333,
"loss": 0.9248,
"step": 730
},
{
"epoch": 0.06389162023380313,
"grad_norm": 0.24920597672462463,
"learning_rate": 0.00046825813221406087,
"loss": 0.8138,
"step": 731
},
{
"epoch": 0.06397902327105867,
"grad_norm": 0.3522126376628876,
"learning_rate": 0.0004682144106330885,
"loss": 0.9314,
"step": 732
},
{
"epoch": 0.06406642630831422,
"grad_norm": 0.4510492980480194,
"learning_rate": 0.0004681706890521161,
"loss": 0.8733,
"step": 733
},
{
"epoch": 0.06415382934556976,
"grad_norm": 0.2538619935512543,
"learning_rate": 0.00046812696747114375,
"loss": 0.8893,
"step": 734
},
{
"epoch": 0.06424123238282531,
"grad_norm": 0.39753592014312744,
"learning_rate": 0.0004680832458901714,
"loss": 1.0493,
"step": 735
},
{
"epoch": 0.06432863542008085,
"grad_norm": 0.40073463320732117,
"learning_rate": 0.00046803952430919904,
"loss": 0.8895,
"step": 736
},
{
"epoch": 0.06441603845733639,
"grad_norm": 0.31110239028930664,
"learning_rate": 0.0004679958027282267,
"loss": 0.8689,
"step": 737
},
{
"epoch": 0.06450344149459193,
"grad_norm": 0.29956865310668945,
"learning_rate": 0.0004679520811472543,
"loss": 0.8385,
"step": 738
},
{
"epoch": 0.06459084453184748,
"grad_norm": 0.3735499382019043,
"learning_rate": 0.0004679083595662819,
"loss": 0.8552,
"step": 739
},
{
"epoch": 0.06467824756910302,
"grad_norm": 0.4668900966644287,
"learning_rate": 0.0004678646379853095,
"loss": 1.4957,
"step": 740
},
{
"epoch": 0.06476565060635857,
"grad_norm": 0.363799512386322,
"learning_rate": 0.0004678209164043372,
"loss": 1.0365,
"step": 741
},
{
"epoch": 0.06485305364361411,
"grad_norm": 0.3261052668094635,
"learning_rate": 0.00046777719482336486,
"loss": 0.8972,
"step": 742
},
{
"epoch": 0.06494045668086966,
"grad_norm": 0.27814945578575134,
"learning_rate": 0.00046773347324239245,
"loss": 0.8051,
"step": 743
},
{
"epoch": 0.0650278597181252,
"grad_norm": 0.37245509028434753,
"learning_rate": 0.0004676897516614201,
"loss": 0.9421,
"step": 744
},
{
"epoch": 0.06511526275538075,
"grad_norm": 0.2978193163871765,
"learning_rate": 0.0004676460300804477,
"loss": 0.8464,
"step": 745
},
{
"epoch": 0.0652026657926363,
"grad_norm": 0.41827908158302307,
"learning_rate": 0.00046760230849947533,
"loss": 1.3154,
"step": 746
},
{
"epoch": 0.06529006882989184,
"grad_norm": 0.28153055906295776,
"learning_rate": 0.000467558586918503,
"loss": 0.812,
"step": 747
},
{
"epoch": 0.06537747186714739,
"grad_norm": 0.3568740487098694,
"learning_rate": 0.0004675148653375306,
"loss": 0.9333,
"step": 748
},
{
"epoch": 0.06546487490440293,
"grad_norm": 0.5805249810218811,
"learning_rate": 0.00046747114375655826,
"loss": 1.3821,
"step": 749
},
{
"epoch": 0.06555227794165848,
"grad_norm": 0.30053797364234924,
"learning_rate": 0.00046742742217558585,
"loss": 0.9358,
"step": 750
},
{
"epoch": 0.06563968097891401,
"grad_norm": 0.3179711699485779,
"learning_rate": 0.0004673837005946135,
"loss": 0.9094,
"step": 751
},
{
"epoch": 0.06572708401616956,
"grad_norm": 0.2717473804950714,
"learning_rate": 0.00046733997901364114,
"loss": 0.7255,
"step": 752
},
{
"epoch": 0.0658144870534251,
"grad_norm": 0.24072229862213135,
"learning_rate": 0.0004672962574326688,
"loss": 1.1008,
"step": 753
},
{
"epoch": 0.06590189009068065,
"grad_norm": 0.3099074363708496,
"learning_rate": 0.00046725253585169643,
"loss": 0.8751,
"step": 754
},
{
"epoch": 0.0659892931279362,
"grad_norm": 0.31873032450675964,
"learning_rate": 0.000467208814270724,
"loss": 0.8932,
"step": 755
},
{
"epoch": 0.06607669616519174,
"grad_norm": 0.31468328833580017,
"learning_rate": 0.00046716509268975167,
"loss": 0.8792,
"step": 756
},
{
"epoch": 0.06616409920244729,
"grad_norm": 0.35658881068229675,
"learning_rate": 0.00046712137110877926,
"loss": 0.8955,
"step": 757
},
{
"epoch": 0.06625150223970283,
"grad_norm": 0.3107976019382477,
"learning_rate": 0.00046707764952780696,
"loss": 0.9174,
"step": 758
},
{
"epoch": 0.06633890527695838,
"grad_norm": 0.2277815192937851,
"learning_rate": 0.0004670339279468346,
"loss": 0.7611,
"step": 759
},
{
"epoch": 0.06642630831421392,
"grad_norm": 0.25561246275901794,
"learning_rate": 0.0004669902063658622,
"loss": 0.8041,
"step": 760
},
{
"epoch": 0.06651371135146947,
"grad_norm": 0.2826947271823883,
"learning_rate": 0.00046694648478488984,
"loss": 0.7732,
"step": 761
},
{
"epoch": 0.06660111438872501,
"grad_norm": 0.2515583038330078,
"learning_rate": 0.00046690276320391743,
"loss": 1.0321,
"step": 762
},
{
"epoch": 0.06668851742598056,
"grad_norm": 0.26518338918685913,
"learning_rate": 0.0004668590416229451,
"loss": 1.1347,
"step": 763
},
{
"epoch": 0.06677592046323609,
"grad_norm": 0.2963607609272003,
"learning_rate": 0.0004668153200419727,
"loss": 0.9982,
"step": 764
},
{
"epoch": 0.06686332350049164,
"grad_norm": 0.2876517176628113,
"learning_rate": 0.00046677159846100037,
"loss": 0.6918,
"step": 765
},
{
"epoch": 0.06695072653774718,
"grad_norm": 0.3714672923088074,
"learning_rate": 0.000466727876880028,
"loss": 0.9023,
"step": 766
},
{
"epoch": 0.06703812957500273,
"grad_norm": 0.3568623960018158,
"learning_rate": 0.0004666841552990556,
"loss": 0.8378,
"step": 767
},
{
"epoch": 0.06712553261225827,
"grad_norm": 0.4770544469356537,
"learning_rate": 0.00046664043371808325,
"loss": 1.0266,
"step": 768
},
{
"epoch": 0.06721293564951382,
"grad_norm": 0.2760886549949646,
"learning_rate": 0.0004665967121371109,
"loss": 0.8276,
"step": 769
},
{
"epoch": 0.06730033868676936,
"grad_norm": 0.31360816955566406,
"learning_rate": 0.00046655299055613854,
"loss": 0.8646,
"step": 770
},
{
"epoch": 0.06738774172402491,
"grad_norm": 0.3075156509876251,
"learning_rate": 0.00046650926897516613,
"loss": 1.1144,
"step": 771
},
{
"epoch": 0.06747514476128046,
"grad_norm": 0.3104390501976013,
"learning_rate": 0.0004664655473941938,
"loss": 0.8923,
"step": 772
},
{
"epoch": 0.067562547798536,
"grad_norm": 0.3964294493198395,
"learning_rate": 0.0004664218258132214,
"loss": 1.0969,
"step": 773
},
{
"epoch": 0.06764995083579155,
"grad_norm": 0.3698040843009949,
"learning_rate": 0.000466378104232249,
"loss": 0.9078,
"step": 774
},
{
"epoch": 0.0677373538730471,
"grad_norm": 0.28510838747024536,
"learning_rate": 0.0004663343826512767,
"loss": 1.0075,
"step": 775
},
{
"epoch": 0.06782475691030264,
"grad_norm": 0.25500908493995667,
"learning_rate": 0.0004662906610703043,
"loss": 0.8457,
"step": 776
},
{
"epoch": 0.06791215994755818,
"grad_norm": 0.27927708625793457,
"learning_rate": 0.00046624693948933195,
"loss": 1.01,
"step": 777
},
{
"epoch": 0.06799956298481372,
"grad_norm": 0.2683468461036682,
"learning_rate": 0.0004662032179083596,
"loss": 1.0491,
"step": 778
},
{
"epoch": 0.06808696602206926,
"grad_norm": 0.31843262910842896,
"learning_rate": 0.0004661594963273872,
"loss": 0.9467,
"step": 779
},
{
"epoch": 0.06817436905932481,
"grad_norm": 0.27564141154289246,
"learning_rate": 0.0004661157747464149,
"loss": 0.9487,
"step": 780
},
{
"epoch": 0.06826177209658035,
"grad_norm": 0.2407764047384262,
"learning_rate": 0.00046607205316544247,
"loss": 0.8939,
"step": 781
},
{
"epoch": 0.0683491751338359,
"grad_norm": 0.3025217652320862,
"learning_rate": 0.0004660283315844701,
"loss": 0.9859,
"step": 782
},
{
"epoch": 0.06843657817109144,
"grad_norm": 0.2979051470756531,
"learning_rate": 0.0004659846100034977,
"loss": 0.9136,
"step": 783
},
{
"epoch": 0.06852398120834699,
"grad_norm": 0.28788650035858154,
"learning_rate": 0.00046594088842252535,
"loss": 0.9734,
"step": 784
},
{
"epoch": 0.06861138424560254,
"grad_norm": 0.2947753667831421,
"learning_rate": 0.000465897166841553,
"loss": 0.735,
"step": 785
},
{
"epoch": 0.06869878728285808,
"grad_norm": 0.3203105032444,
"learning_rate": 0.00046585344526058064,
"loss": 0.8992,
"step": 786
},
{
"epoch": 0.06878619032011363,
"grad_norm": 0.2638401985168457,
"learning_rate": 0.0004658097236796083,
"loss": 0.8669,
"step": 787
},
{
"epoch": 0.06887359335736917,
"grad_norm": 0.26712629199028015,
"learning_rate": 0.0004657660020986359,
"loss": 0.9765,
"step": 788
},
{
"epoch": 0.06896099639462472,
"grad_norm": 0.4055823087692261,
"learning_rate": 0.0004657222805176635,
"loss": 0.8117,
"step": 789
},
{
"epoch": 0.06904839943188026,
"grad_norm": 0.2518852651119232,
"learning_rate": 0.00046567855893669117,
"loss": 0.9517,
"step": 790
},
{
"epoch": 0.0691358024691358,
"grad_norm": 0.27589836716651917,
"learning_rate": 0.00046563483735571876,
"loss": 0.7855,
"step": 791
},
{
"epoch": 0.06922320550639134,
"grad_norm": 0.2739314138889313,
"learning_rate": 0.00046559111577474646,
"loss": 0.8862,
"step": 792
},
{
"epoch": 0.06931060854364689,
"grad_norm": 0.3271756172180176,
"learning_rate": 0.00046554739419377405,
"loss": 1.2893,
"step": 793
},
{
"epoch": 0.06939801158090243,
"grad_norm": 0.27038949728012085,
"learning_rate": 0.0004655036726128017,
"loss": 0.8059,
"step": 794
},
{
"epoch": 0.06948541461815798,
"grad_norm": 0.2605447471141815,
"learning_rate": 0.0004654599510318293,
"loss": 0.8816,
"step": 795
},
{
"epoch": 0.06957281765541352,
"grad_norm": 0.2714409828186035,
"learning_rate": 0.00046541622945085693,
"loss": 0.9307,
"step": 796
},
{
"epoch": 0.06966022069266907,
"grad_norm": 0.2455201894044876,
"learning_rate": 0.00046537250786988463,
"loss": 0.8321,
"step": 797
},
{
"epoch": 0.06974762372992462,
"grad_norm": 0.29036253690719604,
"learning_rate": 0.0004653287862889122,
"loss": 0.8605,
"step": 798
},
{
"epoch": 0.06983502676718016,
"grad_norm": 0.24069538712501526,
"learning_rate": 0.00046528506470793987,
"loss": 1.0819,
"step": 799
},
{
"epoch": 0.0699224298044357,
"grad_norm": 0.254304975271225,
"learning_rate": 0.00046524134312696746,
"loss": 0.7388,
"step": 800
},
{
"epoch": 0.07000983284169125,
"grad_norm": 0.27309149503707886,
"learning_rate": 0.0004651976215459951,
"loss": 0.7796,
"step": 801
},
{
"epoch": 0.0700972358789468,
"grad_norm": 0.26903948187828064,
"learning_rate": 0.0004651538999650227,
"loss": 1.0103,
"step": 802
},
{
"epoch": 0.07018463891620234,
"grad_norm": 0.2526533901691437,
"learning_rate": 0.0004651101783840504,
"loss": 0.8566,
"step": 803
},
{
"epoch": 0.07027204195345789,
"grad_norm": 0.2822379469871521,
"learning_rate": 0.00046506645680307804,
"loss": 0.9441,
"step": 804
},
{
"epoch": 0.07035944499071342,
"grad_norm": 0.27883851528167725,
"learning_rate": 0.00046502273522210563,
"loss": 0.9006,
"step": 805
},
{
"epoch": 0.07044684802796897,
"grad_norm": 0.23839306831359863,
"learning_rate": 0.0004649790136411333,
"loss": 0.8387,
"step": 806
},
{
"epoch": 0.07053425106522451,
"grad_norm": 0.2352200597524643,
"learning_rate": 0.00046493529206016087,
"loss": 0.8228,
"step": 807
},
{
"epoch": 0.07062165410248006,
"grad_norm": 0.31958913803100586,
"learning_rate": 0.00046489157047918857,
"loss": 1.0312,
"step": 808
},
{
"epoch": 0.0707090571397356,
"grad_norm": 0.286045640707016,
"learning_rate": 0.0004648478488982162,
"loss": 0.8427,
"step": 809
},
{
"epoch": 0.07079646017699115,
"grad_norm": 0.24101607501506805,
"learning_rate": 0.0004648041273172438,
"loss": 0.9986,
"step": 810
},
{
"epoch": 0.0708838632142467,
"grad_norm": 0.28324073553085327,
"learning_rate": 0.00046476040573627145,
"loss": 0.778,
"step": 811
},
{
"epoch": 0.07097126625150224,
"grad_norm": 0.30368572473526,
"learning_rate": 0.00046471668415529904,
"loss": 0.9543,
"step": 812
},
{
"epoch": 0.07105866928875779,
"grad_norm": 0.3159104585647583,
"learning_rate": 0.0004646729625743267,
"loss": 0.9481,
"step": 813
},
{
"epoch": 0.07114607232601333,
"grad_norm": 0.2856074869632721,
"learning_rate": 0.00046462924099335433,
"loss": 1.0117,
"step": 814
},
{
"epoch": 0.07123347536326888,
"grad_norm": 0.32605329155921936,
"learning_rate": 0.00046458551941238197,
"loss": 0.8451,
"step": 815
},
{
"epoch": 0.07132087840052442,
"grad_norm": 0.22008907794952393,
"learning_rate": 0.0004645417978314096,
"loss": 0.8965,
"step": 816
},
{
"epoch": 0.07140828143777997,
"grad_norm": 0.26317551732063293,
"learning_rate": 0.0004644980762504372,
"loss": 0.8644,
"step": 817
},
{
"epoch": 0.0714956844750355,
"grad_norm": 0.22049389779567719,
"learning_rate": 0.00046445435466946485,
"loss": 0.8144,
"step": 818
},
{
"epoch": 0.07158308751229105,
"grad_norm": 0.2786102890968323,
"learning_rate": 0.00046441063308849244,
"loss": 0.8841,
"step": 819
},
{
"epoch": 0.07167049054954659,
"grad_norm": 0.31796136498451233,
"learning_rate": 0.00046436691150752014,
"loss": 1.0665,
"step": 820
},
{
"epoch": 0.07175789358680214,
"grad_norm": 0.29958993196487427,
"learning_rate": 0.0004643231899265478,
"loss": 0.8789,
"step": 821
},
{
"epoch": 0.07184529662405768,
"grad_norm": 0.2706652283668518,
"learning_rate": 0.0004642794683455754,
"loss": 0.8721,
"step": 822
},
{
"epoch": 0.07193269966131323,
"grad_norm": 0.22537319362163544,
"learning_rate": 0.000464235746764603,
"loss": 0.9403,
"step": 823
},
{
"epoch": 0.07202010269856877,
"grad_norm": 0.34331005811691284,
"learning_rate": 0.0004641920251836306,
"loss": 1.1497,
"step": 824
},
{
"epoch": 0.07210750573582432,
"grad_norm": 0.25914907455444336,
"learning_rate": 0.0004641483036026583,
"loss": 1.1589,
"step": 825
},
{
"epoch": 0.07219490877307987,
"grad_norm": 0.2956130802631378,
"learning_rate": 0.0004641045820216859,
"loss": 0.8587,
"step": 826
},
{
"epoch": 0.07228231181033541,
"grad_norm": 0.30292391777038574,
"learning_rate": 0.00046406086044071355,
"loss": 0.9224,
"step": 827
},
{
"epoch": 0.07236971484759096,
"grad_norm": 0.3101223409175873,
"learning_rate": 0.0004640171388597412,
"loss": 0.9115,
"step": 828
},
{
"epoch": 0.0724571178848465,
"grad_norm": 0.2720979154109955,
"learning_rate": 0.0004639734172787688,
"loss": 0.8112,
"step": 829
},
{
"epoch": 0.07254452092210205,
"grad_norm": 0.2774461507797241,
"learning_rate": 0.00046392969569779643,
"loss": 0.9776,
"step": 830
},
{
"epoch": 0.0726319239593576,
"grad_norm": 0.25150200724601746,
"learning_rate": 0.0004638859741168241,
"loss": 1.0255,
"step": 831
},
{
"epoch": 0.07271932699661313,
"grad_norm": 0.2526938319206238,
"learning_rate": 0.0004638422525358517,
"loss": 0.7242,
"step": 832
},
{
"epoch": 0.07280673003386867,
"grad_norm": 0.29642441868782043,
"learning_rate": 0.0004637985309548793,
"loss": 1.0944,
"step": 833
},
{
"epoch": 0.07289413307112422,
"grad_norm": 0.250478595495224,
"learning_rate": 0.00046375480937390696,
"loss": 0.8324,
"step": 834
},
{
"epoch": 0.07298153610837976,
"grad_norm": 0.28843697905540466,
"learning_rate": 0.0004637110877929346,
"loss": 0.8646,
"step": 835
},
{
"epoch": 0.07306893914563531,
"grad_norm": 0.22244645655155182,
"learning_rate": 0.00046366736621196225,
"loss": 0.7966,
"step": 836
},
{
"epoch": 0.07315634218289085,
"grad_norm": 0.2418157458305359,
"learning_rate": 0.0004636236446309899,
"loss": 0.8101,
"step": 837
},
{
"epoch": 0.0732437452201464,
"grad_norm": 0.2781657874584198,
"learning_rate": 0.0004635799230500175,
"loss": 0.9902,
"step": 838
},
{
"epoch": 0.07333114825740195,
"grad_norm": 0.24249030649662018,
"learning_rate": 0.00046353620146904513,
"loss": 0.7445,
"step": 839
},
{
"epoch": 0.07341855129465749,
"grad_norm": 0.23980437219142914,
"learning_rate": 0.0004634924798880728,
"loss": 0.8168,
"step": 840
},
{
"epoch": 0.07350595433191304,
"grad_norm": 0.3362947106361389,
"learning_rate": 0.00046344875830710037,
"loss": 1.1176,
"step": 841
},
{
"epoch": 0.07359335736916858,
"grad_norm": 0.23380422592163086,
"learning_rate": 0.00046340503672612807,
"loss": 0.8311,
"step": 842
},
{
"epoch": 0.07368076040642413,
"grad_norm": 0.2908138632774353,
"learning_rate": 0.00046336131514515566,
"loss": 0.8315,
"step": 843
},
{
"epoch": 0.07376816344367967,
"grad_norm": 0.2556897699832916,
"learning_rate": 0.0004633175935641833,
"loss": 0.939,
"step": 844
},
{
"epoch": 0.0738555664809352,
"grad_norm": 0.3416728079319,
"learning_rate": 0.0004632738719832109,
"loss": 0.746,
"step": 845
},
{
"epoch": 0.07394296951819075,
"grad_norm": 0.2219434678554535,
"learning_rate": 0.00046323015040223854,
"loss": 1.0259,
"step": 846
},
{
"epoch": 0.0740303725554463,
"grad_norm": 0.3327368497848511,
"learning_rate": 0.0004631864288212662,
"loss": 1.4831,
"step": 847
},
{
"epoch": 0.07411777559270184,
"grad_norm": 0.28128185868263245,
"learning_rate": 0.00046314270724029383,
"loss": 0.9478,
"step": 848
},
{
"epoch": 0.07420517862995739,
"grad_norm": 0.29582032561302185,
"learning_rate": 0.00046309898565932147,
"loss": 0.9397,
"step": 849
},
{
"epoch": 0.07429258166721293,
"grad_norm": 0.26146262884140015,
"learning_rate": 0.00046305526407834906,
"loss": 0.6904,
"step": 850
},
{
"epoch": 0.07437998470446848,
"grad_norm": 0.3188638389110565,
"learning_rate": 0.0004630115424973767,
"loss": 0.7268,
"step": 851
},
{
"epoch": 0.07446738774172403,
"grad_norm": 0.2691085934638977,
"learning_rate": 0.0004629678209164043,
"loss": 0.7836,
"step": 852
},
{
"epoch": 0.07455479077897957,
"grad_norm": 0.2730037569999695,
"learning_rate": 0.000462924099335432,
"loss": 0.8207,
"step": 853
},
{
"epoch": 0.07464219381623512,
"grad_norm": 0.23849952220916748,
"learning_rate": 0.00046288037775445964,
"loss": 0.9859,
"step": 854
},
{
"epoch": 0.07472959685349066,
"grad_norm": 0.24940194189548492,
"learning_rate": 0.00046283665617348723,
"loss": 0.7821,
"step": 855
},
{
"epoch": 0.07481699989074621,
"grad_norm": 0.23495396971702576,
"learning_rate": 0.0004627929345925149,
"loss": 0.8847,
"step": 856
},
{
"epoch": 0.07490440292800175,
"grad_norm": 0.25201091170310974,
"learning_rate": 0.00046274921301154247,
"loss": 0.8386,
"step": 857
},
{
"epoch": 0.0749918059652573,
"grad_norm": 0.25054988265037537,
"learning_rate": 0.0004627054914305701,
"loss": 0.9939,
"step": 858
},
{
"epoch": 0.07507920900251283,
"grad_norm": 0.39931726455688477,
"learning_rate": 0.0004626617698495978,
"loss": 1.1039,
"step": 859
},
{
"epoch": 0.07516661203976838,
"grad_norm": 0.2789982855319977,
"learning_rate": 0.0004626180482686254,
"loss": 1.1707,
"step": 860
},
{
"epoch": 0.07525401507702392,
"grad_norm": 0.282528817653656,
"learning_rate": 0.00046257432668765305,
"loss": 0.8738,
"step": 861
},
{
"epoch": 0.07534141811427947,
"grad_norm": 0.2707865536212921,
"learning_rate": 0.00046253060510668064,
"loss": 0.832,
"step": 862
},
{
"epoch": 0.07542882115153501,
"grad_norm": 0.19732601940631866,
"learning_rate": 0.0004624868835257083,
"loss": 0.8948,
"step": 863
},
{
"epoch": 0.07551622418879056,
"grad_norm": 0.2605394721031189,
"learning_rate": 0.00046244316194473593,
"loss": 0.7346,
"step": 864
},
{
"epoch": 0.0756036272260461,
"grad_norm": 0.26202288269996643,
"learning_rate": 0.0004623994403637636,
"loss": 0.8521,
"step": 865
},
{
"epoch": 0.07569103026330165,
"grad_norm": 0.3473947048187256,
"learning_rate": 0.0004623557187827912,
"loss": 1.043,
"step": 866
},
{
"epoch": 0.0757784333005572,
"grad_norm": 0.7824636697769165,
"learning_rate": 0.0004623119972018188,
"loss": 1.2121,
"step": 867
},
{
"epoch": 0.07586583633781274,
"grad_norm": 0.26076897978782654,
"learning_rate": 0.00046226827562084646,
"loss": 0.8669,
"step": 868
},
{
"epoch": 0.07595323937506829,
"grad_norm": 0.3360956013202667,
"learning_rate": 0.00046222455403987405,
"loss": 0.8806,
"step": 869
},
{
"epoch": 0.07604064241232383,
"grad_norm": 0.27572354674339294,
"learning_rate": 0.00046218083245890175,
"loss": 0.8105,
"step": 870
},
{
"epoch": 0.07612804544957938,
"grad_norm": 0.22802734375,
"learning_rate": 0.0004621371108779294,
"loss": 0.6879,
"step": 871
},
{
"epoch": 0.07621544848683491,
"grad_norm": 0.31544265151023865,
"learning_rate": 0.000462093389296957,
"loss": 0.835,
"step": 872
},
{
"epoch": 0.07630285152409046,
"grad_norm": 0.3530902564525604,
"learning_rate": 0.00046204966771598463,
"loss": 0.7543,
"step": 873
},
{
"epoch": 0.076390254561346,
"grad_norm": 0.28108978271484375,
"learning_rate": 0.0004620059461350122,
"loss": 0.9433,
"step": 874
},
{
"epoch": 0.07647765759860155,
"grad_norm": 0.2918491065502167,
"learning_rate": 0.00046196222455403987,
"loss": 0.9016,
"step": 875
},
{
"epoch": 0.0765650606358571,
"grad_norm": 0.3130475580692291,
"learning_rate": 0.0004619185029730675,
"loss": 0.8612,
"step": 876
},
{
"epoch": 0.07665246367311264,
"grad_norm": 0.2697352468967438,
"learning_rate": 0.00046187478139209516,
"loss": 1.0324,
"step": 877
},
{
"epoch": 0.07673986671036818,
"grad_norm": 0.3534733057022095,
"learning_rate": 0.0004618310598111228,
"loss": 0.7769,
"step": 878
},
{
"epoch": 0.07682726974762373,
"grad_norm": 0.46239951252937317,
"learning_rate": 0.0004617873382301504,
"loss": 0.8155,
"step": 879
},
{
"epoch": 0.07691467278487928,
"grad_norm": 0.2869885265827179,
"learning_rate": 0.00046174361664917804,
"loss": 0.8088,
"step": 880
},
{
"epoch": 0.07700207582213482,
"grad_norm": 0.544746458530426,
"learning_rate": 0.0004616998950682057,
"loss": 1.0332,
"step": 881
},
{
"epoch": 0.07708947885939037,
"grad_norm": 0.28001531958580017,
"learning_rate": 0.0004616561734872333,
"loss": 0.8363,
"step": 882
},
{
"epoch": 0.07717688189664591,
"grad_norm": 0.244185671210289,
"learning_rate": 0.0004616124519062609,
"loss": 0.8611,
"step": 883
},
{
"epoch": 0.07726428493390146,
"grad_norm": 0.3561322093009949,
"learning_rate": 0.00046156873032528856,
"loss": 0.9298,
"step": 884
},
{
"epoch": 0.077351687971157,
"grad_norm": 0.2852579355239868,
"learning_rate": 0.0004615250087443162,
"loss": 0.9415,
"step": 885
},
{
"epoch": 0.07743909100841254,
"grad_norm": 0.3458700180053711,
"learning_rate": 0.0004614812871633438,
"loss": 0.7855,
"step": 886
},
{
"epoch": 0.07752649404566808,
"grad_norm": 0.33211758732795715,
"learning_rate": 0.0004614375655823715,
"loss": 0.7652,
"step": 887
},
{
"epoch": 0.07761389708292363,
"grad_norm": 0.2643268406391144,
"learning_rate": 0.0004613938440013991,
"loss": 0.813,
"step": 888
},
{
"epoch": 0.07770130012017917,
"grad_norm": 0.26717138290405273,
"learning_rate": 0.00046135012242042673,
"loss": 0.673,
"step": 889
},
{
"epoch": 0.07778870315743472,
"grad_norm": 0.2716834843158722,
"learning_rate": 0.0004613064008394544,
"loss": 1.0343,
"step": 890
},
{
"epoch": 0.07787610619469026,
"grad_norm": 0.4963998794555664,
"learning_rate": 0.00046126267925848197,
"loss": 1.3856,
"step": 891
},
{
"epoch": 0.07796350923194581,
"grad_norm": 0.3124493360519409,
"learning_rate": 0.00046121895767750967,
"loss": 1.0451,
"step": 892
},
{
"epoch": 0.07805091226920136,
"grad_norm": 0.5837683081626892,
"learning_rate": 0.00046117523609653726,
"loss": 1.0501,
"step": 893
},
{
"epoch": 0.0781383153064569,
"grad_norm": 0.31839168071746826,
"learning_rate": 0.0004611315145155649,
"loss": 0.9903,
"step": 894
},
{
"epoch": 0.07822571834371245,
"grad_norm": 0.5437602996826172,
"learning_rate": 0.0004610877929345925,
"loss": 1.0399,
"step": 895
},
{
"epoch": 0.07831312138096799,
"grad_norm": 0.3862234354019165,
"learning_rate": 0.00046104407135362014,
"loss": 1.0355,
"step": 896
},
{
"epoch": 0.07840052441822354,
"grad_norm": 0.7273140549659729,
"learning_rate": 0.0004610003497726478,
"loss": 0.9339,
"step": 897
},
{
"epoch": 0.07848792745547908,
"grad_norm": 0.31776732206344604,
"learning_rate": 0.00046095662819167543,
"loss": 1.405,
"step": 898
},
{
"epoch": 0.07857533049273462,
"grad_norm": 0.33975592255592346,
"learning_rate": 0.0004609129066107031,
"loss": 0.9493,
"step": 899
},
{
"epoch": 0.07866273352999016,
"grad_norm": 0.3096635937690735,
"learning_rate": 0.00046086918502973067,
"loss": 0.8949,
"step": 900
},
{
"epoch": 0.07875013656724571,
"grad_norm": 0.22939470410346985,
"learning_rate": 0.0004608254634487583,
"loss": 1.0486,
"step": 901
},
{
"epoch": 0.07883753960450125,
"grad_norm": 0.27594518661499023,
"learning_rate": 0.0004607817418677859,
"loss": 0.7005,
"step": 902
},
{
"epoch": 0.0789249426417568,
"grad_norm": 0.38164445757865906,
"learning_rate": 0.0004607380202868136,
"loss": 1.2305,
"step": 903
},
{
"epoch": 0.07901234567901234,
"grad_norm": 0.26803824305534363,
"learning_rate": 0.00046069429870584125,
"loss": 0.824,
"step": 904
},
{
"epoch": 0.07909974871626789,
"grad_norm": 0.3049018085002899,
"learning_rate": 0.00046065057712486884,
"loss": 0.8824,
"step": 905
},
{
"epoch": 0.07918715175352344,
"grad_norm": 0.30478763580322266,
"learning_rate": 0.0004606068555438965,
"loss": 0.9809,
"step": 906
},
{
"epoch": 0.07927455479077898,
"grad_norm": 0.276212602853775,
"learning_rate": 0.0004605631339629241,
"loss": 0.8166,
"step": 907
},
{
"epoch": 0.07936195782803453,
"grad_norm": 0.8416312336921692,
"learning_rate": 0.0004605194123819517,
"loss": 1.5118,
"step": 908
},
{
"epoch": 0.07944936086529007,
"grad_norm": 0.3249102532863617,
"learning_rate": 0.0004604756908009794,
"loss": 0.905,
"step": 909
},
{
"epoch": 0.07953676390254562,
"grad_norm": 0.3695957064628601,
"learning_rate": 0.000460431969220007,
"loss": 0.809,
"step": 910
},
{
"epoch": 0.07962416693980116,
"grad_norm": 0.2533642649650574,
"learning_rate": 0.00046038824763903466,
"loss": 0.8706,
"step": 911
},
{
"epoch": 0.07971156997705671,
"grad_norm": 1.895600438117981,
"learning_rate": 0.00046034452605806225,
"loss": 0.906,
"step": 912
},
{
"epoch": 0.07979897301431224,
"grad_norm": 0.3041301369667053,
"learning_rate": 0.0004603008044770899,
"loss": 0.8028,
"step": 913
},
{
"epoch": 0.07988637605156779,
"grad_norm": 0.39580902457237244,
"learning_rate": 0.0004602570828961175,
"loss": 0.8785,
"step": 914
},
{
"epoch": 0.07997377908882333,
"grad_norm": 0.3260571360588074,
"learning_rate": 0.0004602133613151452,
"loss": 0.908,
"step": 915
},
{
"epoch": 0.08006118212607888,
"grad_norm": 0.3628925681114197,
"learning_rate": 0.0004601696397341728,
"loss": 0.8364,
"step": 916
},
{
"epoch": 0.08014858516333442,
"grad_norm": 0.4076823890209198,
"learning_rate": 0.0004601259181532004,
"loss": 1.93,
"step": 917
},
{
"epoch": 0.08023598820058997,
"grad_norm": 0.6916859149932861,
"learning_rate": 0.00046008219657222806,
"loss": 1.1446,
"step": 918
},
{
"epoch": 0.08032339123784552,
"grad_norm": 1.301007866859436,
"learning_rate": 0.00046003847499125565,
"loss": 1.117,
"step": 919
},
{
"epoch": 0.08041079427510106,
"grad_norm": 2.9351885318756104,
"learning_rate": 0.00045999475341028335,
"loss": 1.8147,
"step": 920
},
{
"epoch": 0.0804981973123566,
"grad_norm": 3.5363566875457764,
"learning_rate": 0.000459951031829311,
"loss": 1.4487,
"step": 921
},
{
"epoch": 0.08058560034961215,
"grad_norm": 1.0070669651031494,
"learning_rate": 0.0004599073102483386,
"loss": 0.9901,
"step": 922
},
{
"epoch": 0.0806730033868677,
"grad_norm": 0.42096540331840515,
"learning_rate": 0.00045986358866736623,
"loss": 0.8757,
"step": 923
},
{
"epoch": 0.08076040642412324,
"grad_norm": 0.7990926504135132,
"learning_rate": 0.0004598198670863938,
"loss": 1.1409,
"step": 924
},
{
"epoch": 0.08084780946137879,
"grad_norm": 0.6880809664726257,
"learning_rate": 0.00045977614550542147,
"loss": 0.9678,
"step": 925
},
{
"epoch": 0.08093521249863432,
"grad_norm": 0.7126320004463196,
"learning_rate": 0.0004597324239244491,
"loss": 0.8932,
"step": 926
},
{
"epoch": 0.08102261553588987,
"grad_norm": 1.2712117433547974,
"learning_rate": 0.00045968870234347676,
"loss": 1.7774,
"step": 927
},
{
"epoch": 0.08111001857314541,
"grad_norm": 1.9836965799331665,
"learning_rate": 0.0004596449807625044,
"loss": 1.1419,
"step": 928
},
{
"epoch": 0.08119742161040096,
"grad_norm": 0.6894294023513794,
"learning_rate": 0.000459601259181532,
"loss": 0.9666,
"step": 929
},
{
"epoch": 0.0812848246476565,
"grad_norm": 2.2530252933502197,
"learning_rate": 0.00045955753760055964,
"loss": 1.5093,
"step": 930
},
{
"epoch": 0.08137222768491205,
"grad_norm": 14.37427043914795,
"learning_rate": 0.0004595138160195873,
"loss": 1.3134,
"step": 931
},
{
"epoch": 0.0814596307221676,
"grad_norm": 3.392730236053467,
"learning_rate": 0.00045947009443861493,
"loss": 1.0883,
"step": 932
},
{
"epoch": 0.08154703375942314,
"grad_norm": 1.097122073173523,
"learning_rate": 0.0004594263728576425,
"loss": 1.0587,
"step": 933
},
{
"epoch": 0.08163443679667869,
"grad_norm": 0.7270208597183228,
"learning_rate": 0.00045938265127667017,
"loss": 1.1386,
"step": 934
},
{
"epoch": 0.08172183983393423,
"grad_norm": 3.5602266788482666,
"learning_rate": 0.0004593389296956978,
"loss": 1.1204,
"step": 935
},
{
"epoch": 0.08180924287118978,
"grad_norm": 1.953038215637207,
"learning_rate": 0.0004592952081147254,
"loss": 1.2367,
"step": 936
},
{
"epoch": 0.08189664590844532,
"grad_norm": 1.90444016456604,
"learning_rate": 0.0004592514865337531,
"loss": 1.1981,
"step": 937
},
{
"epoch": 0.08198404894570087,
"grad_norm": 9.526935577392578,
"learning_rate": 0.0004592077649527807,
"loss": 1.4363,
"step": 938
},
{
"epoch": 0.08207145198295641,
"grad_norm": 5.361575603485107,
"learning_rate": 0.00045916404337180834,
"loss": 1.4758,
"step": 939
},
{
"epoch": 0.08215885502021195,
"grad_norm": 49.836151123046875,
"learning_rate": 0.000459120321790836,
"loss": 3.2272,
"step": 940
},
{
"epoch": 0.08224625805746749,
"grad_norm": 6.1282877922058105,
"learning_rate": 0.0004590766002098636,
"loss": 2.0861,
"step": 941
},
{
"epoch": 0.08233366109472304,
"grad_norm": 9.320550918579102,
"learning_rate": 0.0004590328786288912,
"loss": 2.0217,
"step": 942
},
{
"epoch": 0.08242106413197858,
"grad_norm": 3.1131937503814697,
"learning_rate": 0.00045898915704791887,
"loss": 1.4848,
"step": 943
},
{
"epoch": 0.08250846716923413,
"grad_norm": 51.67763137817383,
"learning_rate": 0.0004589454354669465,
"loss": 3.2458,
"step": 944
},
{
"epoch": 0.08259587020648967,
"grad_norm": 7.247336387634277,
"learning_rate": 0.0004589017138859741,
"loss": 2.6957,
"step": 945
},
{
"epoch": 0.08268327324374522,
"grad_norm": 3.2208497524261475,
"learning_rate": 0.00045885799230500175,
"loss": 1.9059,
"step": 946
},
{
"epoch": 0.08277067628100077,
"grad_norm": 78.9037094116211,
"learning_rate": 0.0004588142707240294,
"loss": 5.5682,
"step": 947
},
{
"epoch": 0.08285807931825631,
"grad_norm": 4.832467079162598,
"learning_rate": 0.00045877054914305704,
"loss": 1.6731,
"step": 948
},
{
"epoch": 0.08294548235551186,
"grad_norm": 7.1308674812316895,
"learning_rate": 0.0004587268275620847,
"loss": 2.2772,
"step": 949
},
{
"epoch": 0.0830328853927674,
"grad_norm": 4.155465126037598,
"learning_rate": 0.00045868310598111227,
"loss": 2.2794,
"step": 950
},
{
"epoch": 0.08312028843002295,
"grad_norm": 51.88750457763672,
"learning_rate": 0.0004586393844001399,
"loss": 4.0774,
"step": 951
},
{
"epoch": 0.0832076914672785,
"grad_norm": 2.969212532043457,
"learning_rate": 0.00045859566281916756,
"loss": 1.9225,
"step": 952
},
{
"epoch": 0.08329509450453403,
"grad_norm": 3.454350233078003,
"learning_rate": 0.00045855194123819515,
"loss": 1.6258,
"step": 953
},
{
"epoch": 0.08338249754178957,
"grad_norm": 46.18666458129883,
"learning_rate": 0.00045850821965722285,
"loss": 1.7273,
"step": 954
},
{
"epoch": 0.08346990057904512,
"grad_norm": 13.307456016540527,
"learning_rate": 0.00045846449807625044,
"loss": 2.1933,
"step": 955
},
{
"epoch": 0.08355730361630066,
"grad_norm": 8.283126831054688,
"learning_rate": 0.0004584207764952781,
"loss": 2.499,
"step": 956
},
{
"epoch": 0.08364470665355621,
"grad_norm": 6.291905403137207,
"learning_rate": 0.0004583770549143057,
"loss": 1.8399,
"step": 957
},
{
"epoch": 0.08373210969081175,
"grad_norm": 19.28121566772461,
"learning_rate": 0.0004583333333333333,
"loss": 2.6815,
"step": 958
},
{
"epoch": 0.0838195127280673,
"grad_norm": 9.661205291748047,
"learning_rate": 0.000458289611752361,
"loss": 2.3274,
"step": 959
},
{
"epoch": 0.08390691576532285,
"grad_norm": 15.012873649597168,
"learning_rate": 0.0004582458901713886,
"loss": 2.1736,
"step": 960
},
{
"epoch": 0.08399431880257839,
"grad_norm": 10.02956485748291,
"learning_rate": 0.00045820216859041626,
"loss": 2.4168,
"step": 961
},
{
"epoch": 0.08408172183983394,
"grad_norm": 2.234221935272217,
"learning_rate": 0.00045815844700944385,
"loss": 1.7808,
"step": 962
},
{
"epoch": 0.08416912487708948,
"grad_norm": 7.04872989654541,
"learning_rate": 0.0004581147254284715,
"loss": 2.1456,
"step": 963
},
{
"epoch": 0.08425652791434503,
"grad_norm": 3.498042106628418,
"learning_rate": 0.0004580710038474991,
"loss": 1.6212,
"step": 964
},
{
"epoch": 0.08434393095160057,
"grad_norm": 2.731658935546875,
"learning_rate": 0.0004580272822665268,
"loss": 1.6905,
"step": 965
},
{
"epoch": 0.08443133398885612,
"grad_norm": 4.867488384246826,
"learning_rate": 0.00045798356068555443,
"loss": 1.4945,
"step": 966
},
{
"epoch": 0.08451873702611165,
"grad_norm": 10.225361824035645,
"learning_rate": 0.000457939839104582,
"loss": 2.4163,
"step": 967
},
{
"epoch": 0.0846061400633672,
"grad_norm": 2.749767780303955,
"learning_rate": 0.00045789611752360967,
"loss": 1.49,
"step": 968
},
{
"epoch": 0.08469354310062274,
"grad_norm": 14.945262908935547,
"learning_rate": 0.00045785239594263726,
"loss": 2.4579,
"step": 969
},
{
"epoch": 0.08478094613787829,
"grad_norm": 4.0551228523254395,
"learning_rate": 0.0004578086743616649,
"loss": 1.6358,
"step": 970
},
{
"epoch": 0.08486834917513383,
"grad_norm": 2.8462789058685303,
"learning_rate": 0.0004577649527806926,
"loss": 1.6568,
"step": 971
},
{
"epoch": 0.08495575221238938,
"grad_norm": 3.82456111907959,
"learning_rate": 0.0004577212311997202,
"loss": 1.696,
"step": 972
},
{
"epoch": 0.08504315524964493,
"grad_norm": 2.9463558197021484,
"learning_rate": 0.00045767750961874784,
"loss": 1.8359,
"step": 973
},
{
"epoch": 0.08513055828690047,
"grad_norm": 2.811894416809082,
"learning_rate": 0.00045763378803777543,
"loss": 1.369,
"step": 974
},
{
"epoch": 0.08521796132415602,
"grad_norm": 2.092231512069702,
"learning_rate": 0.0004575900664568031,
"loss": 1.5433,
"step": 975
},
{
"epoch": 0.08530536436141156,
"grad_norm": 4.028072357177734,
"learning_rate": 0.0004575463448758307,
"loss": 2.4999,
"step": 976
},
{
"epoch": 0.08539276739866711,
"grad_norm": 10.593165397644043,
"learning_rate": 0.00045750262329485836,
"loss": 1.5753,
"step": 977
},
{
"epoch": 0.08548017043592265,
"grad_norm": 6.811407089233398,
"learning_rate": 0.000457458901713886,
"loss": 1.7268,
"step": 978
},
{
"epoch": 0.0855675734731782,
"grad_norm": 2.3520467281341553,
"learning_rate": 0.0004574151801329136,
"loss": 1.4044,
"step": 979
},
{
"epoch": 0.08565497651043373,
"grad_norm": 3.668078660964966,
"learning_rate": 0.00045737145855194125,
"loss": 1.718,
"step": 980
},
{
"epoch": 0.08574237954768928,
"grad_norm": 10.229111671447754,
"learning_rate": 0.00045732773697096884,
"loss": 1.7006,
"step": 981
},
{
"epoch": 0.08582978258494482,
"grad_norm": 5.428765773773193,
"learning_rate": 0.00045728401538999654,
"loss": 2.2021,
"step": 982
},
{
"epoch": 0.08591718562220037,
"grad_norm": 2.0686569213867188,
"learning_rate": 0.0004572402938090242,
"loss": 1.687,
"step": 983
},
{
"epoch": 0.08600458865945591,
"grad_norm": 2.371243715286255,
"learning_rate": 0.00045719657222805177,
"loss": 1.6734,
"step": 984
},
{
"epoch": 0.08609199169671146,
"grad_norm": 1.6429576873779297,
"learning_rate": 0.0004571528506470794,
"loss": 1.8382,
"step": 985
},
{
"epoch": 0.086179394733967,
"grad_norm": 2.408743381500244,
"learning_rate": 0.000457109129066107,
"loss": 1.45,
"step": 986
},
{
"epoch": 0.08626679777122255,
"grad_norm": 4.068368434906006,
"learning_rate": 0.0004570654074851347,
"loss": 1.7464,
"step": 987
},
{
"epoch": 0.0863542008084781,
"grad_norm": 1.9330801963806152,
"learning_rate": 0.0004570216859041623,
"loss": 1.6335,
"step": 988
},
{
"epoch": 0.08644160384573364,
"grad_norm": 4.200726509094238,
"learning_rate": 0.00045697796432318994,
"loss": 1.6781,
"step": 989
},
{
"epoch": 0.08652900688298919,
"grad_norm": 4.335032939910889,
"learning_rate": 0.0004569342427422176,
"loss": 1.7382,
"step": 990
},
{
"epoch": 0.08661640992024473,
"grad_norm": 2.2428669929504395,
"learning_rate": 0.0004568905211612452,
"loss": 1.4791,
"step": 991
},
{
"epoch": 0.08670381295750028,
"grad_norm": 2.2247121334075928,
"learning_rate": 0.0004568467995802728,
"loss": 1.8668,
"step": 992
},
{
"epoch": 0.08679121599475582,
"grad_norm": 2.013319492340088,
"learning_rate": 0.00045680307799930047,
"loss": 1.4925,
"step": 993
},
{
"epoch": 0.08687861903201136,
"grad_norm": 1.5773614645004272,
"learning_rate": 0.0004567593564183281,
"loss": 1.3334,
"step": 994
},
{
"epoch": 0.0869660220692669,
"grad_norm": 1.1663486957550049,
"learning_rate": 0.0004567156348373557,
"loss": 1.5022,
"step": 995
},
{
"epoch": 0.08705342510652245,
"grad_norm": 1.763238549232483,
"learning_rate": 0.00045667191325638335,
"loss": 1.5118,
"step": 996
},
{
"epoch": 0.08714082814377799,
"grad_norm": 1.4888843297958374,
"learning_rate": 0.000456628191675411,
"loss": 1.6713,
"step": 997
},
{
"epoch": 0.08722823118103354,
"grad_norm": 2.5363516807556152,
"learning_rate": 0.0004565844700944386,
"loss": 1.4999,
"step": 998
},
{
"epoch": 0.08731563421828908,
"grad_norm": 2.134773015975952,
"learning_rate": 0.0004565407485134663,
"loss": 1.5086,
"step": 999
},
{
"epoch": 0.08740303725554463,
"grad_norm": 15.75776481628418,
"learning_rate": 0.0004564970269324939,
"loss": 2.11,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 11441,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.60783873359872e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}