{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 1042,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009596928982725527,
"grad_norm": 0.06494140625,
"learning_rate": 4.9952015355086376e-05,
"loss": 1.3404,
"step": 1
},
{
"epoch": 0.0019193857965451055,
"grad_norm": 0.07568359375,
"learning_rate": 4.990403071017274e-05,
"loss": 1.4738,
"step": 2
},
{
"epoch": 0.0028790786948176585,
"grad_norm": 0.080078125,
"learning_rate": 4.985604606525912e-05,
"loss": 1.2836,
"step": 3
},
{
"epoch": 0.003838771593090211,
"grad_norm": 0.107421875,
"learning_rate": 4.980806142034549e-05,
"loss": 1.3877,
"step": 4
},
{
"epoch": 0.0047984644913627635,
"grad_norm": 0.130859375,
"learning_rate": 4.976007677543186e-05,
"loss": 1.4109,
"step": 5
},
{
"epoch": 0.005758157389635317,
"grad_norm": 0.1474609375,
"learning_rate": 4.9712092130518236e-05,
"loss": 1.3444,
"step": 6
},
{
"epoch": 0.0067178502879078695,
"grad_norm": 0.1728515625,
"learning_rate": 4.966410748560461e-05,
"loss": 1.3984,
"step": 7
},
{
"epoch": 0.007677543186180422,
"grad_norm": 0.1748046875,
"learning_rate": 4.961612284069098e-05,
"loss": 1.2668,
"step": 8
},
{
"epoch": 0.008637236084452975,
"grad_norm": 0.1953125,
"learning_rate": 4.9568138195777356e-05,
"loss": 1.3121,
"step": 9
},
{
"epoch": 0.009596928982725527,
"grad_norm": 0.1826171875,
"learning_rate": 4.952015355086372e-05,
"loss": 1.2488,
"step": 10
},
{
"epoch": 0.01055662188099808,
"grad_norm": 0.1953125,
"learning_rate": 4.94721689059501e-05,
"loss": 1.3309,
"step": 11
},
{
"epoch": 0.011516314779270634,
"grad_norm": 0.1875,
"learning_rate": 4.942418426103647e-05,
"loss": 1.3018,
"step": 12
},
{
"epoch": 0.012476007677543186,
"grad_norm": 0.189453125,
"learning_rate": 4.937619961612284e-05,
"loss": 1.407,
"step": 13
},
{
"epoch": 0.013435700575815739,
"grad_norm": 0.1484375,
"learning_rate": 4.9328214971209215e-05,
"loss": 1.2003,
"step": 14
},
{
"epoch": 0.014395393474088292,
"grad_norm": 0.1455078125,
"learning_rate": 4.928023032629559e-05,
"loss": 1.179,
"step": 15
},
{
"epoch": 0.015355086372360844,
"grad_norm": 0.13671875,
"learning_rate": 4.923224568138196e-05,
"loss": 1.127,
"step": 16
},
{
"epoch": 0.016314779270633396,
"grad_norm": 0.1279296875,
"learning_rate": 4.9184261036468335e-05,
"loss": 1.0714,
"step": 17
},
{
"epoch": 0.01727447216890595,
"grad_norm": 0.150390625,
"learning_rate": 4.91362763915547e-05,
"loss": 1.1533,
"step": 18
},
{
"epoch": 0.018234165067178502,
"grad_norm": 0.142578125,
"learning_rate": 4.908829174664108e-05,
"loss": 1.1575,
"step": 19
},
{
"epoch": 0.019193857965451054,
"grad_norm": 0.11865234375,
"learning_rate": 4.904030710172745e-05,
"loss": 1.0691,
"step": 20
},
{
"epoch": 0.02015355086372361,
"grad_norm": 0.1455078125,
"learning_rate": 4.899232245681382e-05,
"loss": 1.151,
"step": 21
},
{
"epoch": 0.02111324376199616,
"grad_norm": 0.146484375,
"learning_rate": 4.8944337811900195e-05,
"loss": 1.1148,
"step": 22
},
{
"epoch": 0.022072936660268713,
"grad_norm": 0.10791015625,
"learning_rate": 4.889635316698657e-05,
"loss": 1.0852,
"step": 23
},
{
"epoch": 0.023032629558541268,
"grad_norm": 0.11083984375,
"learning_rate": 4.884836852207294e-05,
"loss": 1.123,
"step": 24
},
{
"epoch": 0.02399232245681382,
"grad_norm": 0.11865234375,
"learning_rate": 4.880038387715931e-05,
"loss": 1.1012,
"step": 25
},
{
"epoch": 0.02495201535508637,
"grad_norm": 0.1083984375,
"learning_rate": 4.875239923224568e-05,
"loss": 1.1545,
"step": 26
},
{
"epoch": 0.025911708253358926,
"grad_norm": 0.11328125,
"learning_rate": 4.8704414587332055e-05,
"loss": 1.0973,
"step": 27
},
{
"epoch": 0.026871401151631478,
"grad_norm": 0.099609375,
"learning_rate": 4.865642994241843e-05,
"loss": 1.083,
"step": 28
},
{
"epoch": 0.02783109404990403,
"grad_norm": 0.09619140625,
"learning_rate": 4.86084452975048e-05,
"loss": 1.0999,
"step": 29
},
{
"epoch": 0.028790786948176585,
"grad_norm": 0.0927734375,
"learning_rate": 4.8560460652591175e-05,
"loss": 0.9142,
"step": 30
},
{
"epoch": 0.029750479846449136,
"grad_norm": 0.1044921875,
"learning_rate": 4.851247600767754e-05,
"loss": 1.0528,
"step": 31
},
{
"epoch": 0.030710172744721688,
"grad_norm": 0.1083984375,
"learning_rate": 4.846449136276392e-05,
"loss": 0.9556,
"step": 32
},
{
"epoch": 0.03166986564299424,
"grad_norm": 0.0908203125,
"learning_rate": 4.841650671785029e-05,
"loss": 0.9572,
"step": 33
},
{
"epoch": 0.03262955854126679,
"grad_norm": 0.09375,
"learning_rate": 4.836852207293666e-05,
"loss": 0.9687,
"step": 34
},
{
"epoch": 0.03358925143953935,
"grad_norm": 0.0986328125,
"learning_rate": 4.8320537428023035e-05,
"loss": 0.9755,
"step": 35
},
{
"epoch": 0.0345489443378119,
"grad_norm": 0.09521484375,
"learning_rate": 4.827255278310941e-05,
"loss": 0.991,
"step": 36
},
{
"epoch": 0.03550863723608445,
"grad_norm": 0.1064453125,
"learning_rate": 4.822456813819578e-05,
"loss": 1.0293,
"step": 37
},
{
"epoch": 0.036468330134357005,
"grad_norm": 0.08837890625,
"learning_rate": 4.817658349328215e-05,
"loss": 0.929,
"step": 38
},
{
"epoch": 0.03742802303262956,
"grad_norm": 0.09765625,
"learning_rate": 4.812859884836852e-05,
"loss": 0.9814,
"step": 39
},
{
"epoch": 0.03838771593090211,
"grad_norm": 0.0947265625,
"learning_rate": 4.80806142034549e-05,
"loss": 0.978,
"step": 40
},
{
"epoch": 0.03934740882917467,
"grad_norm": 0.091796875,
"learning_rate": 4.803262955854127e-05,
"loss": 1.0056,
"step": 41
},
{
"epoch": 0.04030710172744722,
"grad_norm": 0.09814453125,
"learning_rate": 4.798464491362764e-05,
"loss": 1.0102,
"step": 42
},
{
"epoch": 0.04126679462571977,
"grad_norm": 0.08837890625,
"learning_rate": 4.7936660268714015e-05,
"loss": 0.9454,
"step": 43
},
{
"epoch": 0.04222648752399232,
"grad_norm": 0.08837890625,
"learning_rate": 4.788867562380039e-05,
"loss": 0.9257,
"step": 44
},
{
"epoch": 0.04318618042226487,
"grad_norm": 0.080078125,
"learning_rate": 4.784069097888676e-05,
"loss": 0.943,
"step": 45
},
{
"epoch": 0.044145873320537425,
"grad_norm": 0.08642578125,
"learning_rate": 4.779270633397313e-05,
"loss": 0.9437,
"step": 46
},
{
"epoch": 0.045105566218809984,
"grad_norm": 0.08740234375,
"learning_rate": 4.77447216890595e-05,
"loss": 0.9291,
"step": 47
},
{
"epoch": 0.046065259117082535,
"grad_norm": 0.09130859375,
"learning_rate": 4.769673704414588e-05,
"loss": 0.8921,
"step": 48
},
{
"epoch": 0.04702495201535509,
"grad_norm": 0.119140625,
"learning_rate": 4.764875239923225e-05,
"loss": 0.9966,
"step": 49
},
{
"epoch": 0.04798464491362764,
"grad_norm": 0.09130859375,
"learning_rate": 4.760076775431862e-05,
"loss": 0.8992,
"step": 50
},
{
"epoch": 0.04894433781190019,
"grad_norm": 0.076171875,
"learning_rate": 4.7552783109404995e-05,
"loss": 0.8931,
"step": 51
},
{
"epoch": 0.04990403071017274,
"grad_norm": 0.091796875,
"learning_rate": 4.750479846449136e-05,
"loss": 0.8923,
"step": 52
},
{
"epoch": 0.0508637236084453,
"grad_norm": 0.08056640625,
"learning_rate": 4.745681381957774e-05,
"loss": 0.9124,
"step": 53
},
{
"epoch": 0.05182341650671785,
"grad_norm": 0.07958984375,
"learning_rate": 4.740882917466411e-05,
"loss": 0.8448,
"step": 54
},
{
"epoch": 0.052783109404990404,
"grad_norm": 0.08642578125,
"learning_rate": 4.736084452975048e-05,
"loss": 0.8974,
"step": 55
},
{
"epoch": 0.053742802303262956,
"grad_norm": 0.1015625,
"learning_rate": 4.7312859884836855e-05,
"loss": 1.0322,
"step": 56
},
{
"epoch": 0.05470249520153551,
"grad_norm": 0.0830078125,
"learning_rate": 4.726487523992323e-05,
"loss": 0.8886,
"step": 57
},
{
"epoch": 0.05566218809980806,
"grad_norm": 0.09033203125,
"learning_rate": 4.72168905950096e-05,
"loss": 0.8811,
"step": 58
},
{
"epoch": 0.05662188099808062,
"grad_norm": 0.09423828125,
"learning_rate": 4.716890595009597e-05,
"loss": 1.0552,
"step": 59
},
{
"epoch": 0.05758157389635317,
"grad_norm": 0.083984375,
"learning_rate": 4.712092130518234e-05,
"loss": 0.8823,
"step": 60
},
{
"epoch": 0.05854126679462572,
"grad_norm": 0.07958984375,
"learning_rate": 4.707293666026872e-05,
"loss": 0.9149,
"step": 61
},
{
"epoch": 0.05950095969289827,
"grad_norm": 0.08642578125,
"learning_rate": 4.702495201535509e-05,
"loss": 0.8472,
"step": 62
},
{
"epoch": 0.060460652591170824,
"grad_norm": 0.08349609375,
"learning_rate": 4.697696737044146e-05,
"loss": 0.9021,
"step": 63
},
{
"epoch": 0.061420345489443376,
"grad_norm": 0.08251953125,
"learning_rate": 4.6928982725527834e-05,
"loss": 0.9205,
"step": 64
},
{
"epoch": 0.06238003838771593,
"grad_norm": 0.09619140625,
"learning_rate": 4.688099808061421e-05,
"loss": 0.8649,
"step": 65
},
{
"epoch": 0.06333973128598848,
"grad_norm": 0.08447265625,
"learning_rate": 4.683301343570058e-05,
"loss": 0.8204,
"step": 66
},
{
"epoch": 0.06429942418426103,
"grad_norm": 0.0810546875,
"learning_rate": 4.678502879078695e-05,
"loss": 0.8614,
"step": 67
},
{
"epoch": 0.06525911708253358,
"grad_norm": 0.1201171875,
"learning_rate": 4.673704414587332e-05,
"loss": 1.0054,
"step": 68
},
{
"epoch": 0.06621880998080615,
"grad_norm": 0.1201171875,
"learning_rate": 4.6689059500959694e-05,
"loss": 0.965,
"step": 69
},
{
"epoch": 0.0671785028790787,
"grad_norm": 0.083984375,
"learning_rate": 4.664107485604607e-05,
"loss": 0.8924,
"step": 70
},
{
"epoch": 0.06813819577735125,
"grad_norm": 0.083984375,
"learning_rate": 4.659309021113244e-05,
"loss": 0.7907,
"step": 71
},
{
"epoch": 0.0690978886756238,
"grad_norm": 0.08251953125,
"learning_rate": 4.654510556621881e-05,
"loss": 0.841,
"step": 72
},
{
"epoch": 0.07005758157389635,
"grad_norm": 0.0849609375,
"learning_rate": 4.649712092130519e-05,
"loss": 0.8276,
"step": 73
},
{
"epoch": 0.0710172744721689,
"grad_norm": 0.138671875,
"learning_rate": 4.644913627639156e-05,
"loss": 0.8203,
"step": 74
},
{
"epoch": 0.07197696737044146,
"grad_norm": 0.08740234375,
"learning_rate": 4.640115163147793e-05,
"loss": 0.9008,
"step": 75
},
{
"epoch": 0.07293666026871401,
"grad_norm": 0.08544921875,
"learning_rate": 4.63531669865643e-05,
"loss": 0.8216,
"step": 76
},
{
"epoch": 0.07389635316698656,
"grad_norm": 0.09033203125,
"learning_rate": 4.6305182341650674e-05,
"loss": 0.825,
"step": 77
},
{
"epoch": 0.07485604606525911,
"grad_norm": 0.0888671875,
"learning_rate": 4.625719769673705e-05,
"loss": 0.7909,
"step": 78
},
{
"epoch": 0.07581573896353166,
"grad_norm": 0.0927734375,
"learning_rate": 4.620921305182342e-05,
"loss": 0.8424,
"step": 79
},
{
"epoch": 0.07677543186180422,
"grad_norm": 0.0927734375,
"learning_rate": 4.616122840690979e-05,
"loss": 0.8197,
"step": 80
},
{
"epoch": 0.07773512476007678,
"grad_norm": 0.10400390625,
"learning_rate": 4.611324376199616e-05,
"loss": 0.8275,
"step": 81
},
{
"epoch": 0.07869481765834933,
"grad_norm": 0.08837890625,
"learning_rate": 4.606525911708254e-05,
"loss": 0.8048,
"step": 82
},
{
"epoch": 0.07965451055662189,
"grad_norm": 0.09375,
"learning_rate": 4.601727447216891e-05,
"loss": 0.8103,
"step": 83
},
{
"epoch": 0.08061420345489444,
"grad_norm": 0.10693359375,
"learning_rate": 4.596928982725528e-05,
"loss": 0.7741,
"step": 84
},
{
"epoch": 0.08157389635316699,
"grad_norm": 0.09375,
"learning_rate": 4.5921305182341654e-05,
"loss": 0.8088,
"step": 85
},
{
"epoch": 0.08253358925143954,
"grad_norm": 0.0927734375,
"learning_rate": 4.587332053742803e-05,
"loss": 0.8514,
"step": 86
},
{
"epoch": 0.08349328214971209,
"grad_norm": 0.09375,
"learning_rate": 4.58253358925144e-05,
"loss": 0.8502,
"step": 87
},
{
"epoch": 0.08445297504798464,
"grad_norm": 0.09521484375,
"learning_rate": 4.577735124760077e-05,
"loss": 0.8378,
"step": 88
},
{
"epoch": 0.0854126679462572,
"grad_norm": 0.0908203125,
"learning_rate": 4.572936660268714e-05,
"loss": 0.8144,
"step": 89
},
{
"epoch": 0.08637236084452975,
"grad_norm": 0.1005859375,
"learning_rate": 4.5681381957773514e-05,
"loss": 0.8769,
"step": 90
},
{
"epoch": 0.0873320537428023,
"grad_norm": 0.08984375,
"learning_rate": 4.563339731285989e-05,
"loss": 0.7952,
"step": 91
},
{
"epoch": 0.08829174664107485,
"grad_norm": 0.09912109375,
"learning_rate": 4.558541266794626e-05,
"loss": 0.8802,
"step": 92
},
{
"epoch": 0.0892514395393474,
"grad_norm": 0.09228515625,
"learning_rate": 4.553742802303263e-05,
"loss": 0.8435,
"step": 93
},
{
"epoch": 0.09021113243761997,
"grad_norm": 0.0986328125,
"learning_rate": 4.548944337811901e-05,
"loss": 0.8348,
"step": 94
},
{
"epoch": 0.09117082533589252,
"grad_norm": 0.10107421875,
"learning_rate": 4.544145873320538e-05,
"loss": 0.8354,
"step": 95
},
{
"epoch": 0.09213051823416507,
"grad_norm": 0.0986328125,
"learning_rate": 4.539347408829175e-05,
"loss": 0.804,
"step": 96
},
{
"epoch": 0.09309021113243762,
"grad_norm": 0.09326171875,
"learning_rate": 4.534548944337812e-05,
"loss": 0.8968,
"step": 97
},
{
"epoch": 0.09404990403071017,
"grad_norm": 0.09375,
"learning_rate": 4.5297504798464494e-05,
"loss": 0.8628,
"step": 98
},
{
"epoch": 0.09500959692898273,
"grad_norm": 0.11865234375,
"learning_rate": 4.524952015355087e-05,
"loss": 0.8004,
"step": 99
},
{
"epoch": 0.09596928982725528,
"grad_norm": 0.09521484375,
"learning_rate": 4.520153550863724e-05,
"loss": 0.7847,
"step": 100
},
{
"epoch": 0.09692898272552783,
"grad_norm": 0.10400390625,
"learning_rate": 4.515355086372361e-05,
"loss": 0.7901,
"step": 101
},
{
"epoch": 0.09788867562380038,
"grad_norm": 0.0966796875,
"learning_rate": 4.510556621880998e-05,
"loss": 0.854,
"step": 102
},
{
"epoch": 0.09884836852207293,
"grad_norm": 0.12109375,
"learning_rate": 4.505758157389636e-05,
"loss": 0.7811,
"step": 103
},
{
"epoch": 0.09980806142034548,
"grad_norm": 0.10595703125,
"learning_rate": 4.500959692898273e-05,
"loss": 0.8829,
"step": 104
},
{
"epoch": 0.10076775431861804,
"grad_norm": 0.09619140625,
"learning_rate": 4.49616122840691e-05,
"loss": 0.8016,
"step": 105
},
{
"epoch": 0.1017274472168906,
"grad_norm": 0.09814453125,
"learning_rate": 4.491362763915547e-05,
"loss": 0.7894,
"step": 106
},
{
"epoch": 0.10268714011516315,
"grad_norm": 0.0986328125,
"learning_rate": 4.486564299424185e-05,
"loss": 0.7981,
"step": 107
},
{
"epoch": 0.1036468330134357,
"grad_norm": 0.1083984375,
"learning_rate": 4.481765834932822e-05,
"loss": 0.8251,
"step": 108
},
{
"epoch": 0.10460652591170826,
"grad_norm": 0.1669921875,
"learning_rate": 4.476967370441459e-05,
"loss": 0.9373,
"step": 109
},
{
"epoch": 0.10556621880998081,
"grad_norm": 0.10400390625,
"learning_rate": 4.472168905950096e-05,
"loss": 0.7491,
"step": 110
},
{
"epoch": 0.10652591170825336,
"grad_norm": 0.099609375,
"learning_rate": 4.467370441458733e-05,
"loss": 0.7829,
"step": 111
},
{
"epoch": 0.10748560460652591,
"grad_norm": 0.1044921875,
"learning_rate": 4.462571976967371e-05,
"loss": 0.8941,
"step": 112
},
{
"epoch": 0.10844529750479846,
"grad_norm": 0.10546875,
"learning_rate": 4.457773512476008e-05,
"loss": 0.8423,
"step": 113
},
{
"epoch": 0.10940499040307101,
"grad_norm": 0.1064453125,
"learning_rate": 4.4529750479846447e-05,
"loss": 0.7553,
"step": 114
},
{
"epoch": 0.11036468330134357,
"grad_norm": 0.1669921875,
"learning_rate": 4.448176583493283e-05,
"loss": 0.8334,
"step": 115
},
{
"epoch": 0.11132437619961612,
"grad_norm": 0.10595703125,
"learning_rate": 4.44337811900192e-05,
"loss": 0.7987,
"step": 116
},
{
"epoch": 0.11228406909788867,
"grad_norm": 0.11767578125,
"learning_rate": 4.4385796545105567e-05,
"loss": 0.8109,
"step": 117
},
{
"epoch": 0.11324376199616124,
"grad_norm": 0.10986328125,
"learning_rate": 4.433781190019194e-05,
"loss": 0.7601,
"step": 118
},
{
"epoch": 0.11420345489443379,
"grad_norm": 0.130859375,
"learning_rate": 4.428982725527831e-05,
"loss": 0.8111,
"step": 119
},
{
"epoch": 0.11516314779270634,
"grad_norm": 0.109375,
"learning_rate": 4.4241842610364687e-05,
"loss": 0.8731,
"step": 120
},
{
"epoch": 0.11612284069097889,
"grad_norm": 0.10595703125,
"learning_rate": 4.419385796545106e-05,
"loss": 0.7185,
"step": 121
},
{
"epoch": 0.11708253358925144,
"grad_norm": 0.10693359375,
"learning_rate": 4.4145873320537426e-05,
"loss": 0.7751,
"step": 122
},
{
"epoch": 0.118042226487524,
"grad_norm": 0.10595703125,
"learning_rate": 4.4097888675623806e-05,
"loss": 0.7975,
"step": 123
},
{
"epoch": 0.11900191938579655,
"grad_norm": 0.115234375,
"learning_rate": 4.404990403071017e-05,
"loss": 0.8107,
"step": 124
},
{
"epoch": 0.1199616122840691,
"grad_norm": 0.134765625,
"learning_rate": 4.4001919385796546e-05,
"loss": 0.8543,
"step": 125
},
{
"epoch": 0.12092130518234165,
"grad_norm": 0.1123046875,
"learning_rate": 4.395393474088292e-05,
"loss": 0.8406,
"step": 126
},
{
"epoch": 0.1218809980806142,
"grad_norm": 0.1025390625,
"learning_rate": 4.3905950095969286e-05,
"loss": 0.8258,
"step": 127
},
{
"epoch": 0.12284069097888675,
"grad_norm": 0.11279296875,
"learning_rate": 4.3857965451055666e-05,
"loss": 0.817,
"step": 128
},
{
"epoch": 0.1238003838771593,
"grad_norm": 0.1396484375,
"learning_rate": 4.380998080614204e-05,
"loss": 0.7262,
"step": 129
},
{
"epoch": 0.12476007677543186,
"grad_norm": 0.220703125,
"learning_rate": 4.3761996161228406e-05,
"loss": 0.76,
"step": 130
},
{
"epoch": 0.1257197696737044,
"grad_norm": 0.11083984375,
"learning_rate": 4.371401151631478e-05,
"loss": 0.7845,
"step": 131
},
{
"epoch": 0.12667946257197696,
"grad_norm": 0.1259765625,
"learning_rate": 4.366602687140115e-05,
"loss": 0.8156,
"step": 132
},
{
"epoch": 0.1276391554702495,
"grad_norm": 0.11474609375,
"learning_rate": 4.3618042226487526e-05,
"loss": 0.8062,
"step": 133
},
{
"epoch": 0.12859884836852206,
"grad_norm": 0.11279296875,
"learning_rate": 4.35700575815739e-05,
"loss": 0.7609,
"step": 134
},
{
"epoch": 0.1295585412667946,
"grad_norm": 0.11328125,
"learning_rate": 4.3522072936660266e-05,
"loss": 0.8428,
"step": 135
},
{
"epoch": 0.13051823416506717,
"grad_norm": 0.1083984375,
"learning_rate": 4.3474088291746646e-05,
"loss": 0.7752,
"step": 136
},
{
"epoch": 0.13147792706333974,
"grad_norm": 0.12158203125,
"learning_rate": 4.342610364683302e-05,
"loss": 0.7556,
"step": 137
},
{
"epoch": 0.1324376199616123,
"grad_norm": 0.11083984375,
"learning_rate": 4.3378119001919386e-05,
"loss": 0.8069,
"step": 138
},
{
"epoch": 0.13339731285988485,
"grad_norm": 0.130859375,
"learning_rate": 4.333013435700576e-05,
"loss": 0.8006,
"step": 139
},
{
"epoch": 0.1343570057581574,
"grad_norm": 0.11865234375,
"learning_rate": 4.328214971209213e-05,
"loss": 0.8035,
"step": 140
},
{
"epoch": 0.13531669865642995,
"grad_norm": 0.1181640625,
"learning_rate": 4.3234165067178506e-05,
"loss": 0.792,
"step": 141
},
{
"epoch": 0.1362763915547025,
"grad_norm": 0.12890625,
"learning_rate": 4.318618042226488e-05,
"loss": 0.741,
"step": 142
},
{
"epoch": 0.13723608445297505,
"grad_norm": 0.11962890625,
"learning_rate": 4.3138195777351246e-05,
"loss": 0.7722,
"step": 143
},
{
"epoch": 0.1381957773512476,
"grad_norm": 0.1240234375,
"learning_rate": 4.3090211132437626e-05,
"loss": 0.7554,
"step": 144
},
{
"epoch": 0.13915547024952016,
"grad_norm": 0.11279296875,
"learning_rate": 4.304222648752399e-05,
"loss": 0.7693,
"step": 145
},
{
"epoch": 0.1401151631477927,
"grad_norm": 0.1142578125,
"learning_rate": 4.2994241842610366e-05,
"loss": 0.7235,
"step": 146
},
{
"epoch": 0.14107485604606526,
"grad_norm": 0.1123046875,
"learning_rate": 4.294625719769674e-05,
"loss": 0.7351,
"step": 147
},
{
"epoch": 0.1420345489443378,
"grad_norm": 0.11376953125,
"learning_rate": 4.289827255278311e-05,
"loss": 0.743,
"step": 148
},
{
"epoch": 0.14299424184261036,
"grad_norm": 0.1640625,
"learning_rate": 4.2850287907869486e-05,
"loss": 0.7349,
"step": 149
},
{
"epoch": 0.14395393474088292,
"grad_norm": 0.11474609375,
"learning_rate": 4.280230326295586e-05,
"loss": 0.8031,
"step": 150
},
{
"epoch": 0.14491362763915547,
"grad_norm": 0.1220703125,
"learning_rate": 4.2754318618042226e-05,
"loss": 0.8145,
"step": 151
},
{
"epoch": 0.14587332053742802,
"grad_norm": 0.1201171875,
"learning_rate": 4.27063339731286e-05,
"loss": 0.7043,
"step": 152
},
{
"epoch": 0.14683301343570057,
"grad_norm": 0.12255859375,
"learning_rate": 4.265834932821497e-05,
"loss": 0.7998,
"step": 153
},
{
"epoch": 0.14779270633397312,
"grad_norm": 0.140625,
"learning_rate": 4.2610364683301346e-05,
"loss": 0.7619,
"step": 154
},
{
"epoch": 0.14875239923224567,
"grad_norm": 0.119140625,
"learning_rate": 4.256238003838772e-05,
"loss": 0.7511,
"step": 155
},
{
"epoch": 0.14971209213051823,
"grad_norm": 0.11767578125,
"learning_rate": 4.2514395393474086e-05,
"loss": 0.7492,
"step": 156
},
{
"epoch": 0.15067178502879078,
"grad_norm": 0.1298828125,
"learning_rate": 4.2466410748560466e-05,
"loss": 0.7702,
"step": 157
},
{
"epoch": 0.15163147792706333,
"grad_norm": 0.1162109375,
"learning_rate": 4.241842610364683e-05,
"loss": 0.7038,
"step": 158
},
{
"epoch": 0.15259117082533588,
"grad_norm": 0.1181640625,
"learning_rate": 4.2370441458733206e-05,
"loss": 0.8124,
"step": 159
},
{
"epoch": 0.15355086372360843,
"grad_norm": 0.12890625,
"learning_rate": 4.232245681381958e-05,
"loss": 0.7566,
"step": 160
},
{
"epoch": 0.15451055662188098,
"grad_norm": 0.11474609375,
"learning_rate": 4.227447216890595e-05,
"loss": 0.7076,
"step": 161
},
{
"epoch": 0.15547024952015356,
"grad_norm": 0.1220703125,
"learning_rate": 4.2226487523992326e-05,
"loss": 0.7593,
"step": 162
},
{
"epoch": 0.15642994241842612,
"grad_norm": 0.1591796875,
"learning_rate": 4.21785028790787e-05,
"loss": 0.7716,
"step": 163
},
{
"epoch": 0.15738963531669867,
"grad_norm": 0.1376953125,
"learning_rate": 4.2130518234165065e-05,
"loss": 0.8253,
"step": 164
},
{
"epoch": 0.15834932821497122,
"grad_norm": 0.119140625,
"learning_rate": 4.2082533589251446e-05,
"loss": 0.7395,
"step": 165
},
{
"epoch": 0.15930902111324377,
"grad_norm": 0.1142578125,
"learning_rate": 4.203454894433781e-05,
"loss": 0.7586,
"step": 166
},
{
"epoch": 0.16026871401151632,
"grad_norm": 0.12353515625,
"learning_rate": 4.1986564299424185e-05,
"loss": 0.7453,
"step": 167
},
{
"epoch": 0.16122840690978887,
"grad_norm": 0.123046875,
"learning_rate": 4.193857965451056e-05,
"loss": 0.7521,
"step": 168
},
{
"epoch": 0.16218809980806143,
"grad_norm": 0.12109375,
"learning_rate": 4.189059500959693e-05,
"loss": 0.7548,
"step": 169
},
{
"epoch": 0.16314779270633398,
"grad_norm": 0.1298828125,
"learning_rate": 4.1842610364683305e-05,
"loss": 0.7302,
"step": 170
},
{
"epoch": 0.16410748560460653,
"grad_norm": 0.12255859375,
"learning_rate": 4.179462571976968e-05,
"loss": 0.7698,
"step": 171
},
{
"epoch": 0.16506717850287908,
"grad_norm": 0.12255859375,
"learning_rate": 4.1746641074856045e-05,
"loss": 0.7054,
"step": 172
},
{
"epoch": 0.16602687140115163,
"grad_norm": 0.1240234375,
"learning_rate": 4.1698656429942425e-05,
"loss": 0.8326,
"step": 173
},
{
"epoch": 0.16698656429942418,
"grad_norm": 0.1240234375,
"learning_rate": 4.165067178502879e-05,
"loss": 0.7553,
"step": 174
},
{
"epoch": 0.16794625719769674,
"grad_norm": 0.1318359375,
"learning_rate": 4.1602687140115165e-05,
"loss": 0.7239,
"step": 175
},
{
"epoch": 0.1689059500959693,
"grad_norm": 0.1552734375,
"learning_rate": 4.155470249520154e-05,
"loss": 0.7776,
"step": 176
},
{
"epoch": 0.16986564299424184,
"grad_norm": 0.12158203125,
"learning_rate": 4.1506717850287905e-05,
"loss": 0.7774,
"step": 177
},
{
"epoch": 0.1708253358925144,
"grad_norm": 0.1201171875,
"learning_rate": 4.1458733205374285e-05,
"loss": 0.7945,
"step": 178
},
{
"epoch": 0.17178502879078694,
"grad_norm": 0.12451171875,
"learning_rate": 4.141074856046065e-05,
"loss": 0.8037,
"step": 179
},
{
"epoch": 0.1727447216890595,
"grad_norm": 0.12109375,
"learning_rate": 4.1362763915547025e-05,
"loss": 0.8721,
"step": 180
},
{
"epoch": 0.17370441458733205,
"grad_norm": 0.1328125,
"learning_rate": 4.13147792706334e-05,
"loss": 0.7226,
"step": 181
},
{
"epoch": 0.1746641074856046,
"grad_norm": 0.1337890625,
"learning_rate": 4.126679462571977e-05,
"loss": 0.7682,
"step": 182
},
{
"epoch": 0.17562380038387715,
"grad_norm": 0.119140625,
"learning_rate": 4.1218809980806145e-05,
"loss": 0.8327,
"step": 183
},
{
"epoch": 0.1765834932821497,
"grad_norm": 0.1259765625,
"learning_rate": 4.117082533589252e-05,
"loss": 0.8166,
"step": 184
},
{
"epoch": 0.17754318618042225,
"grad_norm": 0.11474609375,
"learning_rate": 4.1122840690978885e-05,
"loss": 0.8122,
"step": 185
},
{
"epoch": 0.1785028790786948,
"grad_norm": 0.1279296875,
"learning_rate": 4.1074856046065265e-05,
"loss": 0.7033,
"step": 186
},
{
"epoch": 0.17946257197696738,
"grad_norm": 0.1376953125,
"learning_rate": 4.102687140115163e-05,
"loss": 0.7136,
"step": 187
},
{
"epoch": 0.18042226487523993,
"grad_norm": 0.1259765625,
"learning_rate": 4.0978886756238005e-05,
"loss": 0.7105,
"step": 188
},
{
"epoch": 0.1813819577735125,
"grad_norm": 0.126953125,
"learning_rate": 4.093090211132438e-05,
"loss": 0.6925,
"step": 189
},
{
"epoch": 0.18234165067178504,
"grad_norm": 0.1376953125,
"learning_rate": 4.088291746641075e-05,
"loss": 0.7665,
"step": 190
},
{
"epoch": 0.1833013435700576,
"grad_norm": 0.1259765625,
"learning_rate": 4.0834932821497125e-05,
"loss": 0.7447,
"step": 191
},
{
"epoch": 0.18426103646833014,
"grad_norm": 0.1279296875,
"learning_rate": 4.078694817658349e-05,
"loss": 0.8261,
"step": 192
},
{
"epoch": 0.1852207293666027,
"grad_norm": 0.1396484375,
"learning_rate": 4.0738963531669865e-05,
"loss": 0.747,
"step": 193
},
{
"epoch": 0.18618042226487524,
"grad_norm": 0.1337890625,
"learning_rate": 4.0690978886756245e-05,
"loss": 0.8633,
"step": 194
},
{
"epoch": 0.1871401151631478,
"grad_norm": 0.1328125,
"learning_rate": 4.064299424184261e-05,
"loss": 0.75,
"step": 195
},
{
"epoch": 0.18809980806142035,
"grad_norm": 0.1298828125,
"learning_rate": 4.0595009596928985e-05,
"loss": 0.7286,
"step": 196
},
{
"epoch": 0.1890595009596929,
"grad_norm": 0.134765625,
"learning_rate": 4.054702495201536e-05,
"loss": 0.7934,
"step": 197
},
{
"epoch": 0.19001919385796545,
"grad_norm": 0.1767578125,
"learning_rate": 4.049904030710173e-05,
"loss": 0.7847,
"step": 198
},
{
"epoch": 0.190978886756238,
"grad_norm": 0.1376953125,
"learning_rate": 4.0451055662188105e-05,
"loss": 0.7316,
"step": 199
},
{
"epoch": 0.19193857965451055,
"grad_norm": 0.1396484375,
"learning_rate": 4.040307101727447e-05,
"loss": 0.6962,
"step": 200
},
{
"epoch": 0.1928982725527831,
"grad_norm": 0.1328125,
"learning_rate": 4.0355086372360845e-05,
"loss": 0.7105,
"step": 201
},
{
"epoch": 0.19385796545105566,
"grad_norm": 0.1279296875,
"learning_rate": 4.030710172744722e-05,
"loss": 0.7288,
"step": 202
},
{
"epoch": 0.1948176583493282,
"grad_norm": 0.1259765625,
"learning_rate": 4.025911708253359e-05,
"loss": 0.7149,
"step": 203
},
{
"epoch": 0.19577735124760076,
"grad_norm": 0.1328125,
"learning_rate": 4.0211132437619965e-05,
"loss": 0.7202,
"step": 204
},
{
"epoch": 0.1967370441458733,
"grad_norm": 0.1298828125,
"learning_rate": 4.016314779270634e-05,
"loss": 0.6825,
"step": 205
},
{
"epoch": 0.19769673704414586,
"grad_norm": 0.1328125,
"learning_rate": 4.0115163147792705e-05,
"loss": 0.722,
"step": 206
},
{
"epoch": 0.19865642994241842,
"grad_norm": 0.1376953125,
"learning_rate": 4.0067178502879085e-05,
"loss": 0.683,
"step": 207
},
{
"epoch": 0.19961612284069097,
"grad_norm": 0.1484375,
"learning_rate": 4.001919385796545e-05,
"loss": 0.7062,
"step": 208
},
{
"epoch": 0.20057581573896352,
"grad_norm": 0.1328125,
"learning_rate": 3.9971209213051825e-05,
"loss": 0.7325,
"step": 209
},
{
"epoch": 0.20153550863723607,
"grad_norm": 0.12353515625,
"learning_rate": 3.99232245681382e-05,
"loss": 0.8078,
"step": 210
},
{
"epoch": 0.20249520153550865,
"grad_norm": 0.1337890625,
"learning_rate": 3.987523992322457e-05,
"loss": 0.6984,
"step": 211
},
{
"epoch": 0.2034548944337812,
"grad_norm": 0.13671875,
"learning_rate": 3.9827255278310945e-05,
"loss": 0.8541,
"step": 212
},
{
"epoch": 0.20441458733205375,
"grad_norm": 0.154296875,
"learning_rate": 3.977927063339731e-05,
"loss": 0.7189,
"step": 213
},
{
"epoch": 0.2053742802303263,
"grad_norm": 0.146484375,
"learning_rate": 3.9731285988483684e-05,
"loss": 0.6897,
"step": 214
},
{
"epoch": 0.20633397312859886,
"grad_norm": 0.138671875,
"learning_rate": 3.9683301343570065e-05,
"loss": 0.7874,
"step": 215
},
{
"epoch": 0.2072936660268714,
"grad_norm": 0.1328125,
"learning_rate": 3.963531669865643e-05,
"loss": 0.9131,
"step": 216
},
{
"epoch": 0.20825335892514396,
"grad_norm": 0.126953125,
"learning_rate": 3.9587332053742804e-05,
"loss": 0.6953,
"step": 217
},
{
"epoch": 0.2092130518234165,
"grad_norm": 0.1435546875,
"learning_rate": 3.953934740882918e-05,
"loss": 0.7278,
"step": 218
},
{
"epoch": 0.21017274472168906,
"grad_norm": 0.203125,
"learning_rate": 3.949136276391555e-05,
"loss": 0.6261,
"step": 219
},
{
"epoch": 0.21113243761996162,
"grad_norm": 0.130859375,
"learning_rate": 3.9443378119001924e-05,
"loss": 0.7201,
"step": 220
},
{
"epoch": 0.21209213051823417,
"grad_norm": 0.134765625,
"learning_rate": 3.939539347408829e-05,
"loss": 0.7117,
"step": 221
},
{
"epoch": 0.21305182341650672,
"grad_norm": 0.1357421875,
"learning_rate": 3.9347408829174664e-05,
"loss": 0.795,
"step": 222
},
{
"epoch": 0.21401151631477927,
"grad_norm": 0.1435546875,
"learning_rate": 3.9299424184261044e-05,
"loss": 0.698,
"step": 223
},
{
"epoch": 0.21497120921305182,
"grad_norm": 0.140625,
"learning_rate": 3.925143953934741e-05,
"loss": 0.7643,
"step": 224
},
{
"epoch": 0.21593090211132437,
"grad_norm": 0.1630859375,
"learning_rate": 3.9203454894433784e-05,
"loss": 0.7322,
"step": 225
},
{
"epoch": 0.21689059500959693,
"grad_norm": 0.1337890625,
"learning_rate": 3.915547024952016e-05,
"loss": 0.7173,
"step": 226
},
{
"epoch": 0.21785028790786948,
"grad_norm": 0.150390625,
"learning_rate": 3.9107485604606524e-05,
"loss": 0.8099,
"step": 227
},
{
"epoch": 0.21880998080614203,
"grad_norm": 0.146484375,
"learning_rate": 3.9059500959692904e-05,
"loss": 0.7695,
"step": 228
},
{
"epoch": 0.21976967370441458,
"grad_norm": 0.1357421875,
"learning_rate": 3.901151631477927e-05,
"loss": 0.7597,
"step": 229
},
{
"epoch": 0.22072936660268713,
"grad_norm": 0.138671875,
"learning_rate": 3.8963531669865644e-05,
"loss": 0.6964,
"step": 230
},
{
"epoch": 0.22168905950095968,
"grad_norm": 0.1455078125,
"learning_rate": 3.891554702495202e-05,
"loss": 0.7146,
"step": 231
},
{
"epoch": 0.22264875239923224,
"grad_norm": 0.146484375,
"learning_rate": 3.886756238003839e-05,
"loss": 0.7936,
"step": 232
},
{
"epoch": 0.2236084452975048,
"grad_norm": 0.1416015625,
"learning_rate": 3.8819577735124764e-05,
"loss": 0.7637,
"step": 233
},
{
"epoch": 0.22456813819577734,
"grad_norm": 0.154296875,
"learning_rate": 3.877159309021113e-05,
"loss": 0.7628,
"step": 234
},
{
"epoch": 0.2255278310940499,
"grad_norm": 0.1455078125,
"learning_rate": 3.8723608445297504e-05,
"loss": 0.6949,
"step": 235
},
{
"epoch": 0.22648752399232247,
"grad_norm": 0.1494140625,
"learning_rate": 3.8675623800383884e-05,
"loss": 0.7897,
"step": 236
},
{
"epoch": 0.22744721689059502,
"grad_norm": 0.1396484375,
"learning_rate": 3.862763915547025e-05,
"loss": 0.7913,
"step": 237
},
{
"epoch": 0.22840690978886757,
"grad_norm": 0.138671875,
"learning_rate": 3.8579654510556624e-05,
"loss": 0.748,
"step": 238
},
{
"epoch": 0.22936660268714013,
"grad_norm": 0.1435546875,
"learning_rate": 3.8531669865643e-05,
"loss": 0.7325,
"step": 239
},
{
"epoch": 0.23032629558541268,
"grad_norm": 0.1513671875,
"learning_rate": 3.848368522072937e-05,
"loss": 0.7363,
"step": 240
},
{
"epoch": 0.23128598848368523,
"grad_norm": 0.146484375,
"learning_rate": 3.8435700575815744e-05,
"loss": 0.6879,
"step": 241
},
{
"epoch": 0.23224568138195778,
"grad_norm": 0.140625,
"learning_rate": 3.838771593090211e-05,
"loss": 0.6557,
"step": 242
},
{
"epoch": 0.23320537428023033,
"grad_norm": 0.1328125,
"learning_rate": 3.8339731285988484e-05,
"loss": 0.7875,
"step": 243
},
{
"epoch": 0.23416506717850288,
"grad_norm": 0.1474609375,
"learning_rate": 3.829174664107486e-05,
"loss": 0.6932,
"step": 244
},
{
"epoch": 0.23512476007677544,
"grad_norm": 0.1435546875,
"learning_rate": 3.824376199616123e-05,
"loss": 0.7278,
"step": 245
},
{
"epoch": 0.236084452975048,
"grad_norm": 0.142578125,
"learning_rate": 3.8195777351247604e-05,
"loss": 0.7493,
"step": 246
},
{
"epoch": 0.23704414587332054,
"grad_norm": 0.1474609375,
"learning_rate": 3.814779270633397e-05,
"loss": 0.6941,
"step": 247
},
{
"epoch": 0.2380038387715931,
"grad_norm": 0.146484375,
"learning_rate": 3.809980806142035e-05,
"loss": 0.7274,
"step": 248
},
{
"epoch": 0.23896353166986564,
"grad_norm": 0.146484375,
"learning_rate": 3.8051823416506724e-05,
"loss": 0.7197,
"step": 249
},
{
"epoch": 0.2399232245681382,
"grad_norm": 0.146484375,
"learning_rate": 3.800383877159309e-05,
"loss": 0.7782,
"step": 250
},
{
"epoch": 0.24088291746641075,
"grad_norm": 0.1435546875,
"learning_rate": 3.7955854126679464e-05,
"loss": 0.6752,
"step": 251
},
{
"epoch": 0.2418426103646833,
"grad_norm": 0.1416015625,
"learning_rate": 3.790786948176584e-05,
"loss": 0.7021,
"step": 252
},
{
"epoch": 0.24280230326295585,
"grad_norm": 0.1396484375,
"learning_rate": 3.785988483685221e-05,
"loss": 0.6937,
"step": 253
},
{
"epoch": 0.2437619961612284,
"grad_norm": 0.1435546875,
"learning_rate": 3.7811900191938584e-05,
"loss": 0.8093,
"step": 254
},
{
"epoch": 0.24472168905950095,
"grad_norm": 0.142578125,
"learning_rate": 3.776391554702495e-05,
"loss": 0.7016,
"step": 255
},
{
"epoch": 0.2456813819577735,
"grad_norm": 0.14453125,
"learning_rate": 3.7715930902111324e-05,
"loss": 0.6506,
"step": 256
},
{
"epoch": 0.24664107485604606,
"grad_norm": 0.1376953125,
"learning_rate": 3.7667946257197704e-05,
"loss": 0.7127,
"step": 257
},
{
"epoch": 0.2476007677543186,
"grad_norm": 0.1513671875,
"learning_rate": 3.761996161228407e-05,
"loss": 0.7209,
"step": 258
},
{
"epoch": 0.24856046065259116,
"grad_norm": 0.13671875,
"learning_rate": 3.7571976967370443e-05,
"loss": 0.7729,
"step": 259
},
{
"epoch": 0.2495201535508637,
"grad_norm": 0.15234375,
"learning_rate": 3.752399232245682e-05,
"loss": 0.7553,
"step": 260
},
{
"epoch": 0.2504798464491363,
"grad_norm": 0.1513671875,
"learning_rate": 3.747600767754319e-05,
"loss": 0.7793,
"step": 261
},
{
"epoch": 0.2514395393474088,
"grad_norm": 0.1435546875,
"learning_rate": 3.7428023032629563e-05,
"loss": 0.6608,
"step": 262
},
{
"epoch": 0.2523992322456814,
"grad_norm": 0.146484375,
"learning_rate": 3.738003838771593e-05,
"loss": 0.7076,
"step": 263
},
{
"epoch": 0.2533589251439539,
"grad_norm": 0.1787109375,
"learning_rate": 3.73320537428023e-05,
"loss": 0.772,
"step": 264
},
{
"epoch": 0.2543186180422265,
"grad_norm": 0.1474609375,
"learning_rate": 3.728406909788868e-05,
"loss": 0.6851,
"step": 265
},
{
"epoch": 0.255278310940499,
"grad_norm": 0.1650390625,
"learning_rate": 3.723608445297505e-05,
"loss": 0.6781,
"step": 266
},
{
"epoch": 0.2562380038387716,
"grad_norm": 0.14453125,
"learning_rate": 3.718809980806142e-05,
"loss": 0.6604,
"step": 267
},
{
"epoch": 0.2571976967370441,
"grad_norm": 0.1474609375,
"learning_rate": 3.714011516314779e-05,
"loss": 0.6484,
"step": 268
},
{
"epoch": 0.2581573896353167,
"grad_norm": 0.146484375,
"learning_rate": 3.709213051823417e-05,
"loss": 0.7034,
"step": 269
},
{
"epoch": 0.2591170825335892,
"grad_norm": 0.1650390625,
"learning_rate": 3.704414587332054e-05,
"loss": 0.7504,
"step": 270
},
{
"epoch": 0.2600767754318618,
"grad_norm": 0.15625,
"learning_rate": 3.699616122840691e-05,
"loss": 0.7586,
"step": 271
},
{
"epoch": 0.26103646833013433,
"grad_norm": 0.1435546875,
"learning_rate": 3.694817658349328e-05,
"loss": 0.741,
"step": 272
},
{
"epoch": 0.2619961612284069,
"grad_norm": 0.1513671875,
"learning_rate": 3.6900191938579657e-05,
"loss": 0.6887,
"step": 273
},
{
"epoch": 0.2629558541266795,
"grad_norm": 0.1552734375,
"learning_rate": 3.685220729366603e-05,
"loss": 0.6369,
"step": 274
},
{
"epoch": 0.263915547024952,
"grad_norm": 0.15625,
"learning_rate": 3.68042226487524e-05,
"loss": 0.7094,
"step": 275
},
{
"epoch": 0.2648752399232246,
"grad_norm": 0.1728515625,
"learning_rate": 3.675623800383877e-05,
"loss": 0.7673,
"step": 276
},
{
"epoch": 0.2658349328214971,
"grad_norm": 0.1474609375,
"learning_rate": 3.670825335892514e-05,
"loss": 0.7441,
"step": 277
},
{
"epoch": 0.2667946257197697,
"grad_norm": 0.150390625,
"learning_rate": 3.6660268714011516e-05,
"loss": 0.6746,
"step": 278
},
{
"epoch": 0.2677543186180422,
"grad_norm": 0.15234375,
"learning_rate": 3.661228406909789e-05,
"loss": 0.7387,
"step": 279
},
{
"epoch": 0.2687140115163148,
"grad_norm": 0.15625,
"learning_rate": 3.656429942418426e-05,
"loss": 0.6934,
"step": 280
},
{
"epoch": 0.2696737044145873,
"grad_norm": 0.1552734375,
"learning_rate": 3.651631477927063e-05,
"loss": 0.64,
"step": 281
},
{
"epoch": 0.2706333973128599,
"grad_norm": 0.171875,
"learning_rate": 3.646833013435701e-05,
"loss": 0.9662,
"step": 282
},
{
"epoch": 0.2715930902111324,
"grad_norm": 0.1455078125,
"learning_rate": 3.642034548944338e-05,
"loss": 0.7063,
"step": 283
},
{
"epoch": 0.272552783109405,
"grad_norm": 0.1513671875,
"learning_rate": 3.637236084452975e-05,
"loss": 0.7988,
"step": 284
},
{
"epoch": 0.27351247600767753,
"grad_norm": 0.1689453125,
"learning_rate": 3.632437619961612e-05,
"loss": 0.7435,
"step": 285
},
{
"epoch": 0.2744721689059501,
"grad_norm": 0.1513671875,
"learning_rate": 3.6276391554702496e-05,
"loss": 0.7163,
"step": 286
},
{
"epoch": 0.27543186180422263,
"grad_norm": 0.146484375,
"learning_rate": 3.622840690978887e-05,
"loss": 0.6569,
"step": 287
},
{
"epoch": 0.2763915547024952,
"grad_norm": 0.158203125,
"learning_rate": 3.618042226487524e-05,
"loss": 0.699,
"step": 288
},
{
"epoch": 0.27735124760076774,
"grad_norm": 0.154296875,
"learning_rate": 3.613243761996161e-05,
"loss": 0.7237,
"step": 289
},
{
"epoch": 0.2783109404990403,
"grad_norm": 0.1533203125,
"learning_rate": 3.608445297504799e-05,
"loss": 0.6712,
"step": 290
},
{
"epoch": 0.27927063339731284,
"grad_norm": 0.1484375,
"learning_rate": 3.603646833013436e-05,
"loss": 0.6856,
"step": 291
},
{
"epoch": 0.2802303262955854,
"grad_norm": 0.154296875,
"learning_rate": 3.598848368522073e-05,
"loss": 0.7346,
"step": 292
},
{
"epoch": 0.28119001919385794,
"grad_norm": 0.1630859375,
"learning_rate": 3.59404990403071e-05,
"loss": 0.6807,
"step": 293
},
{
"epoch": 0.2821497120921305,
"grad_norm": 0.1767578125,
"learning_rate": 3.5892514395393476e-05,
"loss": 0.6471,
"step": 294
},
{
"epoch": 0.28310940499040305,
"grad_norm": 0.1552734375,
"learning_rate": 3.584452975047985e-05,
"loss": 0.6942,
"step": 295
},
{
"epoch": 0.2840690978886756,
"grad_norm": 0.146484375,
"learning_rate": 3.579654510556622e-05,
"loss": 0.7439,
"step": 296
},
{
"epoch": 0.28502879078694815,
"grad_norm": 0.1650390625,
"learning_rate": 3.574856046065259e-05,
"loss": 0.7056,
"step": 297
},
{
"epoch": 0.28598848368522073,
"grad_norm": 0.1767578125,
"learning_rate": 3.570057581573896e-05,
"loss": 0.6869,
"step": 298
},
{
"epoch": 0.2869481765834933,
"grad_norm": 0.1474609375,
"learning_rate": 3.5652591170825336e-05,
"loss": 0.6763,
"step": 299
},
{
"epoch": 0.28790786948176583,
"grad_norm": 0.1474609375,
"learning_rate": 3.560460652591171e-05,
"loss": 0.7642,
"step": 300
},
{
"epoch": 0.2888675623800384,
"grad_norm": 0.154296875,
"learning_rate": 3.555662188099808e-05,
"loss": 0.7297,
"step": 301
},
{
"epoch": 0.28982725527831094,
"grad_norm": 0.15234375,
"learning_rate": 3.550863723608445e-05,
"loss": 0.6708,
"step": 302
},
{
"epoch": 0.2907869481765835,
"grad_norm": 0.1611328125,
"learning_rate": 3.546065259117083e-05,
"loss": 0.6544,
"step": 303
},
{
"epoch": 0.29174664107485604,
"grad_norm": 0.14453125,
"learning_rate": 3.54126679462572e-05,
"loss": 0.6653,
"step": 304
},
{
"epoch": 0.2927063339731286,
"grad_norm": 0.2265625,
"learning_rate": 3.536468330134357e-05,
"loss": 0.6902,
"step": 305
},
{
"epoch": 0.29366602687140114,
"grad_norm": 0.150390625,
"learning_rate": 3.531669865642994e-05,
"loss": 0.6777,
"step": 306
},
{
"epoch": 0.2946257197696737,
"grad_norm": 0.1552734375,
"learning_rate": 3.5268714011516316e-05,
"loss": 0.6928,
"step": 307
},
{
"epoch": 0.29558541266794625,
"grad_norm": 0.1953125,
"learning_rate": 3.522072936660269e-05,
"loss": 0.7738,
"step": 308
},
{
"epoch": 0.2965451055662188,
"grad_norm": 0.1533203125,
"learning_rate": 3.517274472168906e-05,
"loss": 0.6672,
"step": 309
},
{
"epoch": 0.29750479846449135,
"grad_norm": 0.1572265625,
"learning_rate": 3.512476007677543e-05,
"loss": 0.6526,
"step": 310
},
{
"epoch": 0.29846449136276393,
"grad_norm": 0.1552734375,
"learning_rate": 3.507677543186181e-05,
"loss": 0.7665,
"step": 311
},
{
"epoch": 0.29942418426103645,
"grad_norm": 0.154296875,
"learning_rate": 3.502879078694818e-05,
"loss": 0.7359,
"step": 312
},
{
"epoch": 0.30038387715930903,
"grad_norm": 0.1494140625,
"learning_rate": 3.498080614203455e-05,
"loss": 0.6592,
"step": 313
},
{
"epoch": 0.30134357005758156,
"grad_norm": 0.1474609375,
"learning_rate": 3.493282149712092e-05,
"loss": 0.752,
"step": 314
},
{
"epoch": 0.30230326295585414,
"grad_norm": 0.162109375,
"learning_rate": 3.4884836852207296e-05,
"loss": 0.6995,
"step": 315
},
{
"epoch": 0.30326295585412666,
"grad_norm": 0.15234375,
"learning_rate": 3.483685220729367e-05,
"loss": 0.7031,
"step": 316
},
{
"epoch": 0.30422264875239924,
"grad_norm": 0.1591796875,
"learning_rate": 3.478886756238004e-05,
"loss": 0.7046,
"step": 317
},
{
"epoch": 0.30518234165067176,
"grad_norm": 0.15234375,
"learning_rate": 3.474088291746641e-05,
"loss": 0.6898,
"step": 318
},
{
"epoch": 0.30614203454894434,
"grad_norm": 0.1591796875,
"learning_rate": 3.469289827255279e-05,
"loss": 0.6746,
"step": 319
},
{
"epoch": 0.30710172744721687,
"grad_norm": 0.142578125,
"learning_rate": 3.4644913627639155e-05,
"loss": 0.8099,
"step": 320
},
{
"epoch": 0.30806142034548945,
"grad_norm": 0.1611328125,
"learning_rate": 3.459692898272553e-05,
"loss": 0.6901,
"step": 321
},
{
"epoch": 0.30902111324376197,
"grad_norm": 0.220703125,
"learning_rate": 3.45489443378119e-05,
"loss": 0.6712,
"step": 322
},
{
"epoch": 0.30998080614203455,
"grad_norm": 0.1494140625,
"learning_rate": 3.450095969289827e-05,
"loss": 0.6829,
"step": 323
},
{
"epoch": 0.31094049904030713,
"grad_norm": 0.1572265625,
"learning_rate": 3.445297504798465e-05,
"loss": 0.6831,
"step": 324
},
{
"epoch": 0.31190019193857965,
"grad_norm": 0.16015625,
"learning_rate": 3.440499040307102e-05,
"loss": 0.6814,
"step": 325
},
{
"epoch": 0.31285988483685223,
"grad_norm": 0.1552734375,
"learning_rate": 3.435700575815739e-05,
"loss": 0.7156,
"step": 326
},
{
"epoch": 0.31381957773512476,
"grad_norm": 0.166015625,
"learning_rate": 3.430902111324376e-05,
"loss": 0.7648,
"step": 327
},
{
"epoch": 0.31477927063339733,
"grad_norm": 0.1455078125,
"learning_rate": 3.4261036468330135e-05,
"loss": 0.7224,
"step": 328
},
{
"epoch": 0.31573896353166986,
"grad_norm": 0.154296875,
"learning_rate": 3.421305182341651e-05,
"loss": 0.6945,
"step": 329
},
{
"epoch": 0.31669865642994244,
"grad_norm": 0.1533203125,
"learning_rate": 3.416506717850288e-05,
"loss": 0.7545,
"step": 330
},
{
"epoch": 0.31765834932821496,
"grad_norm": 0.1484375,
"learning_rate": 3.411708253358925e-05,
"loss": 0.7285,
"step": 331
},
{
"epoch": 0.31861804222648754,
"grad_norm": 0.1513671875,
"learning_rate": 3.406909788867563e-05,
"loss": 0.7181,
"step": 332
},
{
"epoch": 0.31957773512476007,
"grad_norm": 0.154296875,
"learning_rate": 3.4021113243761995e-05,
"loss": 0.7902,
"step": 333
},
{
"epoch": 0.32053742802303264,
"grad_norm": 0.1962890625,
"learning_rate": 3.397312859884837e-05,
"loss": 0.6529,
"step": 334
},
{
"epoch": 0.32149712092130517,
"grad_norm": 0.16015625,
"learning_rate": 3.392514395393474e-05,
"loss": 0.7397,
"step": 335
},
{
"epoch": 0.32245681381957775,
"grad_norm": 0.1552734375,
"learning_rate": 3.3877159309021115e-05,
"loss": 0.6852,
"step": 336
},
{
"epoch": 0.32341650671785027,
"grad_norm": 0.1640625,
"learning_rate": 3.382917466410749e-05,
"loss": 0.7339,
"step": 337
},
{
"epoch": 0.32437619961612285,
"grad_norm": 0.1591796875,
"learning_rate": 3.378119001919386e-05,
"loss": 0.6874,
"step": 338
},
{
"epoch": 0.3253358925143954,
"grad_norm": 0.1689453125,
"learning_rate": 3.373320537428023e-05,
"loss": 0.6913,
"step": 339
},
{
"epoch": 0.32629558541266795,
"grad_norm": 0.1689453125,
"learning_rate": 3.368522072936661e-05,
"loss": 0.6271,
"step": 340
},
{
"epoch": 0.3272552783109405,
"grad_norm": 0.166015625,
"learning_rate": 3.3637236084452975e-05,
"loss": 0.6797,
"step": 341
},
{
"epoch": 0.32821497120921306,
"grad_norm": 0.1494140625,
"learning_rate": 3.358925143953935e-05,
"loss": 0.6866,
"step": 342
},
{
"epoch": 0.3291746641074856,
"grad_norm": 0.1669921875,
"learning_rate": 3.354126679462572e-05,
"loss": 0.7679,
"step": 343
},
{
"epoch": 0.33013435700575816,
"grad_norm": 0.1474609375,
"learning_rate": 3.3493282149712095e-05,
"loss": 0.7125,
"step": 344
},
{
"epoch": 0.3310940499040307,
"grad_norm": 0.1650390625,
"learning_rate": 3.344529750479847e-05,
"loss": 0.7532,
"step": 345
},
{
"epoch": 0.33205374280230326,
"grad_norm": 0.1484375,
"learning_rate": 3.339731285988484e-05,
"loss": 0.6617,
"step": 346
},
{
"epoch": 0.3330134357005758,
"grad_norm": 0.1611328125,
"learning_rate": 3.334932821497121e-05,
"loss": 0.6764,
"step": 347
},
{
"epoch": 0.33397312859884837,
"grad_norm": 0.1591796875,
"learning_rate": 3.330134357005758e-05,
"loss": 0.6953,
"step": 348
},
{
"epoch": 0.33493282149712095,
"grad_norm": 0.1640625,
"learning_rate": 3.3253358925143955e-05,
"loss": 0.6981,
"step": 349
},
{
"epoch": 0.33589251439539347,
"grad_norm": 0.21875,
"learning_rate": 3.320537428023033e-05,
"loss": 0.7343,
"step": 350
},
{
"epoch": 0.33685220729366605,
"grad_norm": 0.166015625,
"learning_rate": 3.31573896353167e-05,
"loss": 0.7237,
"step": 351
},
{
"epoch": 0.3378119001919386,
"grad_norm": 0.166015625,
"learning_rate": 3.310940499040307e-05,
"loss": 0.7092,
"step": 352
},
{
"epoch": 0.33877159309021115,
"grad_norm": 0.173828125,
"learning_rate": 3.306142034548945e-05,
"loss": 0.6969,
"step": 353
},
{
"epoch": 0.3397312859884837,
"grad_norm": 0.166015625,
"learning_rate": 3.3013435700575815e-05,
"loss": 0.6786,
"step": 354
},
{
"epoch": 0.34069097888675626,
"grad_norm": 0.1572265625,
"learning_rate": 3.296545105566219e-05,
"loss": 0.7467,
"step": 355
},
{
"epoch": 0.3416506717850288,
"grad_norm": 0.169921875,
"learning_rate": 3.291746641074856e-05,
"loss": 0.6644,
"step": 356
},
{
"epoch": 0.34261036468330136,
"grad_norm": 0.1591796875,
"learning_rate": 3.2869481765834935e-05,
"loss": 0.6865,
"step": 357
},
{
"epoch": 0.3435700575815739,
"grad_norm": 0.1650390625,
"learning_rate": 3.282149712092131e-05,
"loss": 0.7558,
"step": 358
},
{
"epoch": 0.34452975047984646,
"grad_norm": 0.1552734375,
"learning_rate": 3.277351247600768e-05,
"loss": 0.6625,
"step": 359
},
{
"epoch": 0.345489443378119,
"grad_norm": 0.1630859375,
"learning_rate": 3.272552783109405e-05,
"loss": 0.7669,
"step": 360
},
{
"epoch": 0.34644913627639157,
"grad_norm": 0.1572265625,
"learning_rate": 3.267754318618043e-05,
"loss": 0.6358,
"step": 361
},
{
"epoch": 0.3474088291746641,
"grad_norm": 0.1689453125,
"learning_rate": 3.2629558541266795e-05,
"loss": 0.7083,
"step": 362
},
{
"epoch": 0.34836852207293667,
"grad_norm": 0.1708984375,
"learning_rate": 3.258157389635317e-05,
"loss": 0.7058,
"step": 363
},
{
"epoch": 0.3493282149712092,
"grad_norm": 0.1533203125,
"learning_rate": 3.253358925143954e-05,
"loss": 0.6559,
"step": 364
},
{
"epoch": 0.3502879078694818,
"grad_norm": 0.162109375,
"learning_rate": 3.2485604606525915e-05,
"loss": 0.7451,
"step": 365
},
{
"epoch": 0.3512476007677543,
"grad_norm": 0.169921875,
"learning_rate": 3.243761996161229e-05,
"loss": 0.626,
"step": 366
},
{
"epoch": 0.3522072936660269,
"grad_norm": 0.294921875,
"learning_rate": 3.2389635316698654e-05,
"loss": 0.8474,
"step": 367
},
{
"epoch": 0.3531669865642994,
"grad_norm": 0.1650390625,
"learning_rate": 3.234165067178503e-05,
"loss": 0.7122,
"step": 368
},
{
"epoch": 0.354126679462572,
"grad_norm": 0.15625,
"learning_rate": 3.229366602687141e-05,
"loss": 0.6793,
"step": 369
},
{
"epoch": 0.3550863723608445,
"grad_norm": 0.173828125,
"learning_rate": 3.2245681381957774e-05,
"loss": 0.676,
"step": 370
},
{
"epoch": 0.3560460652591171,
"grad_norm": 0.1630859375,
"learning_rate": 3.219769673704415e-05,
"loss": 0.6879,
"step": 371
},
{
"epoch": 0.3570057581573896,
"grad_norm": 0.177734375,
"learning_rate": 3.214971209213052e-05,
"loss": 0.794,
"step": 372
},
{
"epoch": 0.3579654510556622,
"grad_norm": 0.1572265625,
"learning_rate": 3.210172744721689e-05,
"loss": 0.6602,
"step": 373
},
{
"epoch": 0.35892514395393477,
"grad_norm": 0.15234375,
"learning_rate": 3.205374280230327e-05,
"loss": 0.6773,
"step": 374
},
{
"epoch": 0.3598848368522073,
"grad_norm": 0.1640625,
"learning_rate": 3.2005758157389634e-05,
"loss": 0.7207,
"step": 375
},
{
"epoch": 0.36084452975047987,
"grad_norm": 0.2001953125,
"learning_rate": 3.195777351247601e-05,
"loss": 0.7465,
"step": 376
},
{
"epoch": 0.3618042226487524,
"grad_norm": 0.154296875,
"learning_rate": 3.190978886756238e-05,
"loss": 0.63,
"step": 377
},
{
"epoch": 0.362763915547025,
"grad_norm": 0.16796875,
"learning_rate": 3.1861804222648754e-05,
"loss": 0.6951,
"step": 378
},
{
"epoch": 0.3637236084452975,
"grad_norm": 0.1953125,
"learning_rate": 3.181381957773513e-05,
"loss": 0.6942,
"step": 379
},
{
"epoch": 0.3646833013435701,
"grad_norm": 0.171875,
"learning_rate": 3.17658349328215e-05,
"loss": 0.7014,
"step": 380
},
{
"epoch": 0.3656429942418426,
"grad_norm": 0.1748046875,
"learning_rate": 3.171785028790787e-05,
"loss": 0.7354,
"step": 381
},
{
"epoch": 0.3666026871401152,
"grad_norm": 0.1728515625,
"learning_rate": 3.166986564299425e-05,
"loss": 0.6438,
"step": 382
},
{
"epoch": 0.3675623800383877,
"grad_norm": 0.169921875,
"learning_rate": 3.1621880998080614e-05,
"loss": 0.6779,
"step": 383
},
{
"epoch": 0.3685220729366603,
"grad_norm": 0.17578125,
"learning_rate": 3.157389635316699e-05,
"loss": 0.7349,
"step": 384
},
{
"epoch": 0.3694817658349328,
"grad_norm": 0.162109375,
"learning_rate": 3.152591170825336e-05,
"loss": 0.6854,
"step": 385
},
{
"epoch": 0.3704414587332054,
"grad_norm": 0.150390625,
"learning_rate": 3.1477927063339734e-05,
"loss": 0.6163,
"step": 386
},
{
"epoch": 0.3714011516314779,
"grad_norm": 0.185546875,
"learning_rate": 3.142994241842611e-05,
"loss": 0.7172,
"step": 387
},
{
"epoch": 0.3723608445297505,
"grad_norm": 0.166015625,
"learning_rate": 3.1381957773512474e-05,
"loss": 0.6633,
"step": 388
},
{
"epoch": 0.373320537428023,
"grad_norm": 0.1865234375,
"learning_rate": 3.133397312859885e-05,
"loss": 0.6357,
"step": 389
},
{
"epoch": 0.3742802303262956,
"grad_norm": 0.16015625,
"learning_rate": 3.128598848368523e-05,
"loss": 0.6354,
"step": 390
},
{
"epoch": 0.3752399232245681,
"grad_norm": 0.1591796875,
"learning_rate": 3.1238003838771594e-05,
"loss": 0.6848,
"step": 391
},
{
"epoch": 0.3761996161228407,
"grad_norm": 0.15234375,
"learning_rate": 3.119001919385797e-05,
"loss": 0.6624,
"step": 392
},
{
"epoch": 0.3771593090211132,
"grad_norm": 0.173828125,
"learning_rate": 3.114203454894434e-05,
"loss": 0.7023,
"step": 393
},
{
"epoch": 0.3781190019193858,
"grad_norm": 0.185546875,
"learning_rate": 3.1094049904030714e-05,
"loss": 0.6766,
"step": 394
},
{
"epoch": 0.3790786948176583,
"grad_norm": 0.1748046875,
"learning_rate": 3.104606525911709e-05,
"loss": 0.7,
"step": 395
},
{
"epoch": 0.3800383877159309,
"grad_norm": 0.1689453125,
"learning_rate": 3.0998080614203454e-05,
"loss": 0.6905,
"step": 396
},
{
"epoch": 0.3809980806142035,
"grad_norm": 0.16796875,
"learning_rate": 3.095009596928983e-05,
"loss": 0.6854,
"step": 397
},
{
"epoch": 0.381957773512476,
"grad_norm": 0.1767578125,
"learning_rate": 3.09021113243762e-05,
"loss": 0.6882,
"step": 398
},
{
"epoch": 0.3829174664107486,
"grad_norm": 0.162109375,
"learning_rate": 3.0854126679462574e-05,
"loss": 0.6951,
"step": 399
},
{
"epoch": 0.3838771593090211,
"grad_norm": 0.2314453125,
"learning_rate": 3.080614203454895e-05,
"loss": 0.6999,
"step": 400
},
{
"epoch": 0.3848368522072937,
"grad_norm": 0.1708984375,
"learning_rate": 3.0758157389635314e-05,
"loss": 0.6821,
"step": 401
},
{
"epoch": 0.3857965451055662,
"grad_norm": 0.2021484375,
"learning_rate": 3.071017274472169e-05,
"loss": 0.6445,
"step": 402
},
{
"epoch": 0.3867562380038388,
"grad_norm": 0.17578125,
"learning_rate": 3.066218809980807e-05,
"loss": 0.6687,
"step": 403
},
{
"epoch": 0.3877159309021113,
"grad_norm": 0.1708984375,
"learning_rate": 3.0614203454894434e-05,
"loss": 0.6485,
"step": 404
},
{
"epoch": 0.3886756238003839,
"grad_norm": 0.1640625,
"learning_rate": 3.056621880998081e-05,
"loss": 0.6737,
"step": 405
},
{
"epoch": 0.3896353166986564,
"grad_norm": 0.166015625,
"learning_rate": 3.051823416506718e-05,
"loss": 0.6588,
"step": 406
},
{
"epoch": 0.390595009596929,
"grad_norm": 0.1728515625,
"learning_rate": 3.0470249520153554e-05,
"loss": 0.6528,
"step": 407
},
{
"epoch": 0.3915547024952015,
"grad_norm": 0.1689453125,
"learning_rate": 3.0422264875239924e-05,
"loss": 0.6638,
"step": 408
},
{
"epoch": 0.3925143953934741,
"grad_norm": 0.1669921875,
"learning_rate": 3.0374280230326297e-05,
"loss": 0.6735,
"step": 409
},
{
"epoch": 0.3934740882917466,
"grad_norm": 0.1669921875,
"learning_rate": 3.0326295585412667e-05,
"loss": 0.7035,
"step": 410
},
{
"epoch": 0.3944337811900192,
"grad_norm": 0.16796875,
"learning_rate": 3.0278310940499044e-05,
"loss": 0.6549,
"step": 411
},
{
"epoch": 0.39539347408829173,
"grad_norm": 0.1806640625,
"learning_rate": 3.0230326295585414e-05,
"loss": 0.7402,
"step": 412
},
{
"epoch": 0.3963531669865643,
"grad_norm": 0.177734375,
"learning_rate": 3.0182341650671787e-05,
"loss": 0.7682,
"step": 413
},
{
"epoch": 0.39731285988483683,
"grad_norm": 0.1572265625,
"learning_rate": 3.0134357005758157e-05,
"loss": 0.6547,
"step": 414
},
{
"epoch": 0.3982725527831094,
"grad_norm": 0.1650390625,
"learning_rate": 3.0086372360844534e-05,
"loss": 0.6486,
"step": 415
},
{
"epoch": 0.39923224568138194,
"grad_norm": 0.1650390625,
"learning_rate": 3.0038387715930903e-05,
"loss": 0.6471,
"step": 416
},
{
"epoch": 0.4001919385796545,
"grad_norm": 0.1748046875,
"learning_rate": 2.9990403071017277e-05,
"loss": 0.6749,
"step": 417
},
{
"epoch": 0.40115163147792704,
"grad_norm": 0.162109375,
"learning_rate": 2.9942418426103647e-05,
"loss": 0.6779,
"step": 418
},
{
"epoch": 0.4021113243761996,
"grad_norm": 0.1904296875,
"learning_rate": 2.9894433781190023e-05,
"loss": 0.6719,
"step": 419
},
{
"epoch": 0.40307101727447214,
"grad_norm": 0.171875,
"learning_rate": 2.9846449136276393e-05,
"loss": 0.6665,
"step": 420
},
{
"epoch": 0.4040307101727447,
"grad_norm": 0.171875,
"learning_rate": 2.9798464491362767e-05,
"loss": 0.6897,
"step": 421
},
{
"epoch": 0.4049904030710173,
"grad_norm": 0.185546875,
"learning_rate": 2.9750479846449137e-05,
"loss": 0.6689,
"step": 422
},
{
"epoch": 0.4059500959692898,
"grad_norm": 0.18359375,
"learning_rate": 2.9702495201535507e-05,
"loss": 0.6594,
"step": 423
},
{
"epoch": 0.4069097888675624,
"grad_norm": 0.1728515625,
"learning_rate": 2.9654510556621883e-05,
"loss": 0.6481,
"step": 424
},
{
"epoch": 0.40786948176583493,
"grad_norm": 0.1640625,
"learning_rate": 2.9606525911708257e-05,
"loss": 0.5975,
"step": 425
},
{
"epoch": 0.4088291746641075,
"grad_norm": 0.1689453125,
"learning_rate": 2.9558541266794627e-05,
"loss": 0.6538,
"step": 426
},
{
"epoch": 0.40978886756238003,
"grad_norm": 0.162109375,
"learning_rate": 2.9510556621880997e-05,
"loss": 0.7139,
"step": 427
},
{
"epoch": 0.4107485604606526,
"grad_norm": 0.1650390625,
"learning_rate": 2.9462571976967373e-05,
"loss": 0.6408,
"step": 428
},
{
"epoch": 0.41170825335892514,
"grad_norm": 0.2001953125,
"learning_rate": 2.9414587332053743e-05,
"loss": 0.7105,
"step": 429
},
{
"epoch": 0.4126679462571977,
"grad_norm": 0.2197265625,
"learning_rate": 2.9366602687140116e-05,
"loss": 0.7465,
"step": 430
},
{
"epoch": 0.41362763915547024,
"grad_norm": 0.1689453125,
"learning_rate": 2.9318618042226486e-05,
"loss": 0.6594,
"step": 431
},
{
"epoch": 0.4145873320537428,
"grad_norm": 0.1708984375,
"learning_rate": 2.9270633397312863e-05,
"loss": 0.6646,
"step": 432
},
{
"epoch": 0.41554702495201534,
"grad_norm": 0.1806640625,
"learning_rate": 2.9222648752399233e-05,
"loss": 0.6537,
"step": 433
},
{
"epoch": 0.4165067178502879,
"grad_norm": 0.17578125,
"learning_rate": 2.9174664107485606e-05,
"loss": 0.7188,
"step": 434
},
{
"epoch": 0.41746641074856045,
"grad_norm": 0.1689453125,
"learning_rate": 2.9126679462571976e-05,
"loss": 0.7669,
"step": 435
},
{
"epoch": 0.418426103646833,
"grad_norm": 0.169921875,
"learning_rate": 2.9078694817658353e-05,
"loss": 0.6955,
"step": 436
},
{
"epoch": 0.41938579654510555,
"grad_norm": 0.166015625,
"learning_rate": 2.9030710172744723e-05,
"loss": 0.668,
"step": 437
},
{
"epoch": 0.42034548944337813,
"grad_norm": 0.166015625,
"learning_rate": 2.8982725527831096e-05,
"loss": 0.675,
"step": 438
},
{
"epoch": 0.42130518234165065,
"grad_norm": 0.1630859375,
"learning_rate": 2.8934740882917466e-05,
"loss": 0.7466,
"step": 439
},
{
"epoch": 0.42226487523992323,
"grad_norm": 0.1796875,
"learning_rate": 2.8886756238003843e-05,
"loss": 0.6396,
"step": 440
},
{
"epoch": 0.42322456813819576,
"grad_norm": 0.169921875,
"learning_rate": 2.8838771593090213e-05,
"loss": 0.675,
"step": 441
},
{
"epoch": 0.42418426103646834,
"grad_norm": 0.1826171875,
"learning_rate": 2.8790786948176586e-05,
"loss": 0.7135,
"step": 442
},
{
"epoch": 0.42514395393474086,
"grad_norm": 0.1728515625,
"learning_rate": 2.8742802303262956e-05,
"loss": 0.6642,
"step": 443
},
{
"epoch": 0.42610364683301344,
"grad_norm": 0.16796875,
"learning_rate": 2.8694817658349333e-05,
"loss": 0.6373,
"step": 444
},
{
"epoch": 0.42706333973128596,
"grad_norm": 0.1787109375,
"learning_rate": 2.8646833013435703e-05,
"loss": 0.6906,
"step": 445
},
{
"epoch": 0.42802303262955854,
"grad_norm": 0.1669921875,
"learning_rate": 2.8598848368522073e-05,
"loss": 0.6607,
"step": 446
},
{
"epoch": 0.4289827255278311,
"grad_norm": 0.171875,
"learning_rate": 2.8550863723608446e-05,
"loss": 0.7029,
"step": 447
},
{
"epoch": 0.42994241842610365,
"grad_norm": 0.1669921875,
"learning_rate": 2.8502879078694816e-05,
"loss": 0.6377,
"step": 448
},
{
"epoch": 0.4309021113243762,
"grad_norm": 0.1787109375,
"learning_rate": 2.8454894433781193e-05,
"loss": 0.7336,
"step": 449
},
{
"epoch": 0.43186180422264875,
"grad_norm": 0.1728515625,
"learning_rate": 2.8406909788867563e-05,
"loss": 0.6323,
"step": 450
},
{
"epoch": 0.43282149712092133,
"grad_norm": 0.1708984375,
"learning_rate": 2.8358925143953936e-05,
"loss": 0.6363,
"step": 451
},
{
"epoch": 0.43378119001919385,
"grad_norm": 0.177734375,
"learning_rate": 2.8310940499040306e-05,
"loss": 0.6749,
"step": 452
},
{
"epoch": 0.43474088291746643,
"grad_norm": 0.1748046875,
"learning_rate": 2.8262955854126683e-05,
"loss": 0.7227,
"step": 453
},
{
"epoch": 0.43570057581573896,
"grad_norm": 0.1728515625,
"learning_rate": 2.8214971209213053e-05,
"loss": 0.7482,
"step": 454
},
{
"epoch": 0.43666026871401153,
"grad_norm": 0.169921875,
"learning_rate": 2.8166986564299426e-05,
"loss": 0.6636,
"step": 455
},
{
"epoch": 0.43761996161228406,
"grad_norm": 0.1748046875,
"learning_rate": 2.8119001919385796e-05,
"loss": 0.6363,
"step": 456
},
{
"epoch": 0.43857965451055664,
"grad_norm": 0.1787109375,
"learning_rate": 2.8071017274472173e-05,
"loss": 0.6765,
"step": 457
},
{
"epoch": 0.43953934740882916,
"grad_norm": 0.166015625,
"learning_rate": 2.8023032629558543e-05,
"loss": 0.7481,
"step": 458
},
{
"epoch": 0.44049904030710174,
"grad_norm": 0.177734375,
"learning_rate": 2.7975047984644916e-05,
"loss": 0.6333,
"step": 459
},
{
"epoch": 0.44145873320537427,
"grad_norm": 0.1708984375,
"learning_rate": 2.7927063339731286e-05,
"loss": 0.6644,
"step": 460
},
{
"epoch": 0.44241842610364684,
"grad_norm": 0.1787109375,
"learning_rate": 2.7879078694817663e-05,
"loss": 0.6443,
"step": 461
},
{
"epoch": 0.44337811900191937,
"grad_norm": 0.1728515625,
"learning_rate": 2.7831094049904032e-05,
"loss": 0.7137,
"step": 462
},
{
"epoch": 0.44433781190019195,
"grad_norm": 0.1748046875,
"learning_rate": 2.7783109404990402e-05,
"loss": 0.653,
"step": 463
},
{
"epoch": 0.44529750479846447,
"grad_norm": 0.1865234375,
"learning_rate": 2.7735124760076776e-05,
"loss": 0.6663,
"step": 464
},
{
"epoch": 0.44625719769673705,
"grad_norm": 0.173828125,
"learning_rate": 2.7687140115163152e-05,
"loss": 0.7799,
"step": 465
},
{
"epoch": 0.4472168905950096,
"grad_norm": 0.1708984375,
"learning_rate": 2.7639155470249522e-05,
"loss": 0.7356,
"step": 466
},
{
"epoch": 0.44817658349328215,
"grad_norm": 0.1630859375,
"learning_rate": 2.7591170825335892e-05,
"loss": 0.622,
"step": 467
},
{
"epoch": 0.4491362763915547,
"grad_norm": 0.16796875,
"learning_rate": 2.7543186180422266e-05,
"loss": 0.7582,
"step": 468
},
{
"epoch": 0.45009596928982726,
"grad_norm": 0.173828125,
"learning_rate": 2.7495201535508642e-05,
"loss": 0.6162,
"step": 469
},
{
"epoch": 0.4510556621880998,
"grad_norm": 0.1748046875,
"learning_rate": 2.7447216890595012e-05,
"loss": 0.6388,
"step": 470
},
{
"epoch": 0.45201535508637236,
"grad_norm": 0.16796875,
"learning_rate": 2.7399232245681382e-05,
"loss": 0.6759,
"step": 471
},
{
"epoch": 0.45297504798464494,
"grad_norm": 0.1748046875,
"learning_rate": 2.7351247600767756e-05,
"loss": 0.6981,
"step": 472
},
{
"epoch": 0.45393474088291746,
"grad_norm": 0.1611328125,
"learning_rate": 2.7303262955854126e-05,
"loss": 0.6393,
"step": 473
},
{
"epoch": 0.45489443378119004,
"grad_norm": 0.1708984375,
"learning_rate": 2.7255278310940502e-05,
"loss": 0.6688,
"step": 474
},
{
"epoch": 0.45585412667946257,
"grad_norm": 0.171875,
"learning_rate": 2.7207293666026872e-05,
"loss": 0.6113,
"step": 475
},
{
"epoch": 0.45681381957773515,
"grad_norm": 0.1591796875,
"learning_rate": 2.7159309021113246e-05,
"loss": 0.6047,
"step": 476
},
{
"epoch": 0.45777351247600767,
"grad_norm": 0.19921875,
"learning_rate": 2.7111324376199615e-05,
"loss": 0.6615,
"step": 477
},
{
"epoch": 0.45873320537428025,
"grad_norm": 0.1865234375,
"learning_rate": 2.7063339731285992e-05,
"loss": 0.6283,
"step": 478
},
{
"epoch": 0.4596928982725528,
"grad_norm": 0.1865234375,
"learning_rate": 2.7015355086372362e-05,
"loss": 0.7262,
"step": 479
},
{
"epoch": 0.46065259117082535,
"grad_norm": 0.181640625,
"learning_rate": 2.6967370441458732e-05,
"loss": 0.6462,
"step": 480
},
{
"epoch": 0.4616122840690979,
"grad_norm": 0.1826171875,
"learning_rate": 2.6919385796545105e-05,
"loss": 0.6287,
"step": 481
},
{
"epoch": 0.46257197696737046,
"grad_norm": 0.1796875,
"learning_rate": 2.6871401151631482e-05,
"loss": 0.6341,
"step": 482
},
{
"epoch": 0.463531669865643,
"grad_norm": 0.16796875,
"learning_rate": 2.6823416506717852e-05,
"loss": 0.6403,
"step": 483
},
{
"epoch": 0.46449136276391556,
"grad_norm": 0.1796875,
"learning_rate": 2.6775431861804222e-05,
"loss": 0.6479,
"step": 484
},
{
"epoch": 0.4654510556621881,
"grad_norm": 0.1728515625,
"learning_rate": 2.6727447216890595e-05,
"loss": 0.6622,
"step": 485
},
{
"epoch": 0.46641074856046066,
"grad_norm": 0.171875,
"learning_rate": 2.6679462571976972e-05,
"loss": 0.7203,
"step": 486
},
{
"epoch": 0.4673704414587332,
"grad_norm": 0.267578125,
"learning_rate": 2.6631477927063342e-05,
"loss": 0.6908,
"step": 487
},
{
"epoch": 0.46833013435700577,
"grad_norm": 0.177734375,
"learning_rate": 2.6583493282149712e-05,
"loss": 0.6072,
"step": 488
},
{
"epoch": 0.4692898272552783,
"grad_norm": 0.1767578125,
"learning_rate": 2.6535508637236085e-05,
"loss": 0.6831,
"step": 489
},
{
"epoch": 0.47024952015355087,
"grad_norm": 0.177734375,
"learning_rate": 2.6487523992322462e-05,
"loss": 0.6283,
"step": 490
},
{
"epoch": 0.4712092130518234,
"grad_norm": 0.173828125,
"learning_rate": 2.6439539347408832e-05,
"loss": 0.7183,
"step": 491
},
{
"epoch": 0.472168905950096,
"grad_norm": 0.18359375,
"learning_rate": 2.6391554702495202e-05,
"loss": 0.6332,
"step": 492
},
{
"epoch": 0.4731285988483685,
"grad_norm": 0.181640625,
"learning_rate": 2.6343570057581575e-05,
"loss": 0.629,
"step": 493
},
{
"epoch": 0.4740882917466411,
"grad_norm": 0.17578125,
"learning_rate": 2.6295585412667952e-05,
"loss": 0.6857,
"step": 494
},
{
"epoch": 0.4750479846449136,
"grad_norm": 0.169921875,
"learning_rate": 2.6247600767754322e-05,
"loss": 0.6298,
"step": 495
},
{
"epoch": 0.4760076775431862,
"grad_norm": 0.1767578125,
"learning_rate": 2.6199616122840692e-05,
"loss": 0.6572,
"step": 496
},
{
"epoch": 0.47696737044145876,
"grad_norm": 0.1806640625,
"learning_rate": 2.615163147792706e-05,
"loss": 0.5846,
"step": 497
},
{
"epoch": 0.4779270633397313,
"grad_norm": 0.2138671875,
"learning_rate": 2.6103646833013435e-05,
"loss": 0.7872,
"step": 498
},
{
"epoch": 0.47888675623800386,
"grad_norm": 0.181640625,
"learning_rate": 2.6055662188099812e-05,
"loss": 0.6542,
"step": 499
},
{
"epoch": 0.4798464491362764,
"grad_norm": 0.173828125,
"learning_rate": 2.600767754318618e-05,
"loss": 0.6299,
"step": 500
},
{
"epoch": 0.48080614203454897,
"grad_norm": 0.2392578125,
"learning_rate": 2.595969289827255e-05,
"loss": 0.6197,
"step": 501
},
{
"epoch": 0.4817658349328215,
"grad_norm": 0.19921875,
"learning_rate": 2.5911708253358925e-05,
"loss": 0.6573,
"step": 502
},
{
"epoch": 0.48272552783109407,
"grad_norm": 0.1796875,
"learning_rate": 2.58637236084453e-05,
"loss": 0.6847,
"step": 503
},
{
"epoch": 0.4836852207293666,
"grad_norm": 0.1689453125,
"learning_rate": 2.581573896353167e-05,
"loss": 0.6969,
"step": 504
},
{
"epoch": 0.4846449136276392,
"grad_norm": 0.1748046875,
"learning_rate": 2.576775431861804e-05,
"loss": 0.6578,
"step": 505
},
{
"epoch": 0.4856046065259117,
"grad_norm": 0.1806640625,
"learning_rate": 2.5719769673704415e-05,
"loss": 0.712,
"step": 506
},
{
"epoch": 0.4865642994241843,
"grad_norm": 0.18359375,
"learning_rate": 2.567178502879079e-05,
"loss": 0.6483,
"step": 507
},
{
"epoch": 0.4875239923224568,
"grad_norm": 0.1728515625,
"learning_rate": 2.562380038387716e-05,
"loss": 0.751,
"step": 508
},
{
"epoch": 0.4884836852207294,
"grad_norm": 0.17578125,
"learning_rate": 2.557581573896353e-05,
"loss": 0.7356,
"step": 509
},
{
"epoch": 0.4894433781190019,
"grad_norm": 0.1826171875,
"learning_rate": 2.5527831094049905e-05,
"loss": 0.6905,
"step": 510
},
{
"epoch": 0.4904030710172745,
"grad_norm": 0.177734375,
"learning_rate": 2.547984644913628e-05,
"loss": 0.6421,
"step": 511
},
{
"epoch": 0.491362763915547,
"grad_norm": 0.23046875,
"learning_rate": 2.543186180422265e-05,
"loss": 0.7447,
"step": 512
},
{
"epoch": 0.4923224568138196,
"grad_norm": 0.177734375,
"learning_rate": 2.538387715930902e-05,
"loss": 0.63,
"step": 513
},
{
"epoch": 0.4932821497120921,
"grad_norm": 0.1806640625,
"learning_rate": 2.533589251439539e-05,
"loss": 0.6252,
"step": 514
},
{
"epoch": 0.4942418426103647,
"grad_norm": 0.1787109375,
"learning_rate": 2.5287907869481768e-05,
"loss": 0.6607,
"step": 515
},
{
"epoch": 0.4952015355086372,
"grad_norm": 0.1796875,
"learning_rate": 2.523992322456814e-05,
"loss": 0.6498,
"step": 516
},
{
"epoch": 0.4961612284069098,
"grad_norm": 0.171875,
"learning_rate": 2.519193857965451e-05,
"loss": 0.7057,
"step": 517
},
{
"epoch": 0.4971209213051823,
"grad_norm": 0.173828125,
"learning_rate": 2.514395393474088e-05,
"loss": 0.6589,
"step": 518
},
{
"epoch": 0.4980806142034549,
"grad_norm": 0.1796875,
"learning_rate": 2.5095969289827258e-05,
"loss": 0.6719,
"step": 519
},
{
"epoch": 0.4990403071017274,
"grad_norm": 0.181640625,
"learning_rate": 2.504798464491363e-05,
"loss": 0.7305,
"step": 520
},
{
"epoch": 0.5,
"grad_norm": 0.1826171875,
"learning_rate": 2.5e-05,
"loss": 0.6557,
"step": 521
},
{
"epoch": 0.5009596928982726,
"grad_norm": 0.18359375,
"learning_rate": 2.495201535508637e-05,
"loss": 0.7,
"step": 522
},
{
"epoch": 0.5019193857965452,
"grad_norm": 0.1708984375,
"learning_rate": 2.4904030710172744e-05,
"loss": 0.6396,
"step": 523
},
{
"epoch": 0.5028790786948176,
"grad_norm": 0.17578125,
"learning_rate": 2.4856046065259118e-05,
"loss": 0.6105,
"step": 524
},
{
"epoch": 0.5038387715930902,
"grad_norm": 0.189453125,
"learning_rate": 2.480806142034549e-05,
"loss": 0.6552,
"step": 525
},
{
"epoch": 0.5047984644913628,
"grad_norm": 0.1845703125,
"learning_rate": 2.476007677543186e-05,
"loss": 0.6455,
"step": 526
},
{
"epoch": 0.5057581573896354,
"grad_norm": 0.17578125,
"learning_rate": 2.4712092130518234e-05,
"loss": 0.6799,
"step": 527
},
{
"epoch": 0.5067178502879078,
"grad_norm": 0.1767578125,
"learning_rate": 2.4664107485604608e-05,
"loss": 0.7056,
"step": 528
},
{
"epoch": 0.5076775431861804,
"grad_norm": 0.1806640625,
"learning_rate": 2.461612284069098e-05,
"loss": 0.6064,
"step": 529
},
{
"epoch": 0.508637236084453,
"grad_norm": 0.181640625,
"learning_rate": 2.456813819577735e-05,
"loss": 0.6832,
"step": 530
},
{
"epoch": 0.5095969289827256,
"grad_norm": 0.1806640625,
"learning_rate": 2.4520153550863724e-05,
"loss": 0.6416,
"step": 531
},
{
"epoch": 0.510556621880998,
"grad_norm": 0.171875,
"learning_rate": 2.4472168905950098e-05,
"loss": 0.6261,
"step": 532
},
{
"epoch": 0.5115163147792706,
"grad_norm": 0.1826171875,
"learning_rate": 2.442418426103647e-05,
"loss": 0.648,
"step": 533
},
{
"epoch": 0.5124760076775432,
"grad_norm": 0.265625,
"learning_rate": 2.437619961612284e-05,
"loss": 0.6793,
"step": 534
},
{
"epoch": 0.5134357005758158,
"grad_norm": 0.1689453125,
"learning_rate": 2.4328214971209214e-05,
"loss": 0.711,
"step": 535
},
{
"epoch": 0.5143953934740882,
"grad_norm": 0.1787109375,
"learning_rate": 2.4280230326295588e-05,
"loss": 0.6517,
"step": 536
},
{
"epoch": 0.5153550863723608,
"grad_norm": 0.1875,
"learning_rate": 2.423224568138196e-05,
"loss": 0.6468,
"step": 537
},
{
"epoch": 0.5163147792706334,
"grad_norm": 0.1796875,
"learning_rate": 2.418426103646833e-05,
"loss": 0.6369,
"step": 538
},
{
"epoch": 0.517274472168906,
"grad_norm": 0.248046875,
"learning_rate": 2.4136276391554704e-05,
"loss": 0.6717,
"step": 539
},
{
"epoch": 0.5182341650671785,
"grad_norm": 0.177734375,
"learning_rate": 2.4088291746641074e-05,
"loss": 0.7289,
"step": 540
},
{
"epoch": 0.519193857965451,
"grad_norm": 0.1787109375,
"learning_rate": 2.404030710172745e-05,
"loss": 0.6263,
"step": 541
},
{
"epoch": 0.5201535508637236,
"grad_norm": 0.18359375,
"learning_rate": 2.399232245681382e-05,
"loss": 0.6856,
"step": 542
},
{
"epoch": 0.5211132437619962,
"grad_norm": 0.1806640625,
"learning_rate": 2.3944337811900194e-05,
"loss": 0.6376,
"step": 543
},
{
"epoch": 0.5220729366602687,
"grad_norm": 0.177734375,
"learning_rate": 2.3896353166986564e-05,
"loss": 0.5946,
"step": 544
},
{
"epoch": 0.5230326295585412,
"grad_norm": 0.17578125,
"learning_rate": 2.384836852207294e-05,
"loss": 0.661,
"step": 545
},
{
"epoch": 0.5239923224568138,
"grad_norm": 0.1826171875,
"learning_rate": 2.380038387715931e-05,
"loss": 0.7157,
"step": 546
},
{
"epoch": 0.5249520153550864,
"grad_norm": 0.2041015625,
"learning_rate": 2.375239923224568e-05,
"loss": 0.7163,
"step": 547
},
{
"epoch": 0.525911708253359,
"grad_norm": 0.1796875,
"learning_rate": 2.3704414587332054e-05,
"loss": 0.612,
"step": 548
},
{
"epoch": 0.5268714011516314,
"grad_norm": 0.1767578125,
"learning_rate": 2.3656429942418427e-05,
"loss": 0.6513,
"step": 549
},
{
"epoch": 0.527831094049904,
"grad_norm": 0.1767578125,
"learning_rate": 2.36084452975048e-05,
"loss": 0.6722,
"step": 550
},
{
"epoch": 0.5287907869481766,
"grad_norm": 0.18359375,
"learning_rate": 2.356046065259117e-05,
"loss": 0.6462,
"step": 551
},
{
"epoch": 0.5297504798464492,
"grad_norm": 0.197265625,
"learning_rate": 2.3512476007677544e-05,
"loss": 0.6398,
"step": 552
},
{
"epoch": 0.5307101727447217,
"grad_norm": 0.1806640625,
"learning_rate": 2.3464491362763917e-05,
"loss": 0.6747,
"step": 553
},
{
"epoch": 0.5316698656429942,
"grad_norm": 0.2021484375,
"learning_rate": 2.341650671785029e-05,
"loss": 0.795,
"step": 554
},
{
"epoch": 0.5326295585412668,
"grad_norm": 0.1826171875,
"learning_rate": 2.336852207293666e-05,
"loss": 0.6755,
"step": 555
},
{
"epoch": 0.5335892514395394,
"grad_norm": 0.37109375,
"learning_rate": 2.3320537428023034e-05,
"loss": 0.6751,
"step": 556
},
{
"epoch": 0.5345489443378119,
"grad_norm": 0.2001953125,
"learning_rate": 2.3272552783109404e-05,
"loss": 0.7003,
"step": 557
},
{
"epoch": 0.5355086372360844,
"grad_norm": 0.185546875,
"learning_rate": 2.322456813819578e-05,
"loss": 0.6791,
"step": 558
},
{
"epoch": 0.536468330134357,
"grad_norm": 0.181640625,
"learning_rate": 2.317658349328215e-05,
"loss": 0.6676,
"step": 559
},
{
"epoch": 0.5374280230326296,
"grad_norm": 0.1962890625,
"learning_rate": 2.3128598848368524e-05,
"loss": 0.695,
"step": 560
},
{
"epoch": 0.5383877159309021,
"grad_norm": 0.1806640625,
"learning_rate": 2.3080614203454894e-05,
"loss": 0.6326,
"step": 561
},
{
"epoch": 0.5393474088291746,
"grad_norm": 0.1962890625,
"learning_rate": 2.303262955854127e-05,
"loss": 0.6717,
"step": 562
},
{
"epoch": 0.5403071017274472,
"grad_norm": 0.1865234375,
"learning_rate": 2.298464491362764e-05,
"loss": 0.6424,
"step": 563
},
{
"epoch": 0.5412667946257198,
"grad_norm": 0.173828125,
"learning_rate": 2.2936660268714014e-05,
"loss": 0.6316,
"step": 564
},
{
"epoch": 0.5422264875239923,
"grad_norm": 0.2890625,
"learning_rate": 2.2888675623800384e-05,
"loss": 0.6362,
"step": 565
},
{
"epoch": 0.5431861804222649,
"grad_norm": 0.189453125,
"learning_rate": 2.2840690978886757e-05,
"loss": 0.6916,
"step": 566
},
{
"epoch": 0.5441458733205374,
"grad_norm": 0.18359375,
"learning_rate": 2.279270633397313e-05,
"loss": 0.6169,
"step": 567
},
{
"epoch": 0.54510556621881,
"grad_norm": 0.201171875,
"learning_rate": 2.2744721689059504e-05,
"loss": 0.6349,
"step": 568
},
{
"epoch": 0.5460652591170825,
"grad_norm": 0.1796875,
"learning_rate": 2.2696737044145873e-05,
"loss": 0.7681,
"step": 569
},
{
"epoch": 0.5470249520153551,
"grad_norm": 0.1845703125,
"learning_rate": 2.2648752399232247e-05,
"loss": 0.6704,
"step": 570
},
{
"epoch": 0.5479846449136276,
"grad_norm": 0.1865234375,
"learning_rate": 2.260076775431862e-05,
"loss": 0.657,
"step": 571
},
{
"epoch": 0.5489443378119002,
"grad_norm": 0.21484375,
"learning_rate": 2.255278310940499e-05,
"loss": 0.6165,
"step": 572
},
{
"epoch": 0.5499040307101728,
"grad_norm": 0.1845703125,
"learning_rate": 2.2504798464491363e-05,
"loss": 0.6614,
"step": 573
},
{
"epoch": 0.5508637236084453,
"grad_norm": 0.18359375,
"learning_rate": 2.2456813819577733e-05,
"loss": 0.7066,
"step": 574
},
{
"epoch": 0.5518234165067178,
"grad_norm": 0.18359375,
"learning_rate": 2.240882917466411e-05,
"loss": 0.7623,
"step": 575
},
{
"epoch": 0.5527831094049904,
"grad_norm": 0.1669921875,
"learning_rate": 2.236084452975048e-05,
"loss": 0.6818,
"step": 576
},
{
"epoch": 0.553742802303263,
"grad_norm": 0.18359375,
"learning_rate": 2.2312859884836853e-05,
"loss": 0.6859,
"step": 577
},
{
"epoch": 0.5547024952015355,
"grad_norm": 0.185546875,
"learning_rate": 2.2264875239923223e-05,
"loss": 0.7791,
"step": 578
},
{
"epoch": 0.555662188099808,
"grad_norm": 0.1884765625,
"learning_rate": 2.22168905950096e-05,
"loss": 0.6485,
"step": 579
},
{
"epoch": 0.5566218809980806,
"grad_norm": 0.18359375,
"learning_rate": 2.216890595009597e-05,
"loss": 0.6189,
"step": 580
},
{
"epoch": 0.5575815738963532,
"grad_norm": 0.177734375,
"learning_rate": 2.2120921305182343e-05,
"loss": 0.6253,
"step": 581
},
{
"epoch": 0.5585412667946257,
"grad_norm": 0.1796875,
"learning_rate": 2.2072936660268713e-05,
"loss": 0.6484,
"step": 582
},
{
"epoch": 0.5595009596928983,
"grad_norm": 0.1875,
"learning_rate": 2.2024952015355087e-05,
"loss": 0.6793,
"step": 583
},
{
"epoch": 0.5604606525911708,
"grad_norm": 0.1796875,
"learning_rate": 2.197696737044146e-05,
"loss": 0.7062,
"step": 584
},
{
"epoch": 0.5614203454894434,
"grad_norm": 0.220703125,
"learning_rate": 2.1928982725527833e-05,
"loss": 0.6422,
"step": 585
},
{
"epoch": 0.5623800383877159,
"grad_norm": 0.1904296875,
"learning_rate": 2.1880998080614203e-05,
"loss": 0.6656,
"step": 586
},
{
"epoch": 0.5633397312859885,
"grad_norm": 0.1845703125,
"learning_rate": 2.1833013435700576e-05,
"loss": 0.6325,
"step": 587
},
{
"epoch": 0.564299424184261,
"grad_norm": 0.181640625,
"learning_rate": 2.178502879078695e-05,
"loss": 0.5826,
"step": 588
},
{
"epoch": 0.5652591170825336,
"grad_norm": 0.216796875,
"learning_rate": 2.1737044145873323e-05,
"loss": 0.6637,
"step": 589
},
{
"epoch": 0.5662188099808061,
"grad_norm": 0.1904296875,
"learning_rate": 2.1689059500959693e-05,
"loss": 0.7056,
"step": 590
},
{
"epoch": 0.5671785028790787,
"grad_norm": 0.1904296875,
"learning_rate": 2.1641074856046066e-05,
"loss": 0.6351,
"step": 591
},
{
"epoch": 0.5681381957773513,
"grad_norm": 0.1796875,
"learning_rate": 2.159309021113244e-05,
"loss": 0.6376,
"step": 592
},
{
"epoch": 0.5690978886756238,
"grad_norm": 0.291015625,
"learning_rate": 2.1545105566218813e-05,
"loss": 0.7302,
"step": 593
},
{
"epoch": 0.5700575815738963,
"grad_norm": 0.177734375,
"learning_rate": 2.1497120921305183e-05,
"loss": 0.6534,
"step": 594
},
{
"epoch": 0.5710172744721689,
"grad_norm": 0.193359375,
"learning_rate": 2.1449136276391556e-05,
"loss": 0.7276,
"step": 595
},
{
"epoch": 0.5719769673704415,
"grad_norm": 0.1884765625,
"learning_rate": 2.140115163147793e-05,
"loss": 0.7175,
"step": 596
},
{
"epoch": 0.572936660268714,
"grad_norm": 0.1787109375,
"learning_rate": 2.13531669865643e-05,
"loss": 0.6521,
"step": 597
},
{
"epoch": 0.5738963531669866,
"grad_norm": 0.1884765625,
"learning_rate": 2.1305182341650673e-05,
"loss": 0.6608,
"step": 598
},
{
"epoch": 0.5748560460652591,
"grad_norm": 0.185546875,
"learning_rate": 2.1257197696737043e-05,
"loss": 0.6794,
"step": 599
},
{
"epoch": 0.5758157389635317,
"grad_norm": 0.1923828125,
"learning_rate": 2.1209213051823416e-05,
"loss": 0.6665,
"step": 600
},
{
"epoch": 0.5767754318618042,
"grad_norm": 0.1904296875,
"learning_rate": 2.116122840690979e-05,
"loss": 0.7426,
"step": 601
},
{
"epoch": 0.5777351247600768,
"grad_norm": 0.2021484375,
"learning_rate": 2.1113243761996163e-05,
"loss": 0.7074,
"step": 602
},
{
"epoch": 0.5786948176583493,
"grad_norm": 0.181640625,
"learning_rate": 2.1065259117082533e-05,
"loss": 0.6269,
"step": 603
},
{
"epoch": 0.5796545105566219,
"grad_norm": 0.17578125,
"learning_rate": 2.1017274472168906e-05,
"loss": 0.6592,
"step": 604
},
{
"epoch": 0.5806142034548945,
"grad_norm": 0.17578125,
"learning_rate": 2.096928982725528e-05,
"loss": 0.653,
"step": 605
},
{
"epoch": 0.581573896353167,
"grad_norm": 0.17578125,
"learning_rate": 2.0921305182341653e-05,
"loss": 0.6614,
"step": 606
},
{
"epoch": 0.5825335892514395,
"grad_norm": 0.1728515625,
"learning_rate": 2.0873320537428023e-05,
"loss": 0.695,
"step": 607
},
{
"epoch": 0.5834932821497121,
"grad_norm": 0.1806640625,
"learning_rate": 2.0825335892514396e-05,
"loss": 0.6617,
"step": 608
},
{
"epoch": 0.5844529750479847,
"grad_norm": 0.1796875,
"learning_rate": 2.077735124760077e-05,
"loss": 0.6105,
"step": 609
},
{
"epoch": 0.5854126679462572,
"grad_norm": 0.1826171875,
"learning_rate": 2.0729366602687143e-05,
"loss": 0.6862,
"step": 610
},
{
"epoch": 0.5863723608445297,
"grad_norm": 0.1767578125,
"learning_rate": 2.0681381957773513e-05,
"loss": 0.6923,
"step": 611
},
{
"epoch": 0.5873320537428023,
"grad_norm": 0.1796875,
"learning_rate": 2.0633397312859886e-05,
"loss": 0.6701,
"step": 612
},
{
"epoch": 0.5882917466410749,
"grad_norm": 0.1884765625,
"learning_rate": 2.058541266794626e-05,
"loss": 0.7386,
"step": 613
},
{
"epoch": 0.5892514395393474,
"grad_norm": 0.193359375,
"learning_rate": 2.0537428023032633e-05,
"loss": 0.6568,
"step": 614
},
{
"epoch": 0.5902111324376199,
"grad_norm": 0.1884765625,
"learning_rate": 2.0489443378119003e-05,
"loss": 0.64,
"step": 615
},
{
"epoch": 0.5911708253358925,
"grad_norm": 0.1875,
"learning_rate": 2.0441458733205376e-05,
"loss": 0.633,
"step": 616
},
{
"epoch": 0.5921305182341651,
"grad_norm": 0.220703125,
"learning_rate": 2.0393474088291746e-05,
"loss": 0.5945,
"step": 617
},
{
"epoch": 0.5930902111324377,
"grad_norm": 0.18359375,
"learning_rate": 2.0345489443378122e-05,
"loss": 0.6024,
"step": 618
},
{
"epoch": 0.5940499040307101,
"grad_norm": 0.189453125,
"learning_rate": 2.0297504798464492e-05,
"loss": 0.6571,
"step": 619
},
{
"epoch": 0.5950095969289827,
"grad_norm": 0.208984375,
"learning_rate": 2.0249520153550866e-05,
"loss": 0.6426,
"step": 620
},
{
"epoch": 0.5959692898272553,
"grad_norm": 0.1826171875,
"learning_rate": 2.0201535508637236e-05,
"loss": 0.6087,
"step": 621
},
{
"epoch": 0.5969289827255279,
"grad_norm": 0.267578125,
"learning_rate": 2.015355086372361e-05,
"loss": 0.6481,
"step": 622
},
{
"epoch": 0.5978886756238004,
"grad_norm": 0.189453125,
"learning_rate": 2.0105566218809982e-05,
"loss": 0.712,
"step": 623
},
{
"epoch": 0.5988483685220729,
"grad_norm": 0.2099609375,
"learning_rate": 2.0057581573896352e-05,
"loss": 0.6061,
"step": 624
},
{
"epoch": 0.5998080614203455,
"grad_norm": 0.1787109375,
"learning_rate": 2.0009596928982726e-05,
"loss": 0.6566,
"step": 625
},
{
"epoch": 0.6007677543186181,
"grad_norm": 0.18359375,
"learning_rate": 1.99616122840691e-05,
"loss": 0.682,
"step": 626
},
{
"epoch": 0.6017274472168906,
"grad_norm": 0.1904296875,
"learning_rate": 1.9913627639155472e-05,
"loss": 0.6136,
"step": 627
},
{
"epoch": 0.6026871401151631,
"grad_norm": 0.181640625,
"learning_rate": 1.9865642994241842e-05,
"loss": 0.6502,
"step": 628
},
{
"epoch": 0.6036468330134357,
"grad_norm": 0.1865234375,
"learning_rate": 1.9817658349328216e-05,
"loss": 0.6402,
"step": 629
},
{
"epoch": 0.6046065259117083,
"grad_norm": 0.208984375,
"learning_rate": 1.976967370441459e-05,
"loss": 0.7516,
"step": 630
},
{
"epoch": 0.6055662188099808,
"grad_norm": 0.1884765625,
"learning_rate": 1.9721689059500962e-05,
"loss": 0.7179,
"step": 631
},
{
"epoch": 0.6065259117082533,
"grad_norm": 0.1943359375,
"learning_rate": 1.9673704414587332e-05,
"loss": 0.6649,
"step": 632
},
{
"epoch": 0.6074856046065259,
"grad_norm": 0.1875,
"learning_rate": 1.9625719769673705e-05,
"loss": 0.6738,
"step": 633
},
{
"epoch": 0.6084452975047985,
"grad_norm": 0.181640625,
"learning_rate": 1.957773512476008e-05,
"loss": 0.6455,
"step": 634
},
{
"epoch": 0.6094049904030711,
"grad_norm": 0.1767578125,
"learning_rate": 1.9529750479846452e-05,
"loss": 0.6375,
"step": 635
},
{
"epoch": 0.6103646833013435,
"grad_norm": 0.201171875,
"learning_rate": 1.9481765834932822e-05,
"loss": 0.6934,
"step": 636
},
{
"epoch": 0.6113243761996161,
"grad_norm": 0.173828125,
"learning_rate": 1.9433781190019195e-05,
"loss": 0.702,
"step": 637
},
{
"epoch": 0.6122840690978887,
"grad_norm": 0.181640625,
"learning_rate": 1.9385796545105565e-05,
"loss": 0.6226,
"step": 638
},
{
"epoch": 0.6132437619961613,
"grad_norm": 0.19140625,
"learning_rate": 1.9337811900191942e-05,
"loss": 0.6289,
"step": 639
},
{
"epoch": 0.6142034548944337,
"grad_norm": 0.1865234375,
"learning_rate": 1.9289827255278312e-05,
"loss": 0.6697,
"step": 640
},
{
"epoch": 0.6151631477927063,
"grad_norm": 0.1923828125,
"learning_rate": 1.9241842610364685e-05,
"loss": 0.6255,
"step": 641
},
{
"epoch": 0.6161228406909789,
"grad_norm": 0.1953125,
"learning_rate": 1.9193857965451055e-05,
"loss": 0.6304,
"step": 642
},
{
"epoch": 0.6170825335892515,
"grad_norm": 0.185546875,
"learning_rate": 1.914587332053743e-05,
"loss": 0.6377,
"step": 643
},
{
"epoch": 0.6180422264875239,
"grad_norm": 0.1806640625,
"learning_rate": 1.9097888675623802e-05,
"loss": 0.6029,
"step": 644
},
{
"epoch": 0.6190019193857965,
"grad_norm": 0.1865234375,
"learning_rate": 1.9049904030710175e-05,
"loss": 0.6464,
"step": 645
},
{
"epoch": 0.6199616122840691,
"grad_norm": 0.19140625,
"learning_rate": 1.9001919385796545e-05,
"loss": 0.6886,
"step": 646
},
{
"epoch": 0.6209213051823417,
"grad_norm": 0.1875,
"learning_rate": 1.895393474088292e-05,
"loss": 0.6223,
"step": 647
},
{
"epoch": 0.6218809980806143,
"grad_norm": 0.259765625,
"learning_rate": 1.8905950095969292e-05,
"loss": 0.6896,
"step": 648
},
{
"epoch": 0.6228406909788867,
"grad_norm": 0.1865234375,
"learning_rate": 1.8857965451055662e-05,
"loss": 0.6286,
"step": 649
},
{
"epoch": 0.6238003838771593,
"grad_norm": 0.1767578125,
"learning_rate": 1.8809980806142035e-05,
"loss": 0.6154,
"step": 650
},
{
"epoch": 0.6247600767754319,
"grad_norm": 0.1884765625,
"learning_rate": 1.876199616122841e-05,
"loss": 0.6716,
"step": 651
},
{
"epoch": 0.6257197696737045,
"grad_norm": 0.2001953125,
"learning_rate": 1.8714011516314782e-05,
"loss": 0.6663,
"step": 652
},
{
"epoch": 0.6266794625719769,
"grad_norm": 0.173828125,
"learning_rate": 1.866602687140115e-05,
"loss": 0.6428,
"step": 653
},
{
"epoch": 0.6276391554702495,
"grad_norm": 0.1767578125,
"learning_rate": 1.8618042226487525e-05,
"loss": 0.6735,
"step": 654
},
{
"epoch": 0.6285988483685221,
"grad_norm": 0.1962890625,
"learning_rate": 1.8570057581573895e-05,
"loss": 0.6826,
"step": 655
},
{
"epoch": 0.6295585412667947,
"grad_norm": 0.1865234375,
"learning_rate": 1.852207293666027e-05,
"loss": 0.6801,
"step": 656
},
{
"epoch": 0.6305182341650671,
"grad_norm": 0.1806640625,
"learning_rate": 1.847408829174664e-05,
"loss": 0.6533,
"step": 657
},
{
"epoch": 0.6314779270633397,
"grad_norm": 0.19140625,
"learning_rate": 1.8426103646833015e-05,
"loss": 0.7034,
"step": 658
},
{
"epoch": 0.6324376199616123,
"grad_norm": 0.1875,
"learning_rate": 1.8378119001919385e-05,
"loss": 0.5994,
"step": 659
},
{
"epoch": 0.6333973128598849,
"grad_norm": 0.173828125,
"learning_rate": 1.8330134357005758e-05,
"loss": 0.6696,
"step": 660
},
{
"epoch": 0.6343570057581573,
"grad_norm": 0.1865234375,
"learning_rate": 1.828214971209213e-05,
"loss": 0.6168,
"step": 661
},
{
"epoch": 0.6353166986564299,
"grad_norm": 0.185546875,
"learning_rate": 1.8234165067178505e-05,
"loss": 0.6458,
"step": 662
},
{
"epoch": 0.6362763915547025,
"grad_norm": 0.1826171875,
"learning_rate": 1.8186180422264875e-05,
"loss": 0.6651,
"step": 663
},
{
"epoch": 0.6372360844529751,
"grad_norm": 0.1904296875,
"learning_rate": 1.8138195777351248e-05,
"loss": 0.5914,
"step": 664
},
{
"epoch": 0.6381957773512476,
"grad_norm": 0.23828125,
"learning_rate": 1.809021113243762e-05,
"loss": 0.6907,
"step": 665
},
{
"epoch": 0.6391554702495201,
"grad_norm": 0.1845703125,
"learning_rate": 1.8042226487523995e-05,
"loss": 0.6052,
"step": 666
},
{
"epoch": 0.6401151631477927,
"grad_norm": 0.2236328125,
"learning_rate": 1.7994241842610365e-05,
"loss": 0.602,
"step": 667
},
{
"epoch": 0.6410748560460653,
"grad_norm": 0.185546875,
"learning_rate": 1.7946257197696738e-05,
"loss": 0.6327,
"step": 668
},
{
"epoch": 0.6420345489443378,
"grad_norm": 0.1923828125,
"learning_rate": 1.789827255278311e-05,
"loss": 0.6137,
"step": 669
},
{
"epoch": 0.6429942418426103,
"grad_norm": 0.18359375,
"learning_rate": 1.785028790786948e-05,
"loss": 0.6465,
"step": 670
},
{
"epoch": 0.6439539347408829,
"grad_norm": 0.181640625,
"learning_rate": 1.7802303262955855e-05,
"loss": 0.606,
"step": 671
},
{
"epoch": 0.6449136276391555,
"grad_norm": 0.255859375,
"learning_rate": 1.7754318618042225e-05,
"loss": 0.6425,
"step": 672
},
{
"epoch": 0.6458733205374281,
"grad_norm": 0.181640625,
"learning_rate": 1.77063339731286e-05,
"loss": 0.7085,
"step": 673
},
{
"epoch": 0.6468330134357005,
"grad_norm": 0.1845703125,
"learning_rate": 1.765834932821497e-05,
"loss": 0.6124,
"step": 674
},
{
"epoch": 0.6477927063339731,
"grad_norm": 0.193359375,
"learning_rate": 1.7610364683301345e-05,
"loss": 0.6603,
"step": 675
},
{
"epoch": 0.6487523992322457,
"grad_norm": 0.189453125,
"learning_rate": 1.7562380038387714e-05,
"loss": 0.6491,
"step": 676
},
{
"epoch": 0.6497120921305183,
"grad_norm": 0.1904296875,
"learning_rate": 1.751439539347409e-05,
"loss": 0.6662,
"step": 677
},
{
"epoch": 0.6506717850287908,
"grad_norm": 0.1982421875,
"learning_rate": 1.746641074856046e-05,
"loss": 0.5911,
"step": 678
},
{
"epoch": 0.6516314779270633,
"grad_norm": 0.181640625,
"learning_rate": 1.7418426103646834e-05,
"loss": 0.6353,
"step": 679
},
{
"epoch": 0.6525911708253359,
"grad_norm": 0.1943359375,
"learning_rate": 1.7370441458733204e-05,
"loss": 0.6741,
"step": 680
},
{
"epoch": 0.6535508637236085,
"grad_norm": 0.1875,
"learning_rate": 1.7322456813819578e-05,
"loss": 0.6571,
"step": 681
},
{
"epoch": 0.654510556621881,
"grad_norm": 0.1884765625,
"learning_rate": 1.727447216890595e-05,
"loss": 0.6054,
"step": 682
},
{
"epoch": 0.6554702495201535,
"grad_norm": 0.345703125,
"learning_rate": 1.7226487523992324e-05,
"loss": 0.6947,
"step": 683
},
{
"epoch": 0.6564299424184261,
"grad_norm": 0.1787109375,
"learning_rate": 1.7178502879078694e-05,
"loss": 0.618,
"step": 684
},
{
"epoch": 0.6573896353166987,
"grad_norm": 0.1884765625,
"learning_rate": 1.7130518234165068e-05,
"loss": 0.6234,
"step": 685
},
{
"epoch": 0.6583493282149712,
"grad_norm": 0.181640625,
"learning_rate": 1.708253358925144e-05,
"loss": 0.7353,
"step": 686
},
{
"epoch": 0.6593090211132437,
"grad_norm": 0.1845703125,
"learning_rate": 1.7034548944337814e-05,
"loss": 0.7295,
"step": 687
},
{
"epoch": 0.6602687140115163,
"grad_norm": 0.185546875,
"learning_rate": 1.6986564299424184e-05,
"loss": 0.6263,
"step": 688
},
{
"epoch": 0.6612284069097889,
"grad_norm": 0.1796875,
"learning_rate": 1.6938579654510558e-05,
"loss": 0.7171,
"step": 689
},
{
"epoch": 0.6621880998080614,
"grad_norm": 0.24609375,
"learning_rate": 1.689059500959693e-05,
"loss": 0.6646,
"step": 690
},
{
"epoch": 0.663147792706334,
"grad_norm": 0.263671875,
"learning_rate": 1.6842610364683304e-05,
"loss": 0.7177,
"step": 691
},
{
"epoch": 0.6641074856046065,
"grad_norm": 0.185546875,
"learning_rate": 1.6794625719769674e-05,
"loss": 0.6303,
"step": 692
},
{
"epoch": 0.6650671785028791,
"grad_norm": 0.201171875,
"learning_rate": 1.6746641074856048e-05,
"loss": 0.6422,
"step": 693
},
{
"epoch": 0.6660268714011516,
"grad_norm": 0.283203125,
"learning_rate": 1.669865642994242e-05,
"loss": 0.6577,
"step": 694
},
{
"epoch": 0.6669865642994242,
"grad_norm": 0.1767578125,
"learning_rate": 1.665067178502879e-05,
"loss": 0.7325,
"step": 695
},
{
"epoch": 0.6679462571976967,
"grad_norm": 0.181640625,
"learning_rate": 1.6602687140115164e-05,
"loss": 0.6445,
"step": 696
},
{
"epoch": 0.6689059500959693,
"grad_norm": 0.1923828125,
"learning_rate": 1.6554702495201534e-05,
"loss": 0.6456,
"step": 697
},
{
"epoch": 0.6698656429942419,
"grad_norm": 0.1787109375,
"learning_rate": 1.6506717850287907e-05,
"loss": 0.5991,
"step": 698
},
{
"epoch": 0.6708253358925144,
"grad_norm": 0.2216796875,
"learning_rate": 1.645873320537428e-05,
"loss": 0.7291,
"step": 699
},
{
"epoch": 0.6717850287907869,
"grad_norm": 0.189453125,
"learning_rate": 1.6410748560460654e-05,
"loss": 0.6138,
"step": 700
},
{
"epoch": 0.6727447216890595,
"grad_norm": 0.234375,
"learning_rate": 1.6362763915547024e-05,
"loss": 0.7448,
"step": 701
},
{
"epoch": 0.6737044145873321,
"grad_norm": 0.19140625,
"learning_rate": 1.6314779270633397e-05,
"loss": 0.6448,
"step": 702
},
{
"epoch": 0.6746641074856046,
"grad_norm": 0.1884765625,
"learning_rate": 1.626679462571977e-05,
"loss": 0.6737,
"step": 703
},
{
"epoch": 0.6756238003838771,
"grad_norm": 0.177734375,
"learning_rate": 1.6218809980806144e-05,
"loss": 0.6544,
"step": 704
},
{
"epoch": 0.6765834932821497,
"grad_norm": 0.1826171875,
"learning_rate": 1.6170825335892514e-05,
"loss": 0.6924,
"step": 705
},
{
"epoch": 0.6775431861804223,
"grad_norm": 0.1826171875,
"learning_rate": 1.6122840690978887e-05,
"loss": 0.6291,
"step": 706
},
{
"epoch": 0.6785028790786948,
"grad_norm": 0.1904296875,
"learning_rate": 1.607485604606526e-05,
"loss": 0.6303,
"step": 707
},
{
"epoch": 0.6794625719769674,
"grad_norm": 0.1865234375,
"learning_rate": 1.6026871401151634e-05,
"loss": 0.6373,
"step": 708
},
{
"epoch": 0.6804222648752399,
"grad_norm": 0.189453125,
"learning_rate": 1.5978886756238004e-05,
"loss": 0.5909,
"step": 709
},
{
"epoch": 0.6813819577735125,
"grad_norm": 0.1787109375,
"learning_rate": 1.5930902111324377e-05,
"loss": 0.6442,
"step": 710
},
{
"epoch": 0.682341650671785,
"grad_norm": 0.267578125,
"learning_rate": 1.588291746641075e-05,
"loss": 0.7233,
"step": 711
},
{
"epoch": 0.6833013435700576,
"grad_norm": 0.189453125,
"learning_rate": 1.5834932821497124e-05,
"loss": 0.6688,
"step": 712
},
{
"epoch": 0.6842610364683301,
"grad_norm": 0.259765625,
"learning_rate": 1.5786948176583494e-05,
"loss": 0.6431,
"step": 713
},
{
"epoch": 0.6852207293666027,
"grad_norm": 0.1826171875,
"learning_rate": 1.5738963531669867e-05,
"loss": 0.6273,
"step": 714
},
{
"epoch": 0.6861804222648752,
"grad_norm": 0.1875,
"learning_rate": 1.5690978886756237e-05,
"loss": 0.6353,
"step": 715
},
{
"epoch": 0.6871401151631478,
"grad_norm": 0.208984375,
"learning_rate": 1.5642994241842614e-05,
"loss": 0.7089,
"step": 716
},
{
"epoch": 0.6880998080614203,
"grad_norm": 0.1787109375,
"learning_rate": 1.5595009596928984e-05,
"loss": 0.6341,
"step": 717
},
{
"epoch": 0.6890595009596929,
"grad_norm": 0.18359375,
"learning_rate": 1.5547024952015357e-05,
"loss": 0.6427,
"step": 718
},
{
"epoch": 0.6900191938579654,
"grad_norm": 0.1845703125,
"learning_rate": 1.5499040307101727e-05,
"loss": 0.6742,
"step": 719
},
{
"epoch": 0.690978886756238,
"grad_norm": 0.1943359375,
"learning_rate": 1.54510556621881e-05,
"loss": 0.6445,
"step": 720
},
{
"epoch": 0.6919385796545106,
"grad_norm": 0.1806640625,
"learning_rate": 1.5403071017274474e-05,
"loss": 0.6915,
"step": 721
},
{
"epoch": 0.6928982725527831,
"grad_norm": 0.1787109375,
"learning_rate": 1.5355086372360844e-05,
"loss": 0.6877,
"step": 722
},
{
"epoch": 0.6938579654510557,
"grad_norm": 0.18359375,
"learning_rate": 1.5307101727447217e-05,
"loss": 0.6345,
"step": 723
},
{
"epoch": 0.6948176583493282,
"grad_norm": 0.240234375,
"learning_rate": 1.525911708253359e-05,
"loss": 0.6741,
"step": 724
},
{
"epoch": 0.6957773512476008,
"grad_norm": 0.197265625,
"learning_rate": 1.5211132437619962e-05,
"loss": 0.5774,
"step": 725
},
{
"epoch": 0.6967370441458733,
"grad_norm": 0.1875,
"learning_rate": 1.5163147792706333e-05,
"loss": 0.6195,
"step": 726
},
{
"epoch": 0.6976967370441459,
"grad_norm": 0.18359375,
"learning_rate": 1.5115163147792707e-05,
"loss": 0.6503,
"step": 727
},
{
"epoch": 0.6986564299424184,
"grad_norm": 0.201171875,
"learning_rate": 1.5067178502879078e-05,
"loss": 0.72,
"step": 728
},
{
"epoch": 0.699616122840691,
"grad_norm": 0.2099609375,
"learning_rate": 1.5019193857965452e-05,
"loss": 0.6388,
"step": 729
},
{
"epoch": 0.7005758157389635,
"grad_norm": 0.17578125,
"learning_rate": 1.4971209213051823e-05,
"loss": 0.722,
"step": 730
},
{
"epoch": 0.7015355086372361,
"grad_norm": 0.2578125,
"learning_rate": 1.4923224568138197e-05,
"loss": 0.7685,
"step": 731
},
{
"epoch": 0.7024952015355086,
"grad_norm": 0.1767578125,
"learning_rate": 1.4875239923224568e-05,
"loss": 0.6498,
"step": 732
},
{
"epoch": 0.7034548944337812,
"grad_norm": 0.1943359375,
"learning_rate": 1.4827255278310942e-05,
"loss": 0.6546,
"step": 733
},
{
"epoch": 0.7044145873320538,
"grad_norm": 0.2080078125,
"learning_rate": 1.4779270633397313e-05,
"loss": 0.7309,
"step": 734
},
{
"epoch": 0.7053742802303263,
"grad_norm": 0.193359375,
"learning_rate": 1.4731285988483687e-05,
"loss": 0.6422,
"step": 735
},
{
"epoch": 0.7063339731285988,
"grad_norm": 0.1875,
"learning_rate": 1.4683301343570058e-05,
"loss": 0.6917,
"step": 736
},
{
"epoch": 0.7072936660268714,
"grad_norm": 0.1865234375,
"learning_rate": 1.4635316698656432e-05,
"loss": 0.6573,
"step": 737
},
{
"epoch": 0.708253358925144,
"grad_norm": 0.181640625,
"learning_rate": 1.4587332053742803e-05,
"loss": 0.6355,
"step": 738
},
{
"epoch": 0.7092130518234165,
"grad_norm": 0.18359375,
"learning_rate": 1.4539347408829177e-05,
"loss": 0.6352,
"step": 739
},
{
"epoch": 0.710172744721689,
"grad_norm": 0.177734375,
"learning_rate": 1.4491362763915548e-05,
"loss": 0.6415,
"step": 740
},
{
"epoch": 0.7111324376199616,
"grad_norm": 0.1884765625,
"learning_rate": 1.4443378119001921e-05,
"loss": 0.6732,
"step": 741
},
{
"epoch": 0.7120921305182342,
"grad_norm": 0.2177734375,
"learning_rate": 1.4395393474088293e-05,
"loss": 0.6681,
"step": 742
},
{
"epoch": 0.7130518234165067,
"grad_norm": 0.189453125,
"learning_rate": 1.4347408829174666e-05,
"loss": 0.6902,
"step": 743
},
{
"epoch": 0.7140115163147792,
"grad_norm": 0.181640625,
"learning_rate": 1.4299424184261036e-05,
"loss": 0.7067,
"step": 744
},
{
"epoch": 0.7149712092130518,
"grad_norm": 0.185546875,
"learning_rate": 1.4251439539347408e-05,
"loss": 0.6437,
"step": 745
},
{
"epoch": 0.7159309021113244,
"grad_norm": 0.19140625,
"learning_rate": 1.4203454894433781e-05,
"loss": 0.6672,
"step": 746
},
{
"epoch": 0.716890595009597,
"grad_norm": 0.1865234375,
"learning_rate": 1.4155470249520153e-05,
"loss": 0.7454,
"step": 747
},
{
"epoch": 0.7178502879078695,
"grad_norm": 0.20703125,
"learning_rate": 1.4107485604606526e-05,
"loss": 0.6786,
"step": 748
},
{
"epoch": 0.718809980806142,
"grad_norm": 0.19140625,
"learning_rate": 1.4059500959692898e-05,
"loss": 0.6255,
"step": 749
},
{
"epoch": 0.7197696737044146,
"grad_norm": 0.259765625,
"learning_rate": 1.4011516314779271e-05,
"loss": 0.7794,
"step": 750
},
{
"epoch": 0.7207293666026872,
"grad_norm": 0.1845703125,
"learning_rate": 1.3963531669865643e-05,
"loss": 0.6911,
"step": 751
},
{
"epoch": 0.7216890595009597,
"grad_norm": 0.1865234375,
"learning_rate": 1.3915547024952016e-05,
"loss": 0.6235,
"step": 752
},
{
"epoch": 0.7226487523992322,
"grad_norm": 0.1943359375,
"learning_rate": 1.3867562380038388e-05,
"loss": 0.6387,
"step": 753
},
{
"epoch": 0.7236084452975048,
"grad_norm": 0.2158203125,
"learning_rate": 1.3819577735124761e-05,
"loss": 0.7356,
"step": 754
},
{
"epoch": 0.7245681381957774,
"grad_norm": 0.1845703125,
"learning_rate": 1.3771593090211133e-05,
"loss": 0.6322,
"step": 755
},
{
"epoch": 0.72552783109405,
"grad_norm": 0.2080078125,
"learning_rate": 1.3723608445297506e-05,
"loss": 0.6258,
"step": 756
},
{
"epoch": 0.7264875239923224,
"grad_norm": 0.1845703125,
"learning_rate": 1.3675623800383878e-05,
"loss": 0.6248,
"step": 757
},
{
"epoch": 0.727447216890595,
"grad_norm": 0.1884765625,
"learning_rate": 1.3627639155470251e-05,
"loss": 0.6328,
"step": 758
},
{
"epoch": 0.7284069097888676,
"grad_norm": 0.1787109375,
"learning_rate": 1.3579654510556623e-05,
"loss": 0.6624,
"step": 759
},
{
"epoch": 0.7293666026871402,
"grad_norm": 0.1845703125,
"learning_rate": 1.3531669865642996e-05,
"loss": 0.6588,
"step": 760
},
{
"epoch": 0.7303262955854126,
"grad_norm": 0.189453125,
"learning_rate": 1.3483685220729366e-05,
"loss": 0.6972,
"step": 761
},
{
"epoch": 0.7312859884836852,
"grad_norm": 0.20703125,
"learning_rate": 1.3435700575815741e-05,
"loss": 0.5945,
"step": 762
},
{
"epoch": 0.7322456813819578,
"grad_norm": 0.1953125,
"learning_rate": 1.3387715930902111e-05,
"loss": 0.6462,
"step": 763
},
{
"epoch": 0.7332053742802304,
"grad_norm": 0.181640625,
"learning_rate": 1.3339731285988486e-05,
"loss": 0.6181,
"step": 764
},
{
"epoch": 0.7341650671785028,
"grad_norm": 0.193359375,
"learning_rate": 1.3291746641074856e-05,
"loss": 0.7334,
"step": 765
},
{
"epoch": 0.7351247600767754,
"grad_norm": 0.220703125,
"learning_rate": 1.3243761996161231e-05,
"loss": 0.6741,
"step": 766
},
{
"epoch": 0.736084452975048,
"grad_norm": 0.1962890625,
"learning_rate": 1.3195777351247601e-05,
"loss": 0.6938,
"step": 767
},
{
"epoch": 0.7370441458733206,
"grad_norm": 0.19921875,
"learning_rate": 1.3147792706333976e-05,
"loss": 0.6204,
"step": 768
},
{
"epoch": 0.738003838771593,
"grad_norm": 0.1845703125,
"learning_rate": 1.3099808061420346e-05,
"loss": 0.6083,
"step": 769
},
{
"epoch": 0.7389635316698656,
"grad_norm": 0.1904296875,
"learning_rate": 1.3051823416506717e-05,
"loss": 0.6224,
"step": 770
},
{
"epoch": 0.7399232245681382,
"grad_norm": 0.17578125,
"learning_rate": 1.300383877159309e-05,
"loss": 0.6501,
"step": 771
},
{
"epoch": 0.7408829174664108,
"grad_norm": 0.25,
"learning_rate": 1.2955854126679462e-05,
"loss": 0.6908,
"step": 772
},
{
"epoch": 0.7418426103646834,
"grad_norm": 0.1728515625,
"learning_rate": 1.2907869481765836e-05,
"loss": 0.6874,
"step": 773
},
{
"epoch": 0.7428023032629558,
"grad_norm": 0.1904296875,
"learning_rate": 1.2859884836852207e-05,
"loss": 0.6379,
"step": 774
},
{
"epoch": 0.7437619961612284,
"grad_norm": 0.1845703125,
"learning_rate": 1.281190019193858e-05,
"loss": 0.6241,
"step": 775
},
{
"epoch": 0.744721689059501,
"grad_norm": 0.1865234375,
"learning_rate": 1.2763915547024952e-05,
"loss": 0.7043,
"step": 776
},
{
"epoch": 0.7456813819577736,
"grad_norm": 0.1904296875,
"learning_rate": 1.2715930902111326e-05,
"loss": 0.6731,
"step": 777
},
{
"epoch": 0.746641074856046,
"grad_norm": 0.1943359375,
"learning_rate": 1.2667946257197696e-05,
"loss": 0.6525,
"step": 778
},
{
"epoch": 0.7476007677543186,
"grad_norm": 0.1865234375,
"learning_rate": 1.261996161228407e-05,
"loss": 0.6071,
"step": 779
},
{
"epoch": 0.7485604606525912,
"grad_norm": 0.1884765625,
"learning_rate": 1.257197696737044e-05,
"loss": 0.6466,
"step": 780
},
{
"epoch": 0.7495201535508638,
"grad_norm": 0.19140625,
"learning_rate": 1.2523992322456816e-05,
"loss": 0.6391,
"step": 781
},
{
"epoch": 0.7504798464491362,
"grad_norm": 0.1845703125,
"learning_rate": 1.2476007677543186e-05,
"loss": 0.5748,
"step": 782
},
{
"epoch": 0.7514395393474088,
"grad_norm": 0.193359375,
"learning_rate": 1.2428023032629559e-05,
"loss": 0.704,
"step": 783
},
{
"epoch": 0.7523992322456814,
"grad_norm": 0.1884765625,
"learning_rate": 1.238003838771593e-05,
"loss": 0.6755,
"step": 784
},
{
"epoch": 0.753358925143954,
"grad_norm": 0.1865234375,
"learning_rate": 1.2332053742802304e-05,
"loss": 0.7469,
"step": 785
},
{
"epoch": 0.7543186180422264,
"grad_norm": 0.19921875,
"learning_rate": 1.2284069097888675e-05,
"loss": 0.6035,
"step": 786
},
{
"epoch": 0.755278310940499,
"grad_norm": 0.185546875,
"learning_rate": 1.2236084452975049e-05,
"loss": 0.6198,
"step": 787
},
{
"epoch": 0.7562380038387716,
"grad_norm": 0.18359375,
"learning_rate": 1.218809980806142e-05,
"loss": 0.6353,
"step": 788
},
{
"epoch": 0.7571976967370442,
"grad_norm": 0.185546875,
"learning_rate": 1.2140115163147794e-05,
"loss": 0.601,
"step": 789
},
{
"epoch": 0.7581573896353166,
"grad_norm": 0.310546875,
"learning_rate": 1.2092130518234165e-05,
"loss": 0.6355,
"step": 790
},
{
"epoch": 0.7591170825335892,
"grad_norm": 0.1962890625,
"learning_rate": 1.2044145873320537e-05,
"loss": 0.6715,
"step": 791
},
{
"epoch": 0.7600767754318618,
"grad_norm": 0.1875,
"learning_rate": 1.199616122840691e-05,
"loss": 0.6421,
"step": 792
},
{
"epoch": 0.7610364683301344,
"grad_norm": 0.201171875,
"learning_rate": 1.1948176583493282e-05,
"loss": 0.657,
"step": 793
},
{
"epoch": 0.761996161228407,
"grad_norm": 0.197265625,
"learning_rate": 1.1900191938579655e-05,
"loss": 0.6295,
"step": 794
},
{
"epoch": 0.7629558541266794,
"grad_norm": 0.19140625,
"learning_rate": 1.1852207293666027e-05,
"loss": 0.6543,
"step": 795
},
{
"epoch": 0.763915547024952,
"grad_norm": 0.1923828125,
"learning_rate": 1.18042226487524e-05,
"loss": 0.6595,
"step": 796
},
{
"epoch": 0.7648752399232246,
"grad_norm": 0.1884765625,
"learning_rate": 1.1756238003838772e-05,
"loss": 0.6413,
"step": 797
},
{
"epoch": 0.7658349328214972,
"grad_norm": 0.1875,
"learning_rate": 1.1708253358925145e-05,
"loss": 0.6076,
"step": 798
},
{
"epoch": 0.7667946257197696,
"grad_norm": 0.181640625,
"learning_rate": 1.1660268714011517e-05,
"loss": 0.6734,
"step": 799
},
{
"epoch": 0.7677543186180422,
"grad_norm": 0.1943359375,
"learning_rate": 1.161228406909789e-05,
"loss": 0.687,
"step": 800
},
{
"epoch": 0.7687140115163148,
"grad_norm": 0.248046875,
"learning_rate": 1.1564299424184262e-05,
"loss": 0.6757,
"step": 801
},
{
"epoch": 0.7696737044145874,
"grad_norm": 0.1904296875,
"learning_rate": 1.1516314779270635e-05,
"loss": 0.6202,
"step": 802
},
{
"epoch": 0.7706333973128598,
"grad_norm": 0.1875,
"learning_rate": 1.1468330134357007e-05,
"loss": 0.6245,
"step": 803
},
{
"epoch": 0.7715930902111324,
"grad_norm": 0.197265625,
"learning_rate": 1.1420345489443378e-05,
"loss": 0.6626,
"step": 804
},
{
"epoch": 0.772552783109405,
"grad_norm": 0.185546875,
"learning_rate": 1.1372360844529752e-05,
"loss": 0.7026,
"step": 805
},
{
"epoch": 0.7735124760076776,
"grad_norm": 0.1884765625,
"learning_rate": 1.1324376199616123e-05,
"loss": 0.6806,
"step": 806
},
{
"epoch": 0.77447216890595,
"grad_norm": 0.1943359375,
"learning_rate": 1.1276391554702495e-05,
"loss": 0.7397,
"step": 807
},
{
"epoch": 0.7754318618042226,
"grad_norm": 0.1982421875,
"learning_rate": 1.1228406909788867e-05,
"loss": 0.6077,
"step": 808
},
{
"epoch": 0.7763915547024952,
"grad_norm": 0.189453125,
"learning_rate": 1.118042226487524e-05,
"loss": 0.5914,
"step": 809
},
{
"epoch": 0.7773512476007678,
"grad_norm": 0.181640625,
"learning_rate": 1.1132437619961612e-05,
"loss": 0.5846,
"step": 810
},
{
"epoch": 0.7783109404990403,
"grad_norm": 0.1865234375,
"learning_rate": 1.1084452975047985e-05,
"loss": 0.6449,
"step": 811
},
{
"epoch": 0.7792706333973128,
"grad_norm": 0.1875,
"learning_rate": 1.1036468330134357e-05,
"loss": 0.6712,
"step": 812
},
{
"epoch": 0.7802303262955854,
"grad_norm": 0.18359375,
"learning_rate": 1.098848368522073e-05,
"loss": 0.6806,
"step": 813
},
{
"epoch": 0.781190019193858,
"grad_norm": 0.1904296875,
"learning_rate": 1.0940499040307102e-05,
"loss": 0.6603,
"step": 814
},
{
"epoch": 0.7821497120921305,
"grad_norm": 0.1943359375,
"learning_rate": 1.0892514395393475e-05,
"loss": 0.6292,
"step": 815
},
{
"epoch": 0.783109404990403,
"grad_norm": 0.1953125,
"learning_rate": 1.0844529750479847e-05,
"loss": 0.6088,
"step": 816
},
{
"epoch": 0.7840690978886756,
"grad_norm": 0.2119140625,
"learning_rate": 1.079654510556622e-05,
"loss": 0.6789,
"step": 817
},
{
"epoch": 0.7850287907869482,
"grad_norm": 0.201171875,
"learning_rate": 1.0748560460652591e-05,
"loss": 0.6616,
"step": 818
},
{
"epoch": 0.7859884836852208,
"grad_norm": 0.1875,
"learning_rate": 1.0700575815738965e-05,
"loss": 0.6143,
"step": 819
},
{
"epoch": 0.7869481765834933,
"grad_norm": 0.193359375,
"learning_rate": 1.0652591170825336e-05,
"loss": 0.6689,
"step": 820
},
{
"epoch": 0.7879078694817658,
"grad_norm": 0.1767578125,
"learning_rate": 1.0604606525911708e-05,
"loss": 0.704,
"step": 821
},
{
"epoch": 0.7888675623800384,
"grad_norm": 0.193359375,
"learning_rate": 1.0556621880998081e-05,
"loss": 0.5669,
"step": 822
},
{
"epoch": 0.789827255278311,
"grad_norm": 0.23828125,
"learning_rate": 1.0508637236084453e-05,
"loss": 0.5997,
"step": 823
},
{
"epoch": 0.7907869481765835,
"grad_norm": 0.181640625,
"learning_rate": 1.0460652591170826e-05,
"loss": 0.6471,
"step": 824
},
{
"epoch": 0.791746641074856,
"grad_norm": 0.1923828125,
"learning_rate": 1.0412667946257198e-05,
"loss": 0.5961,
"step": 825
},
{
"epoch": 0.7927063339731286,
"grad_norm": 0.189453125,
"learning_rate": 1.0364683301343571e-05,
"loss": 0.7073,
"step": 826
},
{
"epoch": 0.7936660268714012,
"grad_norm": 0.181640625,
"learning_rate": 1.0316698656429943e-05,
"loss": 0.6735,
"step": 827
},
{
"epoch": 0.7946257197696737,
"grad_norm": 0.1806640625,
"learning_rate": 1.0268714011516316e-05,
"loss": 0.6317,
"step": 828
},
{
"epoch": 0.7955854126679462,
"grad_norm": 0.1796875,
"learning_rate": 1.0220729366602688e-05,
"loss": 0.6551,
"step": 829
},
{
"epoch": 0.7965451055662188,
"grad_norm": 0.173828125,
"learning_rate": 1.0172744721689061e-05,
"loss": 0.6298,
"step": 830
},
{
"epoch": 0.7975047984644914,
"grad_norm": 0.1865234375,
"learning_rate": 1.0124760076775433e-05,
"loss": 0.6228,
"step": 831
},
{
"epoch": 0.7984644913627639,
"grad_norm": 0.189453125,
"learning_rate": 1.0076775431861805e-05,
"loss": 0.6468,
"step": 832
},
{
"epoch": 0.7994241842610365,
"grad_norm": 0.189453125,
"learning_rate": 1.0028790786948176e-05,
"loss": 0.6267,
"step": 833
},
{
"epoch": 0.800383877159309,
"grad_norm": 0.189453125,
"learning_rate": 9.98080614203455e-06,
"loss": 0.6171,
"step": 834
},
{
"epoch": 0.8013435700575816,
"grad_norm": 0.1796875,
"learning_rate": 9.932821497120921e-06,
"loss": 0.6174,
"step": 835
},
{
"epoch": 0.8023032629558541,
"grad_norm": 0.1826171875,
"learning_rate": 9.884836852207294e-06,
"loss": 0.7022,
"step": 836
},
{
"epoch": 0.8032629558541267,
"grad_norm": 0.2451171875,
"learning_rate": 9.836852207293666e-06,
"loss": 0.6331,
"step": 837
},
{
"epoch": 0.8042226487523992,
"grad_norm": 0.1826171875,
"learning_rate": 9.78886756238004e-06,
"loss": 0.6398,
"step": 838
},
{
"epoch": 0.8051823416506718,
"grad_norm": 0.1865234375,
"learning_rate": 9.740882917466411e-06,
"loss": 0.6572,
"step": 839
},
{
"epoch": 0.8061420345489443,
"grad_norm": 0.265625,
"learning_rate": 9.692898272552783e-06,
"loss": 0.6728,
"step": 840
},
{
"epoch": 0.8071017274472169,
"grad_norm": 0.2275390625,
"learning_rate": 9.644913627639156e-06,
"loss": 0.7734,
"step": 841
},
{
"epoch": 0.8080614203454894,
"grad_norm": 0.1884765625,
"learning_rate": 9.596928982725528e-06,
"loss": 0.6028,
"step": 842
},
{
"epoch": 0.809021113243762,
"grad_norm": 0.185546875,
"learning_rate": 9.548944337811901e-06,
"loss": 0.6531,
"step": 843
},
{
"epoch": 0.8099808061420346,
"grad_norm": 0.19140625,
"learning_rate": 9.500959692898273e-06,
"loss": 0.7045,
"step": 844
},
{
"epoch": 0.8109404990403071,
"grad_norm": 0.185546875,
"learning_rate": 9.452975047984646e-06,
"loss": 0.636,
"step": 845
},
{
"epoch": 0.8119001919385797,
"grad_norm": 0.205078125,
"learning_rate": 9.404990403071018e-06,
"loss": 0.632,
"step": 846
},
{
"epoch": 0.8128598848368522,
"grad_norm": 0.2109375,
"learning_rate": 9.357005758157391e-06,
"loss": 0.6391,
"step": 847
},
{
"epoch": 0.8138195777351248,
"grad_norm": 0.1962890625,
"learning_rate": 9.309021113243763e-06,
"loss": 0.6591,
"step": 848
},
{
"epoch": 0.8147792706333973,
"grad_norm": 0.1953125,
"learning_rate": 9.261036468330136e-06,
"loss": 0.6406,
"step": 849
},
{
"epoch": 0.8157389635316699,
"grad_norm": 0.1923828125,
"learning_rate": 9.213051823416507e-06,
"loss": 0.6303,
"step": 850
},
{
"epoch": 0.8166986564299424,
"grad_norm": 0.189453125,
"learning_rate": 9.165067178502879e-06,
"loss": 0.6677,
"step": 851
},
{
"epoch": 0.817658349328215,
"grad_norm": 0.1796875,
"learning_rate": 9.117082533589252e-06,
"loss": 0.6476,
"step": 852
},
{
"epoch": 0.8186180422264875,
"grad_norm": 0.1982421875,
"learning_rate": 9.069097888675624e-06,
"loss": 0.7144,
"step": 853
},
{
"epoch": 0.8195777351247601,
"grad_norm": 0.1884765625,
"learning_rate": 9.021113243761997e-06,
"loss": 0.6014,
"step": 854
},
{
"epoch": 0.8205374280230326,
"grad_norm": 0.19140625,
"learning_rate": 8.973128598848369e-06,
"loss": 0.6275,
"step": 855
},
{
"epoch": 0.8214971209213052,
"grad_norm": 0.18359375,
"learning_rate": 8.92514395393474e-06,
"loss": 0.7131,
"step": 856
},
{
"epoch": 0.8224568138195777,
"grad_norm": 0.2373046875,
"learning_rate": 8.877159309021112e-06,
"loss": 0.7126,
"step": 857
},
{
"epoch": 0.8234165067178503,
"grad_norm": 0.1865234375,
"learning_rate": 8.829174664107486e-06,
"loss": 0.6512,
"step": 858
},
{
"epoch": 0.8243761996161229,
"grad_norm": 0.1953125,
"learning_rate": 8.781190019193857e-06,
"loss": 0.6421,
"step": 859
},
{
"epoch": 0.8253358925143954,
"grad_norm": 0.1875,
"learning_rate": 8.73320537428023e-06,
"loss": 0.7562,
"step": 860
},
{
"epoch": 0.8262955854126679,
"grad_norm": 0.185546875,
"learning_rate": 8.685220729366602e-06,
"loss": 0.676,
"step": 861
},
{
"epoch": 0.8272552783109405,
"grad_norm": 0.1982421875,
"learning_rate": 8.637236084452976e-06,
"loss": 0.6452,
"step": 862
},
{
"epoch": 0.8282149712092131,
"grad_norm": 0.1884765625,
"learning_rate": 8.589251439539347e-06,
"loss": 0.6486,
"step": 863
},
{
"epoch": 0.8291746641074856,
"grad_norm": 0.1904296875,
"learning_rate": 8.54126679462572e-06,
"loss": 0.6452,
"step": 864
},
{
"epoch": 0.8301343570057581,
"grad_norm": 0.2275390625,
"learning_rate": 8.493282149712092e-06,
"loss": 0.6602,
"step": 865
},
{
"epoch": 0.8310940499040307,
"grad_norm": 0.1845703125,
"learning_rate": 8.445297504798465e-06,
"loss": 0.6752,
"step": 866
},
{
"epoch": 0.8320537428023033,
"grad_norm": 0.1953125,
"learning_rate": 8.397312859884837e-06,
"loss": 0.6572,
"step": 867
},
{
"epoch": 0.8330134357005758,
"grad_norm": 0.193359375,
"learning_rate": 8.34932821497121e-06,
"loss": 0.6129,
"step": 868
},
{
"epoch": 0.8339731285988484,
"grad_norm": 0.185546875,
"learning_rate": 8.301343570057582e-06,
"loss": 0.6541,
"step": 869
},
{
"epoch": 0.8349328214971209,
"grad_norm": 0.255859375,
"learning_rate": 8.253358925143954e-06,
"loss": 0.7138,
"step": 870
},
{
"epoch": 0.8358925143953935,
"grad_norm": 0.2021484375,
"learning_rate": 8.205374280230327e-06,
"loss": 0.5956,
"step": 871
},
{
"epoch": 0.836852207293666,
"grad_norm": 0.1943359375,
"learning_rate": 8.157389635316699e-06,
"loss": 0.6929,
"step": 872
},
{
"epoch": 0.8378119001919386,
"grad_norm": 0.1826171875,
"learning_rate": 8.109404990403072e-06,
"loss": 0.6584,
"step": 873
},
{
"epoch": 0.8387715930902111,
"grad_norm": 0.19921875,
"learning_rate": 8.061420345489444e-06,
"loss": 0.706,
"step": 874
},
{
"epoch": 0.8397312859884837,
"grad_norm": 0.2021484375,
"learning_rate": 8.013435700575817e-06,
"loss": 0.6272,
"step": 875
},
{
"epoch": 0.8406909788867563,
"grad_norm": 0.19140625,
"learning_rate": 7.965451055662189e-06,
"loss": 0.6094,
"step": 876
},
{
"epoch": 0.8416506717850288,
"grad_norm": 0.19140625,
"learning_rate": 7.917466410748562e-06,
"loss": 0.6238,
"step": 877
},
{
"epoch": 0.8426103646833013,
"grad_norm": 0.1923828125,
"learning_rate": 7.869481765834934e-06,
"loss": 0.6495,
"step": 878
},
{
"epoch": 0.8435700575815739,
"grad_norm": 0.1904296875,
"learning_rate": 7.821497120921307e-06,
"loss": 0.623,
"step": 879
},
{
"epoch": 0.8445297504798465,
"grad_norm": 0.197265625,
"learning_rate": 7.773512476007678e-06,
"loss": 0.6089,
"step": 880
},
{
"epoch": 0.845489443378119,
"grad_norm": 0.189453125,
"learning_rate": 7.72552783109405e-06,
"loss": 0.6583,
"step": 881
},
{
"epoch": 0.8464491362763915,
"grad_norm": 0.208984375,
"learning_rate": 7.677543186180422e-06,
"loss": 0.5981,
"step": 882
},
{
"epoch": 0.8474088291746641,
"grad_norm": 0.2001953125,
"learning_rate": 7.629558541266795e-06,
"loss": 0.6403,
"step": 883
},
{
"epoch": 0.8483685220729367,
"grad_norm": 0.19140625,
"learning_rate": 7.581573896353167e-06,
"loss": 0.6056,
"step": 884
},
{
"epoch": 0.8493282149712092,
"grad_norm": 0.1943359375,
"learning_rate": 7.533589251439539e-06,
"loss": 0.641,
"step": 885
},
{
"epoch": 0.8502879078694817,
"grad_norm": 0.197265625,
"learning_rate": 7.485604606525912e-06,
"loss": 0.6395,
"step": 886
},
{
"epoch": 0.8512476007677543,
"grad_norm": 0.1923828125,
"learning_rate": 7.437619961612284e-06,
"loss": 0.6442,
"step": 887
},
{
"epoch": 0.8522072936660269,
"grad_norm": 0.19140625,
"learning_rate": 7.389635316698657e-06,
"loss": 0.7474,
"step": 888
},
{
"epoch": 0.8531669865642995,
"grad_norm": 0.2060546875,
"learning_rate": 7.341650671785029e-06,
"loss": 0.7601,
"step": 889
},
{
"epoch": 0.8541266794625719,
"grad_norm": 0.19921875,
"learning_rate": 7.293666026871402e-06,
"loss": 0.6018,
"step": 890
},
{
"epoch": 0.8550863723608445,
"grad_norm": 0.19140625,
"learning_rate": 7.245681381957774e-06,
"loss": 0.678,
"step": 891
},
{
"epoch": 0.8560460652591171,
"grad_norm": 0.1865234375,
"learning_rate": 7.1976967370441466e-06,
"loss": 0.6355,
"step": 892
},
{
"epoch": 0.8570057581573897,
"grad_norm": 0.18359375,
"learning_rate": 7.149712092130518e-06,
"loss": 0.6635,
"step": 893
},
{
"epoch": 0.8579654510556622,
"grad_norm": 0.294921875,
"learning_rate": 7.101727447216891e-06,
"loss": 0.6118,
"step": 894
},
{
"epoch": 0.8589251439539347,
"grad_norm": 0.322265625,
"learning_rate": 7.053742802303263e-06,
"loss": 0.6892,
"step": 895
},
{
"epoch": 0.8598848368522073,
"grad_norm": 0.1923828125,
"learning_rate": 7.005758157389636e-06,
"loss": 0.6305,
"step": 896
},
{
"epoch": 0.8608445297504799,
"grad_norm": 0.185546875,
"learning_rate": 6.957773512476008e-06,
"loss": 0.617,
"step": 897
},
{
"epoch": 0.8618042226487524,
"grad_norm": 0.1953125,
"learning_rate": 6.909788867562381e-06,
"loss": 0.6202,
"step": 898
},
{
"epoch": 0.8627639155470249,
"grad_norm": 0.1884765625,
"learning_rate": 6.861804222648753e-06,
"loss": 0.6548,
"step": 899
},
{
"epoch": 0.8637236084452975,
"grad_norm": 0.1865234375,
"learning_rate": 6.8138195777351256e-06,
"loss": 0.6238,
"step": 900
},
{
"epoch": 0.8646833013435701,
"grad_norm": 0.2392578125,
"learning_rate": 6.765834932821498e-06,
"loss": 0.6827,
"step": 901
},
{
"epoch": 0.8656429942418427,
"grad_norm": 0.201171875,
"learning_rate": 6.7178502879078705e-06,
"loss": 0.6924,
"step": 902
},
{
"epoch": 0.8666026871401151,
"grad_norm": 0.193359375,
"learning_rate": 6.669865642994243e-06,
"loss": 0.6776,
"step": 903
},
{
"epoch": 0.8675623800383877,
"grad_norm": 0.1845703125,
"learning_rate": 6.6218809980806155e-06,
"loss": 0.6477,
"step": 904
},
{
"epoch": 0.8685220729366603,
"grad_norm": 0.2333984375,
"learning_rate": 6.573896353166988e-06,
"loss": 0.6498,
"step": 905
},
{
"epoch": 0.8694817658349329,
"grad_norm": 0.1953125,
"learning_rate": 6.525911708253359e-06,
"loss": 0.6242,
"step": 906
},
{
"epoch": 0.8704414587332053,
"grad_norm": 0.18359375,
"learning_rate": 6.477927063339731e-06,
"loss": 0.6607,
"step": 907
},
{
"epoch": 0.8714011516314779,
"grad_norm": 0.189453125,
"learning_rate": 6.429942418426104e-06,
"loss": 0.6355,
"step": 908
},
{
"epoch": 0.8723608445297505,
"grad_norm": 0.1884765625,
"learning_rate": 6.381957773512476e-06,
"loss": 0.7027,
"step": 909
},
{
"epoch": 0.8733205374280231,
"grad_norm": 0.1865234375,
"learning_rate": 6.333973128598848e-06,
"loss": 0.6315,
"step": 910
},
{
"epoch": 0.8742802303262955,
"grad_norm": 0.185546875,
"learning_rate": 6.28598848368522e-06,
"loss": 0.5826,
"step": 911
},
{
"epoch": 0.8752399232245681,
"grad_norm": 0.34765625,
"learning_rate": 6.238003838771593e-06,
"loss": 0.6937,
"step": 912
},
{
"epoch": 0.8761996161228407,
"grad_norm": 0.205078125,
"learning_rate": 6.190019193857965e-06,
"loss": 0.6438,
"step": 913
},
{
"epoch": 0.8771593090211133,
"grad_norm": 0.1845703125,
"learning_rate": 6.142034548944338e-06,
"loss": 0.6252,
"step": 914
},
{
"epoch": 0.8781190019193857,
"grad_norm": 0.1796875,
"learning_rate": 6.09404990403071e-06,
"loss": 0.6701,
"step": 915
},
{
"epoch": 0.8790786948176583,
"grad_norm": 0.1923828125,
"learning_rate": 6.046065259117083e-06,
"loss": 0.6044,
"step": 916
},
{
"epoch": 0.8800383877159309,
"grad_norm": 0.181640625,
"learning_rate": 5.998080614203455e-06,
"loss": 0.6991,
"step": 917
},
{
"epoch": 0.8809980806142035,
"grad_norm": 0.2001953125,
"learning_rate": 5.950095969289828e-06,
"loss": 0.6853,
"step": 918
},
{
"epoch": 0.8819577735124761,
"grad_norm": 0.208984375,
"learning_rate": 5.9021113243762e-06,
"loss": 0.5957,
"step": 919
},
{
"epoch": 0.8829174664107485,
"grad_norm": 0.1943359375,
"learning_rate": 5.854126679462573e-06,
"loss": 0.6777,
"step": 920
},
{
"epoch": 0.8838771593090211,
"grad_norm": 0.19140625,
"learning_rate": 5.806142034548945e-06,
"loss": 0.6569,
"step": 921
},
{
"epoch": 0.8848368522072937,
"grad_norm": 0.1884765625,
"learning_rate": 5.758157389635318e-06,
"loss": 0.6315,
"step": 922
},
{
"epoch": 0.8857965451055663,
"grad_norm": 0.197265625,
"learning_rate": 5.710172744721689e-06,
"loss": 0.655,
"step": 923
},
{
"epoch": 0.8867562380038387,
"grad_norm": 0.2158203125,
"learning_rate": 5.662188099808062e-06,
"loss": 0.6469,
"step": 924
},
{
"epoch": 0.8877159309021113,
"grad_norm": 0.1923828125,
"learning_rate": 5.614203454894433e-06,
"loss": 0.615,
"step": 925
},
{
"epoch": 0.8886756238003839,
"grad_norm": 0.1865234375,
"learning_rate": 5.566218809980806e-06,
"loss": 0.6481,
"step": 926
},
{
"epoch": 0.8896353166986565,
"grad_norm": 0.1884765625,
"learning_rate": 5.518234165067178e-06,
"loss": 0.6018,
"step": 927
},
{
"epoch": 0.8905950095969289,
"grad_norm": 0.1884765625,
"learning_rate": 5.470249520153551e-06,
"loss": 0.6369,
"step": 928
},
{
"epoch": 0.8915547024952015,
"grad_norm": 0.30078125,
"learning_rate": 5.422264875239923e-06,
"loss": 0.614,
"step": 929
},
{
"epoch": 0.8925143953934741,
"grad_norm": 0.2021484375,
"learning_rate": 5.374280230326296e-06,
"loss": 0.6087,
"step": 930
},
{
"epoch": 0.8934740882917467,
"grad_norm": 0.1953125,
"learning_rate": 5.326295585412668e-06,
"loss": 0.6486,
"step": 931
},
{
"epoch": 0.8944337811900192,
"grad_norm": 0.2080078125,
"learning_rate": 5.278310940499041e-06,
"loss": 0.6569,
"step": 932
},
{
"epoch": 0.8953934740882917,
"grad_norm": 0.185546875,
"learning_rate": 5.230326295585413e-06,
"loss": 0.6594,
"step": 933
},
{
"epoch": 0.8963531669865643,
"grad_norm": 0.189453125,
"learning_rate": 5.182341650671786e-06,
"loss": 0.6214,
"step": 934
},
{
"epoch": 0.8973128598848369,
"grad_norm": 0.1845703125,
"learning_rate": 5.134357005758158e-06,
"loss": 0.6567,
"step": 935
},
{
"epoch": 0.8982725527831094,
"grad_norm": 0.1923828125,
"learning_rate": 5.086372360844531e-06,
"loss": 0.7011,
"step": 936
},
{
"epoch": 0.8992322456813819,
"grad_norm": 0.1865234375,
"learning_rate": 5.038387715930902e-06,
"loss": 0.664,
"step": 937
},
{
"epoch": 0.9001919385796545,
"grad_norm": 0.2177734375,
"learning_rate": 4.990403071017275e-06,
"loss": 0.6461,
"step": 938
},
{
"epoch": 0.9011516314779271,
"grad_norm": 0.1845703125,
"learning_rate": 4.942418426103647e-06,
"loss": 0.7291,
"step": 939
},
{
"epoch": 0.9021113243761996,
"grad_norm": 0.1845703125,
"learning_rate": 4.89443378119002e-06,
"loss": 0.645,
"step": 940
},
{
"epoch": 0.9030710172744721,
"grad_norm": 0.201171875,
"learning_rate": 4.846449136276391e-06,
"loss": 0.6564,
"step": 941
},
{
"epoch": 0.9040307101727447,
"grad_norm": 0.1845703125,
"learning_rate": 4.798464491362764e-06,
"loss": 0.6059,
"step": 942
},
{
"epoch": 0.9049904030710173,
"grad_norm": 0.1884765625,
"learning_rate": 4.750479846449136e-06,
"loss": 0.6725,
"step": 943
},
{
"epoch": 0.9059500959692899,
"grad_norm": 0.189453125,
"learning_rate": 4.702495201535509e-06,
"loss": 0.696,
"step": 944
},
{
"epoch": 0.9069097888675623,
"grad_norm": 0.1943359375,
"learning_rate": 4.654510556621881e-06,
"loss": 0.7019,
"step": 945
},
{
"epoch": 0.9078694817658349,
"grad_norm": 0.1845703125,
"learning_rate": 4.606525911708254e-06,
"loss": 0.6446,
"step": 946
},
{
"epoch": 0.9088291746641075,
"grad_norm": 0.203125,
"learning_rate": 4.558541266794626e-06,
"loss": 0.6118,
"step": 947
},
{
"epoch": 0.9097888675623801,
"grad_norm": 0.3359375,
"learning_rate": 4.510556621880999e-06,
"loss": 0.7057,
"step": 948
},
{
"epoch": 0.9107485604606526,
"grad_norm": 0.19140625,
"learning_rate": 4.46257197696737e-06,
"loss": 0.5763,
"step": 949
},
{
"epoch": 0.9117082533589251,
"grad_norm": 0.283203125,
"learning_rate": 4.414587332053743e-06,
"loss": 0.7103,
"step": 950
},
{
"epoch": 0.9126679462571977,
"grad_norm": 0.193359375,
"learning_rate": 4.366602687140115e-06,
"loss": 0.6363,
"step": 951
},
{
"epoch": 0.9136276391554703,
"grad_norm": 0.234375,
"learning_rate": 4.318618042226488e-06,
"loss": 0.7067,
"step": 952
},
{
"epoch": 0.9145873320537428,
"grad_norm": 0.1865234375,
"learning_rate": 4.27063339731286e-06,
"loss": 0.5826,
"step": 953
},
{
"epoch": 0.9155470249520153,
"grad_norm": 0.1796875,
"learning_rate": 4.222648752399233e-06,
"loss": 0.6715,
"step": 954
},
{
"epoch": 0.9165067178502879,
"grad_norm": 0.341796875,
"learning_rate": 4.174664107485605e-06,
"loss": 0.752,
"step": 955
},
{
"epoch": 0.9174664107485605,
"grad_norm": 0.181640625,
"learning_rate": 4.126679462571977e-06,
"loss": 0.6337,
"step": 956
},
{
"epoch": 0.918426103646833,
"grad_norm": 0.185546875,
"learning_rate": 4.078694817658349e-06,
"loss": 0.6485,
"step": 957
},
{
"epoch": 0.9193857965451055,
"grad_norm": 0.189453125,
"learning_rate": 4.030710172744722e-06,
"loss": 0.6394,
"step": 958
},
{
"epoch": 0.9203454894433781,
"grad_norm": 0.181640625,
"learning_rate": 3.982725527831094e-06,
"loss": 0.6874,
"step": 959
},
{
"epoch": 0.9213051823416507,
"grad_norm": 0.1796875,
"learning_rate": 3.934740882917467e-06,
"loss": 0.6199,
"step": 960
},
{
"epoch": 0.9222648752399232,
"grad_norm": 0.19140625,
"learning_rate": 3.886756238003839e-06,
"loss": 0.6185,
"step": 961
},
{
"epoch": 0.9232245681381958,
"grad_norm": 0.193359375,
"learning_rate": 3.838771593090211e-06,
"loss": 0.6295,
"step": 962
},
{
"epoch": 0.9241842610364683,
"grad_norm": 0.255859375,
"learning_rate": 3.7907869481765834e-06,
"loss": 0.6538,
"step": 963
},
{
"epoch": 0.9251439539347409,
"grad_norm": 0.1875,
"learning_rate": 3.742802303262956e-06,
"loss": 0.6412,
"step": 964
},
{
"epoch": 0.9261036468330134,
"grad_norm": 0.1748046875,
"learning_rate": 3.6948176583493283e-06,
"loss": 0.7516,
"step": 965
},
{
"epoch": 0.927063339731286,
"grad_norm": 0.251953125,
"learning_rate": 3.646833013435701e-06,
"loss": 0.6511,
"step": 966
},
{
"epoch": 0.9280230326295585,
"grad_norm": 0.271484375,
"learning_rate": 3.5988483685220733e-06,
"loss": 0.7215,
"step": 967
},
{
"epoch": 0.9289827255278311,
"grad_norm": 0.1884765625,
"learning_rate": 3.5508637236084453e-06,
"loss": 0.6221,
"step": 968
},
{
"epoch": 0.9299424184261037,
"grad_norm": 0.1826171875,
"learning_rate": 3.502879078694818e-06,
"loss": 0.6791,
"step": 969
},
{
"epoch": 0.9309021113243762,
"grad_norm": 0.1884765625,
"learning_rate": 3.4548944337811903e-06,
"loss": 0.631,
"step": 970
},
{
"epoch": 0.9318618042226487,
"grad_norm": 0.240234375,
"learning_rate": 3.4069097888675628e-06,
"loss": 0.7288,
"step": 971
},
{
"epoch": 0.9328214971209213,
"grad_norm": 0.19140625,
"learning_rate": 3.3589251439539353e-06,
"loss": 0.6897,
"step": 972
},
{
"epoch": 0.9337811900191939,
"grad_norm": 0.1962890625,
"learning_rate": 3.3109404990403077e-06,
"loss": 0.6469,
"step": 973
},
{
"epoch": 0.9347408829174664,
"grad_norm": 0.1923828125,
"learning_rate": 3.2629558541266794e-06,
"loss": 0.6288,
"step": 974
},
{
"epoch": 0.935700575815739,
"grad_norm": 0.181640625,
"learning_rate": 3.214971209213052e-06,
"loss": 0.634,
"step": 975
},
{
"epoch": 0.9366602687140115,
"grad_norm": 0.189453125,
"learning_rate": 3.166986564299424e-06,
"loss": 0.6274,
"step": 976
},
{
"epoch": 0.9376199616122841,
"grad_norm": 0.20703125,
"learning_rate": 3.1190019193857964e-06,
"loss": 0.6977,
"step": 977
},
{
"epoch": 0.9385796545105566,
"grad_norm": 0.1923828125,
"learning_rate": 3.071017274472169e-06,
"loss": 0.6045,
"step": 978
},
{
"epoch": 0.9395393474088292,
"grad_norm": 0.1875,
"learning_rate": 3.0230326295585414e-06,
"loss": 0.6448,
"step": 979
},
{
"epoch": 0.9404990403071017,
"grad_norm": 0.189453125,
"learning_rate": 2.975047984644914e-06,
"loss": 0.6329,
"step": 980
},
{
"epoch": 0.9414587332053743,
"grad_norm": 0.1767578125,
"learning_rate": 2.9270633397312863e-06,
"loss": 0.5696,
"step": 981
},
{
"epoch": 0.9424184261036468,
"grad_norm": 0.1962890625,
"learning_rate": 2.879078694817659e-06,
"loss": 0.671,
"step": 982
},
{
"epoch": 0.9433781190019194,
"grad_norm": 0.1826171875,
"learning_rate": 2.831094049904031e-06,
"loss": 0.6083,
"step": 983
},
{
"epoch": 0.944337811900192,
"grad_norm": 0.1875,
"learning_rate": 2.783109404990403e-06,
"loss": 0.6567,
"step": 984
},
{
"epoch": 0.9452975047984645,
"grad_norm": 0.19140625,
"learning_rate": 2.7351247600767754e-06,
"loss": 0.6734,
"step": 985
},
{
"epoch": 0.946257197696737,
"grad_norm": 0.193359375,
"learning_rate": 2.687140115163148e-06,
"loss": 0.596,
"step": 986
},
{
"epoch": 0.9472168905950096,
"grad_norm": 0.1904296875,
"learning_rate": 2.6391554702495203e-06,
"loss": 0.6234,
"step": 987
},
{
"epoch": 0.9481765834932822,
"grad_norm": 0.1884765625,
"learning_rate": 2.591170825335893e-06,
"loss": 0.6006,
"step": 988
},
{
"epoch": 0.9491362763915547,
"grad_norm": 0.212890625,
"learning_rate": 2.5431861804222653e-06,
"loss": 0.7036,
"step": 989
},
{
"epoch": 0.9500959692898272,
"grad_norm": 0.19140625,
"learning_rate": 2.4952015355086374e-06,
"loss": 0.6851,
"step": 990
},
{
"epoch": 0.9510556621880998,
"grad_norm": 0.189453125,
"learning_rate": 2.44721689059501e-06,
"loss": 0.6142,
"step": 991
},
{
"epoch": 0.9520153550863724,
"grad_norm": 0.310546875,
"learning_rate": 2.399232245681382e-06,
"loss": 0.6545,
"step": 992
},
{
"epoch": 0.9529750479846449,
"grad_norm": 0.2333984375,
"learning_rate": 2.3512476007677544e-06,
"loss": 0.652,
"step": 993
},
{
"epoch": 0.9539347408829175,
"grad_norm": 0.3359375,
"learning_rate": 2.303262955854127e-06,
"loss": 0.604,
"step": 994
},
{
"epoch": 0.95489443378119,
"grad_norm": 0.1875,
"learning_rate": 2.2552783109404993e-06,
"loss": 0.6274,
"step": 995
},
{
"epoch": 0.9558541266794626,
"grad_norm": 0.205078125,
"learning_rate": 2.2072936660268714e-06,
"loss": 0.6445,
"step": 996
},
{
"epoch": 0.9568138195777351,
"grad_norm": 0.1875,
"learning_rate": 2.159309021113244e-06,
"loss": 0.6626,
"step": 997
},
{
"epoch": 0.9577735124760077,
"grad_norm": 0.2265625,
"learning_rate": 2.1113243761996164e-06,
"loss": 0.6961,
"step": 998
},
{
"epoch": 0.9587332053742802,
"grad_norm": 0.1806640625,
"learning_rate": 2.0633397312859884e-06,
"loss": 0.575,
"step": 999
},
{
"epoch": 0.9596928982725528,
"grad_norm": 0.2578125,
"learning_rate": 2.015355086372361e-06,
"loss": 0.5857,
"step": 1000
},
{
"epoch": 0.9606525911708254,
"grad_norm": 0.29296875,
"learning_rate": 1.9673704414587334e-06,
"loss": 0.6737,
"step": 1001
},
{
"epoch": 0.9616122840690979,
"grad_norm": 0.19921875,
"learning_rate": 1.9193857965451054e-06,
"loss": 0.7174,
"step": 1002
},
{
"epoch": 0.9625719769673704,
"grad_norm": 0.18359375,
"learning_rate": 1.871401151631478e-06,
"loss": 0.5874,
"step": 1003
},
{
"epoch": 0.963531669865643,
"grad_norm": 0.18359375,
"learning_rate": 1.8234165067178504e-06,
"loss": 0.5958,
"step": 1004
},
{
"epoch": 0.9644913627639156,
"grad_norm": 0.23828125,
"learning_rate": 1.7754318618042227e-06,
"loss": 0.6721,
"step": 1005
},
{
"epoch": 0.9654510556621881,
"grad_norm": 0.201171875,
"learning_rate": 1.7274472168905951e-06,
"loss": 0.6471,
"step": 1006
},
{
"epoch": 0.9664107485604606,
"grad_norm": 0.177734375,
"learning_rate": 1.6794625719769676e-06,
"loss": 0.6962,
"step": 1007
},
{
"epoch": 0.9673704414587332,
"grad_norm": 0.1962890625,
"learning_rate": 1.6314779270633397e-06,
"loss": 0.6519,
"step": 1008
},
{
"epoch": 0.9683301343570058,
"grad_norm": 0.197265625,
"learning_rate": 1.583493282149712e-06,
"loss": 0.6298,
"step": 1009
},
{
"epoch": 0.9692898272552783,
"grad_norm": 0.23046875,
"learning_rate": 1.5355086372360844e-06,
"loss": 0.768,
"step": 1010
},
{
"epoch": 0.9702495201535508,
"grad_norm": 0.1826171875,
"learning_rate": 1.487523992322457e-06,
"loss": 0.6426,
"step": 1011
},
{
"epoch": 0.9712092130518234,
"grad_norm": 0.2353515625,
"learning_rate": 1.4395393474088294e-06,
"loss": 0.6598,
"step": 1012
},
{
"epoch": 0.972168905950096,
"grad_norm": 0.1826171875,
"learning_rate": 1.3915547024952015e-06,
"loss": 0.5888,
"step": 1013
},
{
"epoch": 0.9731285988483686,
"grad_norm": 0.177734375,
"learning_rate": 1.343570057581574e-06,
"loss": 0.6282,
"step": 1014
},
{
"epoch": 0.974088291746641,
"grad_norm": 0.1767578125,
"learning_rate": 1.2955854126679464e-06,
"loss": 0.654,
"step": 1015
},
{
"epoch": 0.9750479846449136,
"grad_norm": 0.1904296875,
"learning_rate": 1.2476007677543187e-06,
"loss": 0.7413,
"step": 1016
},
{
"epoch": 0.9760076775431862,
"grad_norm": 0.18359375,
"learning_rate": 1.199616122840691e-06,
"loss": 0.5933,
"step": 1017
},
{
"epoch": 0.9769673704414588,
"grad_norm": 0.1943359375,
"learning_rate": 1.1516314779270634e-06,
"loss": 0.6595,
"step": 1018
},
{
"epoch": 0.9779270633397313,
"grad_norm": 0.189453125,
"learning_rate": 1.1036468330134357e-06,
"loss": 0.6091,
"step": 1019
},
{
"epoch": 0.9788867562380038,
"grad_norm": 0.1943359375,
"learning_rate": 1.0556621880998082e-06,
"loss": 0.6702,
"step": 1020
},
{
"epoch": 0.9798464491362764,
"grad_norm": 0.1962890625,
"learning_rate": 1.0076775431861805e-06,
"loss": 0.6348,
"step": 1021
},
{
"epoch": 0.980806142034549,
"grad_norm": 0.1865234375,
"learning_rate": 9.596928982725527e-07,
"loss": 0.7239,
"step": 1022
},
{
"epoch": 0.9817658349328215,
"grad_norm": 0.212890625,
"learning_rate": 9.117082533589252e-07,
"loss": 0.6252,
"step": 1023
},
{
"epoch": 0.982725527831094,
"grad_norm": 0.1826171875,
"learning_rate": 8.637236084452976e-07,
"loss": 0.6495,
"step": 1024
},
{
"epoch": 0.9836852207293666,
"grad_norm": 0.1884765625,
"learning_rate": 8.157389635316698e-07,
"loss": 0.6229,
"step": 1025
},
{
"epoch": 0.9846449136276392,
"grad_norm": 0.255859375,
"learning_rate": 7.677543186180422e-07,
"loss": 0.6672,
"step": 1026
},
{
"epoch": 0.9856046065259118,
"grad_norm": 0.1904296875,
"learning_rate": 7.197696737044147e-07,
"loss": 0.6027,
"step": 1027
},
{
"epoch": 0.9865642994241842,
"grad_norm": 0.1904296875,
"learning_rate": 6.71785028790787e-07,
"loss": 0.6075,
"step": 1028
},
{
"epoch": 0.9875239923224568,
"grad_norm": 0.1982421875,
"learning_rate": 6.238003838771593e-07,
"loss": 0.6318,
"step": 1029
},
{
"epoch": 0.9884836852207294,
"grad_norm": 0.1943359375,
"learning_rate": 5.758157389635317e-07,
"loss": 0.6317,
"step": 1030
},
{
"epoch": 0.989443378119002,
"grad_norm": 0.193359375,
"learning_rate": 5.278310940499041e-07,
"loss": 0.6753,
"step": 1031
},
{
"epoch": 0.9904030710172744,
"grad_norm": 0.220703125,
"learning_rate": 4.798464491362764e-07,
"loss": 0.6664,
"step": 1032
},
{
"epoch": 0.991362763915547,
"grad_norm": 0.1796875,
"learning_rate": 4.318618042226488e-07,
"loss": 0.6223,
"step": 1033
},
{
"epoch": 0.9923224568138196,
"grad_norm": 0.1904296875,
"learning_rate": 3.838771593090211e-07,
"loss": 0.657,
"step": 1034
},
{
"epoch": 0.9932821497120922,
"grad_norm": 0.1826171875,
"learning_rate": 3.358925143953935e-07,
"loss": 0.7154,
"step": 1035
},
{
"epoch": 0.9942418426103646,
"grad_norm": 0.1806640625,
"learning_rate": 2.8790786948176586e-07,
"loss": 0.6366,
"step": 1036
},
{
"epoch": 0.9952015355086372,
"grad_norm": 0.1845703125,
"learning_rate": 2.399232245681382e-07,
"loss": 0.7238,
"step": 1037
},
{
"epoch": 0.9961612284069098,
"grad_norm": 0.1865234375,
"learning_rate": 1.9193857965451055e-07,
"loss": 0.7089,
"step": 1038
},
{
"epoch": 0.9971209213051824,
"grad_norm": 0.1884765625,
"learning_rate": 1.4395393474088293e-07,
"loss": 0.6592,
"step": 1039
},
{
"epoch": 0.9980806142034548,
"grad_norm": 0.189453125,
"learning_rate": 9.596928982725528e-08,
"loss": 0.6543,
"step": 1040
},
{
"epoch": 0.9990403071017274,
"grad_norm": 0.1962890625,
"learning_rate": 4.798464491362764e-08,
"loss": 0.7084,
"step": 1041
},
{
"epoch": 1.0,
"grad_norm": 0.1875,
"learning_rate": 0.0,
"loss": 0.6073,
"step": 1042
}
],
"logging_steps": 1.0,
"max_steps": 1042,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.362946722077606e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}