{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999072097986452,
"eval_steps": 500,
"global_step": 1347,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007423216108378956,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.6654,
"step": 1
},
{
"epoch": 0.0014846432216757911,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.6541,
"step": 2
},
{
"epoch": 0.0022269648325136866,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.4815,
"step": 3
},
{
"epoch": 0.0029692864433515822,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.5496,
"step": 4
},
{
"epoch": 0.0037116080541894775,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.477,
"step": 5
},
{
"epoch": 0.004453929665027373,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.5704,
"step": 6
},
{
"epoch": 0.005196251275865269,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.7545,
"step": 7
},
{
"epoch": 0.0059385728867031645,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.5284,
"step": 8
},
{
"epoch": 0.006680894497541059,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.4984,
"step": 9
},
{
"epoch": 0.007423216108378955,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.576,
"step": 10
},
{
"epoch": 0.008165537719216851,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.8177,
"step": 11
},
{
"epoch": 0.008907859330054746,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.6046,
"step": 12
},
{
"epoch": 0.009650180940892641,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.6048,
"step": 13
},
{
"epoch": 0.010392502551730538,
"grad_norm": 2.162328004837036,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.6391,
"step": 14
},
{
"epoch": 0.011134824162568432,
"grad_norm": 2.17216420173645,
"learning_rate": 8.000000000000001e-07,
"loss": 1.5029,
"step": 15
},
{
"epoch": 0.011877145773406329,
"grad_norm": 2.3341081142425537,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.6207,
"step": 16
},
{
"epoch": 0.012619467384244224,
"grad_norm": 2.0773301124572754,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.5508,
"step": 17
},
{
"epoch": 0.013361788995082119,
"grad_norm": 2.2292685508728027,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.5605,
"step": 18
},
{
"epoch": 0.014104110605920015,
"grad_norm": 1.7689887285232544,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.4014,
"step": 19
},
{
"epoch": 0.01484643221675791,
"grad_norm": 2.2630300521850586,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.6371,
"step": 20
},
{
"epoch": 0.015588753827595806,
"grad_norm": 1.9840623140335083,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.5566,
"step": 21
},
{
"epoch": 0.016331075438433703,
"grad_norm": 2.096895933151245,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.57,
"step": 22
},
{
"epoch": 0.017073397049271598,
"grad_norm": 1.6705613136291504,
"learning_rate": 4.000000000000001e-06,
"loss": 1.5249,
"step": 23
},
{
"epoch": 0.017815718660109493,
"grad_norm": 1.7144721746444702,
"learning_rate": 4.4e-06,
"loss": 1.4803,
"step": 24
},
{
"epoch": 0.018558040270947387,
"grad_norm": 1.4625688791275024,
"learning_rate": 4.800000000000001e-06,
"loss": 1.3866,
"step": 25
},
{
"epoch": 0.019300361881785282,
"grad_norm": 1.5206489562988281,
"learning_rate": 5.2e-06,
"loss": 1.5484,
"step": 26
},
{
"epoch": 0.02004268349262318,
"grad_norm": 1.3581645488739014,
"learning_rate": 5.600000000000001e-06,
"loss": 1.4105,
"step": 27
},
{
"epoch": 0.020785005103461075,
"grad_norm": 1.4587604999542236,
"learning_rate": 6e-06,
"loss": 1.5549,
"step": 28
},
{
"epoch": 0.02152732671429897,
"grad_norm": 1.3189691305160522,
"learning_rate": 6.4000000000000006e-06,
"loss": 1.3761,
"step": 29
},
{
"epoch": 0.022269648325136865,
"grad_norm": 1.3762433528900146,
"learning_rate": 6.800000000000001e-06,
"loss": 1.4088,
"step": 30
},
{
"epoch": 0.02301196993597476,
"grad_norm": 1.269411325454712,
"learning_rate": 7.2000000000000005e-06,
"loss": 1.4693,
"step": 31
},
{
"epoch": 0.023754291546812658,
"grad_norm": 1.4452253580093384,
"learning_rate": 7.600000000000001e-06,
"loss": 1.3392,
"step": 32
},
{
"epoch": 0.024496613157650553,
"grad_norm": 1.1886940002441406,
"learning_rate": 8.000000000000001e-06,
"loss": 1.3498,
"step": 33
},
{
"epoch": 0.025238934768488448,
"grad_norm": 1.0324665307998657,
"learning_rate": 8.400000000000001e-06,
"loss": 1.1602,
"step": 34
},
{
"epoch": 0.025981256379326342,
"grad_norm": 1.0723854303359985,
"learning_rate": 8.8e-06,
"loss": 1.2315,
"step": 35
},
{
"epoch": 0.026723577990164237,
"grad_norm": 1.045143961906433,
"learning_rate": 9.200000000000002e-06,
"loss": 1.1834,
"step": 36
},
{
"epoch": 0.027465899601002135,
"grad_norm": 0.9035683870315552,
"learning_rate": 9.600000000000001e-06,
"loss": 1.2346,
"step": 37
},
{
"epoch": 0.02820822121184003,
"grad_norm": 0.9809949994087219,
"learning_rate": 1e-05,
"loss": 1.161,
"step": 38
},
{
"epoch": 0.028950542822677925,
"grad_norm": 0.7753032445907593,
"learning_rate": 1.04e-05,
"loss": 1.1423,
"step": 39
},
{
"epoch": 0.02969286443351582,
"grad_norm": 0.821992814540863,
"learning_rate": 1.0800000000000002e-05,
"loss": 1.0866,
"step": 40
},
{
"epoch": 0.030435186044353715,
"grad_norm": 0.768587052822113,
"learning_rate": 1.1200000000000001e-05,
"loss": 1.1209,
"step": 41
},
{
"epoch": 0.031177507655191613,
"grad_norm": 0.7687053084373474,
"learning_rate": 1.16e-05,
"loss": 1.0443,
"step": 42
},
{
"epoch": 0.03191982926602951,
"grad_norm": 0.7157124280929565,
"learning_rate": 1.2e-05,
"loss": 1.0,
"step": 43
},
{
"epoch": 0.032662150876867406,
"grad_norm": 0.6085985898971558,
"learning_rate": 1.2400000000000002e-05,
"loss": 1.1363,
"step": 44
},
{
"epoch": 0.0334044724877053,
"grad_norm": 0.5863650441169739,
"learning_rate": 1.2800000000000001e-05,
"loss": 1.0818,
"step": 45
},
{
"epoch": 0.034146794098543196,
"grad_norm": 0.5280351638793945,
"learning_rate": 1.3200000000000002e-05,
"loss": 1.0964,
"step": 46
},
{
"epoch": 0.03488911570938109,
"grad_norm": 0.5869213938713074,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.9784,
"step": 47
},
{
"epoch": 0.035631437320218985,
"grad_norm": 0.5950367450714111,
"learning_rate": 1.4e-05,
"loss": 1.1197,
"step": 48
},
{
"epoch": 0.03637375893105688,
"grad_norm": 0.5516757369041443,
"learning_rate": 1.4400000000000001e-05,
"loss": 1.0141,
"step": 49
},
{
"epoch": 0.037116080541894775,
"grad_norm": 0.4884754717350006,
"learning_rate": 1.48e-05,
"loss": 1.0251,
"step": 50
},
{
"epoch": 0.03785840215273267,
"grad_norm": 0.570762038230896,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.9775,
"step": 51
},
{
"epoch": 0.038600723763570564,
"grad_norm": 0.5657237768173218,
"learning_rate": 1.5600000000000003e-05,
"loss": 1.0276,
"step": 52
},
{
"epoch": 0.03934304537440846,
"grad_norm": 0.584648609161377,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.9453,
"step": 53
},
{
"epoch": 0.04008536698524636,
"grad_norm": 0.4895365238189697,
"learning_rate": 1.64e-05,
"loss": 1.0402,
"step": 54
},
{
"epoch": 0.04082768859608425,
"grad_norm": 0.5166955590248108,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.9825,
"step": 55
},
{
"epoch": 0.04157001020692215,
"grad_norm": 0.5778055787086487,
"learning_rate": 1.72e-05,
"loss": 0.9668,
"step": 56
},
{
"epoch": 0.04231233181776004,
"grad_norm": 0.4417908489704132,
"learning_rate": 1.76e-05,
"loss": 0.9147,
"step": 57
},
{
"epoch": 0.04305465342859794,
"grad_norm": 0.5314612984657288,
"learning_rate": 1.8e-05,
"loss": 0.9739,
"step": 58
},
{
"epoch": 0.04379697503943584,
"grad_norm": 0.44156596064567566,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.9353,
"step": 59
},
{
"epoch": 0.04453929665027373,
"grad_norm": 0.46377748250961304,
"learning_rate": 1.88e-05,
"loss": 1.0251,
"step": 60
},
{
"epoch": 0.04528161826111163,
"grad_norm": 0.5635647177696228,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.9475,
"step": 61
},
{
"epoch": 0.04602393987194952,
"grad_norm": 0.5183905959129333,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.9855,
"step": 62
},
{
"epoch": 0.04676626148278742,
"grad_norm": 0.43670588731765747,
"learning_rate": 2e-05,
"loss": 0.9497,
"step": 63
},
{
"epoch": 0.047508583093625316,
"grad_norm": 0.42160764336586,
"learning_rate": 2.04e-05,
"loss": 0.9568,
"step": 64
},
{
"epoch": 0.04825090470446321,
"grad_norm": 0.46232110261917114,
"learning_rate": 2.08e-05,
"loss": 1.019,
"step": 65
},
{
"epoch": 0.048993226315301106,
"grad_norm": 0.46109461784362793,
"learning_rate": 2.1200000000000004e-05,
"loss": 0.9295,
"step": 66
},
{
"epoch": 0.049735547926139,
"grad_norm": 0.4705204665660858,
"learning_rate": 2.1600000000000003e-05,
"loss": 0.9813,
"step": 67
},
{
"epoch": 0.050477869536976895,
"grad_norm": 0.4682143032550812,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.9802,
"step": 68
},
{
"epoch": 0.05122019114781479,
"grad_norm": 0.5252828598022461,
"learning_rate": 2.2400000000000002e-05,
"loss": 1.002,
"step": 69
},
{
"epoch": 0.051962512758652685,
"grad_norm": 0.38845106959342957,
"learning_rate": 2.28e-05,
"loss": 0.8624,
"step": 70
},
{
"epoch": 0.05270483436949058,
"grad_norm": 0.4243197739124298,
"learning_rate": 2.32e-05,
"loss": 0.8991,
"step": 71
},
{
"epoch": 0.053447155980328474,
"grad_norm": 0.45470067858695984,
"learning_rate": 2.36e-05,
"loss": 0.8961,
"step": 72
},
{
"epoch": 0.05418947759116637,
"grad_norm": 0.5030398964881897,
"learning_rate": 2.4e-05,
"loss": 1.0106,
"step": 73
},
{
"epoch": 0.05493179920200427,
"grad_norm": 0.4755796194076538,
"learning_rate": 2.44e-05,
"loss": 0.9399,
"step": 74
},
{
"epoch": 0.05567412081284216,
"grad_norm": 0.5100265741348267,
"learning_rate": 2.4800000000000003e-05,
"loss": 0.9387,
"step": 75
},
{
"epoch": 0.05641644242368006,
"grad_norm": 0.4337320327758789,
"learning_rate": 2.5200000000000003e-05,
"loss": 0.92,
"step": 76
},
{
"epoch": 0.05715876403451795,
"grad_norm": 0.44164350628852844,
"learning_rate": 2.5600000000000002e-05,
"loss": 0.8263,
"step": 77
},
{
"epoch": 0.05790108564535585,
"grad_norm": 0.3856760561466217,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.9932,
"step": 78
},
{
"epoch": 0.05864340725619375,
"grad_norm": 0.3697056472301483,
"learning_rate": 2.6400000000000005e-05,
"loss": 0.9092,
"step": 79
},
{
"epoch": 0.05938572886703164,
"grad_norm": 0.4331798255443573,
"learning_rate": 2.6800000000000004e-05,
"loss": 0.8742,
"step": 80
},
{
"epoch": 0.06012805047786954,
"grad_norm": 0.49398401379585266,
"learning_rate": 2.7200000000000004e-05,
"loss": 0.9422,
"step": 81
},
{
"epoch": 0.06087037208870743,
"grad_norm": 0.3899066150188446,
"learning_rate": 2.76e-05,
"loss": 0.9762,
"step": 82
},
{
"epoch": 0.06161269369954533,
"grad_norm": 0.4635457992553711,
"learning_rate": 2.8e-05,
"loss": 0.7991,
"step": 83
},
{
"epoch": 0.062355015310383226,
"grad_norm": 0.4666687548160553,
"learning_rate": 2.8400000000000003e-05,
"loss": 0.8868,
"step": 84
},
{
"epoch": 0.06309733692122112,
"grad_norm": 0.3857990801334381,
"learning_rate": 2.8800000000000002e-05,
"loss": 0.8029,
"step": 85
},
{
"epoch": 0.06383965853205902,
"grad_norm": 0.44243311882019043,
"learning_rate": 2.92e-05,
"loss": 0.9814,
"step": 86
},
{
"epoch": 0.06458198014289691,
"grad_norm": 0.45594194531440735,
"learning_rate": 2.96e-05,
"loss": 0.9913,
"step": 87
},
{
"epoch": 0.06532430175373481,
"grad_norm": 0.43604806065559387,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.8859,
"step": 88
},
{
"epoch": 0.0660666233645727,
"grad_norm": 0.41517373919487,
"learning_rate": 3.0400000000000004e-05,
"loss": 0.8824,
"step": 89
},
{
"epoch": 0.0668089449754106,
"grad_norm": 0.4230550229549408,
"learning_rate": 3.08e-05,
"loss": 0.9572,
"step": 90
},
{
"epoch": 0.06755126658624849,
"grad_norm": 0.42611706256866455,
"learning_rate": 3.1200000000000006e-05,
"loss": 0.9215,
"step": 91
},
{
"epoch": 0.06829358819708639,
"grad_norm": 0.39391592144966125,
"learning_rate": 3.16e-05,
"loss": 0.8478,
"step": 92
},
{
"epoch": 0.06903590980792429,
"grad_norm": 0.39002394676208496,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.9072,
"step": 93
},
{
"epoch": 0.06977823141876217,
"grad_norm": 0.4524936079978943,
"learning_rate": 3.24e-05,
"loss": 0.899,
"step": 94
},
{
"epoch": 0.07052055302960007,
"grad_norm": 0.4666615128517151,
"learning_rate": 3.28e-05,
"loss": 0.9768,
"step": 95
},
{
"epoch": 0.07126287464043797,
"grad_norm": 0.4445357322692871,
"learning_rate": 3.32e-05,
"loss": 0.9929,
"step": 96
},
{
"epoch": 0.07200519625127587,
"grad_norm": 0.5923082232475281,
"learning_rate": 3.3600000000000004e-05,
"loss": 0.86,
"step": 97
},
{
"epoch": 0.07274751786211377,
"grad_norm": 0.41096585988998413,
"learning_rate": 3.4e-05,
"loss": 1.0012,
"step": 98
},
{
"epoch": 0.07348983947295165,
"grad_norm": 0.4832744896411896,
"learning_rate": 3.44e-05,
"loss": 0.8828,
"step": 99
},
{
"epoch": 0.07423216108378955,
"grad_norm": 0.4651799499988556,
"learning_rate": 3.4800000000000006e-05,
"loss": 0.9219,
"step": 100
},
{
"epoch": 0.07497448269462745,
"grad_norm": 0.4387566149234772,
"learning_rate": 3.52e-05,
"loss": 0.9121,
"step": 101
},
{
"epoch": 0.07571680430546535,
"grad_norm": 0.3984525501728058,
"learning_rate": 3.5600000000000005e-05,
"loss": 0.8894,
"step": 102
},
{
"epoch": 0.07645912591630324,
"grad_norm": 0.4379989206790924,
"learning_rate": 3.6e-05,
"loss": 0.9295,
"step": 103
},
{
"epoch": 0.07720144752714113,
"grad_norm": 0.44189852476119995,
"learning_rate": 3.6400000000000004e-05,
"loss": 0.8774,
"step": 104
},
{
"epoch": 0.07794376913797903,
"grad_norm": 0.4428102672100067,
"learning_rate": 3.680000000000001e-05,
"loss": 0.9494,
"step": 105
},
{
"epoch": 0.07868609074881693,
"grad_norm": 0.42305949330329895,
"learning_rate": 3.72e-05,
"loss": 0.8043,
"step": 106
},
{
"epoch": 0.07942841235965482,
"grad_norm": 0.44839444756507874,
"learning_rate": 3.76e-05,
"loss": 0.896,
"step": 107
},
{
"epoch": 0.08017073397049272,
"grad_norm": 0.5253618955612183,
"learning_rate": 3.8e-05,
"loss": 0.8899,
"step": 108
},
{
"epoch": 0.0809130555813306,
"grad_norm": 0.4362352192401886,
"learning_rate": 3.8400000000000005e-05,
"loss": 0.8541,
"step": 109
},
{
"epoch": 0.0816553771921685,
"grad_norm": 0.4516163468360901,
"learning_rate": 3.88e-05,
"loss": 0.9253,
"step": 110
},
{
"epoch": 0.0823976988030064,
"grad_norm": 0.5379387736320496,
"learning_rate": 3.9200000000000004e-05,
"loss": 0.9011,
"step": 111
},
{
"epoch": 0.0831400204138443,
"grad_norm": 0.48964133858680725,
"learning_rate": 3.96e-05,
"loss": 0.8902,
"step": 112
},
{
"epoch": 0.0838823420246822,
"grad_norm": 0.5008836984634399,
"learning_rate": 4e-05,
"loss": 0.8841,
"step": 113
},
{
"epoch": 0.08462466363552008,
"grad_norm": 0.47988569736480713,
"learning_rate": 3.9999985332376666e-05,
"loss": 0.9657,
"step": 114
},
{
"epoch": 0.08536698524635798,
"grad_norm": 0.4180363714694977,
"learning_rate": 3.999994132952817e-05,
"loss": 0.8685,
"step": 115
},
{
"epoch": 0.08610930685719588,
"grad_norm": 0.42892715334892273,
"learning_rate": 3.999986799151905e-05,
"loss": 0.8255,
"step": 116
},
{
"epoch": 0.08685162846803378,
"grad_norm": 0.5795795917510986,
"learning_rate": 3.999976531845688e-05,
"loss": 0.8694,
"step": 117
},
{
"epoch": 0.08759395007887168,
"grad_norm": 0.4624157249927521,
"learning_rate": 3.9999633310492266e-05,
"loss": 0.9485,
"step": 118
},
{
"epoch": 0.08833627168970956,
"grad_norm": 0.4330659806728363,
"learning_rate": 3.999947196781881e-05,
"loss": 0.9166,
"step": 119
},
{
"epoch": 0.08907859330054746,
"grad_norm": 0.46188855171203613,
"learning_rate": 3.999928129067319e-05,
"loss": 0.8938,
"step": 120
},
{
"epoch": 0.08982091491138536,
"grad_norm": 0.45563969016075134,
"learning_rate": 3.999906127933506e-05,
"loss": 0.9488,
"step": 121
},
{
"epoch": 0.09056323652222326,
"grad_norm": 0.46216803789138794,
"learning_rate": 3.999881193412714e-05,
"loss": 0.9242,
"step": 122
},
{
"epoch": 0.09130555813306115,
"grad_norm": 0.40392884612083435,
"learning_rate": 3.999853325541516e-05,
"loss": 1.0076,
"step": 123
},
{
"epoch": 0.09204787974389904,
"grad_norm": 0.5220507979393005,
"learning_rate": 3.999822524360787e-05,
"loss": 0.8663,
"step": 124
},
{
"epoch": 0.09279020135473694,
"grad_norm": 0.49559637904167175,
"learning_rate": 3.999788789915705e-05,
"loss": 0.8733,
"step": 125
},
{
"epoch": 0.09353252296557484,
"grad_norm": 0.5094253420829773,
"learning_rate": 3.9997521222557496e-05,
"loss": 0.9381,
"step": 126
},
{
"epoch": 0.09427484457641273,
"grad_norm": 0.4276386797428131,
"learning_rate": 3.999712521434705e-05,
"loss": 0.8812,
"step": 127
},
{
"epoch": 0.09501716618725063,
"grad_norm": 0.49965962767601013,
"learning_rate": 3.999669987510656e-05,
"loss": 1.0346,
"step": 128
},
{
"epoch": 0.09575948779808852,
"grad_norm": 0.442796528339386,
"learning_rate": 3.9996245205459894e-05,
"loss": 0.8995,
"step": 129
},
{
"epoch": 0.09650180940892641,
"grad_norm": 0.47127169370651245,
"learning_rate": 3.999576120607394e-05,
"loss": 0.925,
"step": 130
},
{
"epoch": 0.09724413101976431,
"grad_norm": 0.42022350430488586,
"learning_rate": 3.999524787765862e-05,
"loss": 0.9539,
"step": 131
},
{
"epoch": 0.09798645263060221,
"grad_norm": 0.4752717614173889,
"learning_rate": 3.999470522096685e-05,
"loss": 0.8577,
"step": 132
},
{
"epoch": 0.09872877424144011,
"grad_norm": 0.5515152812004089,
"learning_rate": 3.99941332367946e-05,
"loss": 0.8861,
"step": 133
},
{
"epoch": 0.099471095852278,
"grad_norm": 0.49411219358444214,
"learning_rate": 3.9993531925980816e-05,
"loss": 0.9313,
"step": 134
},
{
"epoch": 0.10021341746311589,
"grad_norm": 0.49033278226852417,
"learning_rate": 3.9992901289407486e-05,
"loss": 0.9737,
"step": 135
},
{
"epoch": 0.10095573907395379,
"grad_norm": 0.5231755375862122,
"learning_rate": 3.9992241327999596e-05,
"loss": 0.9684,
"step": 136
},
{
"epoch": 0.10169806068479169,
"grad_norm": 0.443928062915802,
"learning_rate": 3.999155204272517e-05,
"loss": 0.8833,
"step": 137
},
{
"epoch": 0.10244038229562959,
"grad_norm": 0.4927554726600647,
"learning_rate": 3.9990833434595204e-05,
"loss": 0.9229,
"step": 138
},
{
"epoch": 0.10318270390646747,
"grad_norm": 0.4793432950973511,
"learning_rate": 3.999008550466374e-05,
"loss": 0.8628,
"step": 139
},
{
"epoch": 0.10392502551730537,
"grad_norm": 0.44405823945999146,
"learning_rate": 3.998930825402781e-05,
"loss": 0.9081,
"step": 140
},
{
"epoch": 0.10466734712814327,
"grad_norm": 0.46231722831726074,
"learning_rate": 3.998850168382746e-05,
"loss": 0.9286,
"step": 141
},
{
"epoch": 0.10540966873898117,
"grad_norm": 0.42269936203956604,
"learning_rate": 3.9987665795245727e-05,
"loss": 0.9151,
"step": 142
},
{
"epoch": 0.10615199034981906,
"grad_norm": 0.44719186425209045,
"learning_rate": 3.998680058950867e-05,
"loss": 0.9411,
"step": 143
},
{
"epoch": 0.10689431196065695,
"grad_norm": 0.4178447127342224,
"learning_rate": 3.998590606788533e-05,
"loss": 0.9347,
"step": 144
},
{
"epoch": 0.10763663357149485,
"grad_norm": 0.5274525284767151,
"learning_rate": 3.9984982231687765e-05,
"loss": 0.9379,
"step": 145
},
{
"epoch": 0.10837895518233275,
"grad_norm": 0.4062984883785248,
"learning_rate": 3.9984029082271024e-05,
"loss": 0.7217,
"step": 146
},
{
"epoch": 0.10912127679317064,
"grad_norm": 0.42657554149627686,
"learning_rate": 3.998304662103315e-05,
"loss": 0.9075,
"step": 147
},
{
"epoch": 0.10986359840400854,
"grad_norm": 0.38112106919288635,
"learning_rate": 3.9982034849415174e-05,
"loss": 0.891,
"step": 148
},
{
"epoch": 0.11060592001484643,
"grad_norm": 0.4420554041862488,
"learning_rate": 3.998099376890114e-05,
"loss": 0.8228,
"step": 149
},
{
"epoch": 0.11134824162568432,
"grad_norm": 0.4574986696243286,
"learning_rate": 3.997992338101805e-05,
"loss": 0.8934,
"step": 150
},
{
"epoch": 0.11209056323652222,
"grad_norm": 0.5493448376655579,
"learning_rate": 3.997882368733591e-05,
"loss": 0.8621,
"step": 151
},
{
"epoch": 0.11283288484736012,
"grad_norm": 0.46607592701911926,
"learning_rate": 3.9977694689467714e-05,
"loss": 0.9219,
"step": 152
},
{
"epoch": 0.11357520645819802,
"grad_norm": 0.3952650725841522,
"learning_rate": 3.997653638906943e-05,
"loss": 0.9321,
"step": 153
},
{
"epoch": 0.1143175280690359,
"grad_norm": 0.45397239923477173,
"learning_rate": 3.997534878784002e-05,
"loss": 0.9325,
"step": 154
},
{
"epoch": 0.1150598496798738,
"grad_norm": 0.4392563998699188,
"learning_rate": 3.99741318875214e-05,
"loss": 0.9588,
"step": 155
},
{
"epoch": 0.1158021712907117,
"grad_norm": 0.44695863127708435,
"learning_rate": 3.997288568989848e-05,
"loss": 0.8417,
"step": 156
},
{
"epoch": 0.1165444929015496,
"grad_norm": 0.42836833000183105,
"learning_rate": 3.997161019679913e-05,
"loss": 0.9387,
"step": 157
},
{
"epoch": 0.1172868145123875,
"grad_norm": 0.493874728679657,
"learning_rate": 3.9970305410094206e-05,
"loss": 0.8795,
"step": 158
},
{
"epoch": 0.11802913612322538,
"grad_norm": 0.4306289851665497,
"learning_rate": 3.99689713316975e-05,
"loss": 0.9509,
"step": 159
},
{
"epoch": 0.11877145773406328,
"grad_norm": 0.512447714805603,
"learning_rate": 3.996760796356581e-05,
"loss": 1.0612,
"step": 160
},
{
"epoch": 0.11951377934490118,
"grad_norm": 0.42709997296333313,
"learning_rate": 3.9966215307698865e-05,
"loss": 0.8967,
"step": 161
},
{
"epoch": 0.12025610095573908,
"grad_norm": 0.4356139600276947,
"learning_rate": 3.996479336613936e-05,
"loss": 0.9122,
"step": 162
},
{
"epoch": 0.12099842256657697,
"grad_norm": 0.447262167930603,
"learning_rate": 3.996334214097294e-05,
"loss": 0.8335,
"step": 163
},
{
"epoch": 0.12174074417741486,
"grad_norm": 0.45617246627807617,
"learning_rate": 3.996186163432822e-05,
"loss": 0.9416,
"step": 164
},
{
"epoch": 0.12248306578825276,
"grad_norm": 0.4708889424800873,
"learning_rate": 3.996035184837674e-05,
"loss": 0.9881,
"step": 165
},
{
"epoch": 0.12322538739909066,
"grad_norm": 0.43020421266555786,
"learning_rate": 3.9958812785332994e-05,
"loss": 0.9952,
"step": 166
},
{
"epoch": 0.12396770900992855,
"grad_norm": 0.4495503306388855,
"learning_rate": 3.9957244447454436e-05,
"loss": 0.8973,
"step": 167
},
{
"epoch": 0.12471003062076645,
"grad_norm": 0.54659503698349,
"learning_rate": 3.9955646837041435e-05,
"loss": 0.9545,
"step": 168
},
{
"epoch": 0.12545235223160434,
"grad_norm": 0.5163118839263916,
"learning_rate": 3.9954019956437304e-05,
"loss": 0.9693,
"step": 169
},
{
"epoch": 0.12619467384244223,
"grad_norm": 0.4061867594718933,
"learning_rate": 3.99523638080283e-05,
"loss": 0.884,
"step": 170
},
{
"epoch": 0.12693699545328013,
"grad_norm": 0.3987230658531189,
"learning_rate": 3.995067839424359e-05,
"loss": 0.9333,
"step": 171
},
{
"epoch": 0.12767931706411803,
"grad_norm": 0.41483163833618164,
"learning_rate": 3.994896371755528e-05,
"loss": 0.8577,
"step": 172
},
{
"epoch": 0.12842163867495593,
"grad_norm": 0.5029232501983643,
"learning_rate": 3.9947219780478385e-05,
"loss": 0.9523,
"step": 173
},
{
"epoch": 0.12916396028579383,
"grad_norm": 0.5120161175727844,
"learning_rate": 3.994544658557086e-05,
"loss": 0.9828,
"step": 174
},
{
"epoch": 0.12990628189663173,
"grad_norm": 0.4548787474632263,
"learning_rate": 3.994364413543356e-05,
"loss": 0.9701,
"step": 175
},
{
"epoch": 0.13064860350746962,
"grad_norm": 0.5206537246704102,
"learning_rate": 3.9941812432710234e-05,
"loss": 0.9157,
"step": 176
},
{
"epoch": 0.1313909251183075,
"grad_norm": 0.3944074511528015,
"learning_rate": 3.993995148008757e-05,
"loss": 0.8267,
"step": 177
},
{
"epoch": 0.1321332467291454,
"grad_norm": 0.4675633907318115,
"learning_rate": 3.993806128029513e-05,
"loss": 0.8274,
"step": 178
},
{
"epoch": 0.1328755683399833,
"grad_norm": 0.5009557008743286,
"learning_rate": 3.9936141836105407e-05,
"loss": 0.9023,
"step": 179
},
{
"epoch": 0.1336178899508212,
"grad_norm": 0.43380340933799744,
"learning_rate": 3.9934193150333754e-05,
"loss": 0.7749,
"step": 180
},
{
"epoch": 0.1343602115616591,
"grad_norm": 0.4866674542427063,
"learning_rate": 3.9932215225838436e-05,
"loss": 0.9514,
"step": 181
},
{
"epoch": 0.13510253317249699,
"grad_norm": 0.40783071517944336,
"learning_rate": 3.9930208065520595e-05,
"loss": 0.8255,
"step": 182
},
{
"epoch": 0.13584485478333488,
"grad_norm": 0.46471208333969116,
"learning_rate": 3.992817167232426e-05,
"loss": 0.9077,
"step": 183
},
{
"epoch": 0.13658717639417278,
"grad_norm": 0.4536527395248413,
"learning_rate": 3.9926106049236345e-05,
"loss": 0.8559,
"step": 184
},
{
"epoch": 0.13732949800501068,
"grad_norm": 0.38513433933258057,
"learning_rate": 3.99240111992866e-05,
"loss": 0.8146,
"step": 185
},
{
"epoch": 0.13807181961584858,
"grad_norm": 0.4751570522785187,
"learning_rate": 3.9921887125547704e-05,
"loss": 0.9198,
"step": 186
},
{
"epoch": 0.13881414122668645,
"grad_norm": 0.4330991804599762,
"learning_rate": 3.9919733831135156e-05,
"loss": 0.9203,
"step": 187
},
{
"epoch": 0.13955646283752435,
"grad_norm": 0.4104817807674408,
"learning_rate": 3.991755131920732e-05,
"loss": 0.8185,
"step": 188
},
{
"epoch": 0.14029878444836225,
"grad_norm": 0.44837674498558044,
"learning_rate": 3.9915339592965436e-05,
"loss": 0.9184,
"step": 189
},
{
"epoch": 0.14104110605920014,
"grad_norm": 0.44174924492836,
"learning_rate": 3.991309865565357e-05,
"loss": 0.8141,
"step": 190
},
{
"epoch": 0.14178342767003804,
"grad_norm": 0.5452368855476379,
"learning_rate": 3.991082851055864e-05,
"loss": 0.9376,
"step": 191
},
{
"epoch": 0.14252574928087594,
"grad_norm": 0.5171647071838379,
"learning_rate": 3.9908529161010425e-05,
"loss": 0.9844,
"step": 192
},
{
"epoch": 0.14326807089171384,
"grad_norm": 0.46090564131736755,
"learning_rate": 3.990620061038152e-05,
"loss": 0.8576,
"step": 193
},
{
"epoch": 0.14401039250255174,
"grad_norm": 0.4665868580341339,
"learning_rate": 3.990384286208734e-05,
"loss": 0.9254,
"step": 194
},
{
"epoch": 0.14475271411338964,
"grad_norm": 0.41708534955978394,
"learning_rate": 3.9901455919586165e-05,
"loss": 0.865,
"step": 195
},
{
"epoch": 0.14549503572422753,
"grad_norm": 0.4729004502296448,
"learning_rate": 3.989903978637905e-05,
"loss": 0.9742,
"step": 196
},
{
"epoch": 0.1462373573350654,
"grad_norm": 0.45266857743263245,
"learning_rate": 3.989659446600991e-05,
"loss": 0.8797,
"step": 197
},
{
"epoch": 0.1469796789459033,
"grad_norm": 0.45862093567848206,
"learning_rate": 3.989411996206543e-05,
"loss": 0.8965,
"step": 198
},
{
"epoch": 0.1477220005567412,
"grad_norm": 0.4977729618549347,
"learning_rate": 3.9891616278175134e-05,
"loss": 0.8884,
"step": 199
},
{
"epoch": 0.1484643221675791,
"grad_norm": 0.43955883383750916,
"learning_rate": 3.988908341801131e-05,
"loss": 0.9651,
"step": 200
},
{
"epoch": 0.149206643778417,
"grad_norm": 0.4800843894481659,
"learning_rate": 3.9886521385289086e-05,
"loss": 0.836,
"step": 201
},
{
"epoch": 0.1499489653892549,
"grad_norm": 0.4543246924877167,
"learning_rate": 3.988393018376635e-05,
"loss": 0.975,
"step": 202
},
{
"epoch": 0.1506912870000928,
"grad_norm": 0.44506338238716125,
"learning_rate": 3.9881309817243766e-05,
"loss": 0.8872,
"step": 203
},
{
"epoch": 0.1514336086109307,
"grad_norm": 0.45602262020111084,
"learning_rate": 3.98786602895648e-05,
"loss": 0.8382,
"step": 204
},
{
"epoch": 0.1521759302217686,
"grad_norm": 0.4463014602661133,
"learning_rate": 3.9875981604615666e-05,
"loss": 0.8192,
"step": 205
},
{
"epoch": 0.1529182518326065,
"grad_norm": 0.42010006308555603,
"learning_rate": 3.9873273766325374e-05,
"loss": 0.9253,
"step": 206
},
{
"epoch": 0.15366057344344436,
"grad_norm": 0.4275091588497162,
"learning_rate": 3.987053677866568e-05,
"loss": 0.791,
"step": 207
},
{
"epoch": 0.15440289505428226,
"grad_norm": 0.505247175693512,
"learning_rate": 3.9867770645651084e-05,
"loss": 0.8528,
"step": 208
},
{
"epoch": 0.15514521666512016,
"grad_norm": 0.42931032180786133,
"learning_rate": 3.986497537133885e-05,
"loss": 0.8356,
"step": 209
},
{
"epoch": 0.15588753827595805,
"grad_norm": 0.6309205293655396,
"learning_rate": 3.986215095982897e-05,
"loss": 0.946,
"step": 210
},
{
"epoch": 0.15662985988679595,
"grad_norm": 0.44027337431907654,
"learning_rate": 3.985929741526421e-05,
"loss": 0.8109,
"step": 211
},
{
"epoch": 0.15737218149763385,
"grad_norm": 0.44372624158859253,
"learning_rate": 3.985641474183002e-05,
"loss": 0.9662,
"step": 212
},
{
"epoch": 0.15811450310847175,
"grad_norm": 0.4606941044330597,
"learning_rate": 3.985350294375461e-05,
"loss": 0.8919,
"step": 213
},
{
"epoch": 0.15885682471930965,
"grad_norm": 0.3926544189453125,
"learning_rate": 3.9850562025308874e-05,
"loss": 0.8643,
"step": 214
},
{
"epoch": 0.15959914633014755,
"grad_norm": 0.4373493492603302,
"learning_rate": 3.9847591990806466e-05,
"loss": 0.9673,
"step": 215
},
{
"epoch": 0.16034146794098544,
"grad_norm": 0.4592650830745697,
"learning_rate": 3.984459284460371e-05,
"loss": 0.9397,
"step": 216
},
{
"epoch": 0.16108378955182331,
"grad_norm": 0.43196824193000793,
"learning_rate": 3.984156459109965e-05,
"loss": 0.887,
"step": 217
},
{
"epoch": 0.1618261111626612,
"grad_norm": 0.44357168674468994,
"learning_rate": 3.983850723473599e-05,
"loss": 0.9653,
"step": 218
},
{
"epoch": 0.1625684327734991,
"grad_norm": 0.39339280128479004,
"learning_rate": 3.983542077999717e-05,
"loss": 0.7815,
"step": 219
},
{
"epoch": 0.163310754384337,
"grad_norm": 0.3603425920009613,
"learning_rate": 3.983230523141027e-05,
"loss": 0.8622,
"step": 220
},
{
"epoch": 0.1640530759951749,
"grad_norm": 0.4492432177066803,
"learning_rate": 3.982916059354507e-05,
"loss": 0.8553,
"step": 221
},
{
"epoch": 0.1647953976060128,
"grad_norm": 0.4893001317977905,
"learning_rate": 3.9825986871014e-05,
"loss": 0.8892,
"step": 222
},
{
"epoch": 0.1655377192168507,
"grad_norm": 0.4576875865459442,
"learning_rate": 3.9822784068472155e-05,
"loss": 0.8574,
"step": 223
},
{
"epoch": 0.1662800408276886,
"grad_norm": 0.47724637389183044,
"learning_rate": 3.981955219061729e-05,
"loss": 0.8827,
"step": 224
},
{
"epoch": 0.1670223624385265,
"grad_norm": 0.4396674931049347,
"learning_rate": 3.98162912421898e-05,
"loss": 0.9252,
"step": 225
},
{
"epoch": 0.1677646840493644,
"grad_norm": 0.4329068958759308,
"learning_rate": 3.981300122797273e-05,
"loss": 0.9277,
"step": 226
},
{
"epoch": 0.16850700566020227,
"grad_norm": 0.4214411973953247,
"learning_rate": 3.980968215279173e-05,
"loss": 0.8663,
"step": 227
},
{
"epoch": 0.16924932727104017,
"grad_norm": 0.4411622881889343,
"learning_rate": 3.980633402151511e-05,
"loss": 0.9531,
"step": 228
},
{
"epoch": 0.16999164888187807,
"grad_norm": 0.48397454619407654,
"learning_rate": 3.980295683905378e-05,
"loss": 0.8853,
"step": 229
},
{
"epoch": 0.17073397049271596,
"grad_norm": 0.4269099831581116,
"learning_rate": 3.979955061036125e-05,
"loss": 0.7328,
"step": 230
},
{
"epoch": 0.17147629210355386,
"grad_norm": 0.42758798599243164,
"learning_rate": 3.979611534043367e-05,
"loss": 0.8891,
"step": 231
},
{
"epoch": 0.17221861371439176,
"grad_norm": 0.42514568567276,
"learning_rate": 3.979265103430975e-05,
"loss": 0.7428,
"step": 232
},
{
"epoch": 0.17296093532522966,
"grad_norm": 0.40256327390670776,
"learning_rate": 3.978915769707081e-05,
"loss": 0.8206,
"step": 233
},
{
"epoch": 0.17370325693606756,
"grad_norm": 0.4632292687892914,
"learning_rate": 3.9785635333840746e-05,
"loss": 0.8407,
"step": 234
},
{
"epoch": 0.17444557854690546,
"grad_norm": 0.41627851128578186,
"learning_rate": 3.9782083949786026e-05,
"loss": 0.9274,
"step": 235
},
{
"epoch": 0.17518790015774335,
"grad_norm": 0.416707843542099,
"learning_rate": 3.977850355011568e-05,
"loss": 0.8109,
"step": 236
},
{
"epoch": 0.17593022176858122,
"grad_norm": 0.4402737319469452,
"learning_rate": 3.977489414008131e-05,
"loss": 0.7994,
"step": 237
},
{
"epoch": 0.17667254337941912,
"grad_norm": 0.4560914635658264,
"learning_rate": 3.9771255724977065e-05,
"loss": 0.8921,
"step": 238
},
{
"epoch": 0.17741486499025702,
"grad_norm": 0.42798957228660583,
"learning_rate": 3.9767588310139625e-05,
"loss": 0.9176,
"step": 239
},
{
"epoch": 0.17815718660109492,
"grad_norm": 0.5003705024719238,
"learning_rate": 3.976389190094823e-05,
"loss": 0.8734,
"step": 240
},
{
"epoch": 0.17889950821193282,
"grad_norm": 0.45051515102386475,
"learning_rate": 3.976016650282462e-05,
"loss": 0.8313,
"step": 241
},
{
"epoch": 0.17964182982277072,
"grad_norm": 0.4295995831489563,
"learning_rate": 3.975641212123308e-05,
"loss": 0.9299,
"step": 242
},
{
"epoch": 0.1803841514336086,
"grad_norm": 0.451457679271698,
"learning_rate": 3.975262876168039e-05,
"loss": 0.9284,
"step": 243
},
{
"epoch": 0.1811264730444465,
"grad_norm": 0.4843652546405792,
"learning_rate": 3.974881642971584e-05,
"loss": 0.861,
"step": 244
},
{
"epoch": 0.1818687946552844,
"grad_norm": 0.4577447474002838,
"learning_rate": 3.9744975130931214e-05,
"loss": 0.8976,
"step": 245
},
{
"epoch": 0.1826111162661223,
"grad_norm": 0.40924352407455444,
"learning_rate": 3.974110487096079e-05,
"loss": 0.7861,
"step": 246
},
{
"epoch": 0.18335343787696018,
"grad_norm": 0.49304714798927307,
"learning_rate": 3.973720565548131e-05,
"loss": 0.8693,
"step": 247
},
{
"epoch": 0.18409575948779808,
"grad_norm": 0.4253219664096832,
"learning_rate": 3.9733277490212e-05,
"loss": 0.8902,
"step": 248
},
{
"epoch": 0.18483808109863598,
"grad_norm": 0.44300004839897156,
"learning_rate": 3.9729320380914546e-05,
"loss": 0.8948,
"step": 249
},
{
"epoch": 0.18558040270947387,
"grad_norm": 0.4048704504966736,
"learning_rate": 3.972533433339309e-05,
"loss": 0.8657,
"step": 250
},
{
"epoch": 0.18632272432031177,
"grad_norm": 0.4248516261577606,
"learning_rate": 3.972131935349421e-05,
"loss": 0.8626,
"step": 251
},
{
"epoch": 0.18706504593114967,
"grad_norm": 0.41515034437179565,
"learning_rate": 3.9717275447106936e-05,
"loss": 0.8483,
"step": 252
},
{
"epoch": 0.18780736754198757,
"grad_norm": 0.4110088348388672,
"learning_rate": 3.9713202620162704e-05,
"loss": 0.8291,
"step": 253
},
{
"epoch": 0.18854968915282547,
"grad_norm": 0.4878021478652954,
"learning_rate": 3.9709100878635397e-05,
"loss": 0.88,
"step": 254
},
{
"epoch": 0.18929201076366337,
"grad_norm": 0.4222434163093567,
"learning_rate": 3.970497022854129e-05,
"loss": 0.855,
"step": 255
},
{
"epoch": 0.19003433237450126,
"grad_norm": 0.40751904249191284,
"learning_rate": 3.9700810675939067e-05,
"loss": 0.8315,
"step": 256
},
{
"epoch": 0.19077665398533916,
"grad_norm": 0.431059330701828,
"learning_rate": 3.969662222692979e-05,
"loss": 0.7909,
"step": 257
},
{
"epoch": 0.19151897559617703,
"grad_norm": 0.43389439582824707,
"learning_rate": 3.9692404887656936e-05,
"loss": 0.9269,
"step": 258
},
{
"epoch": 0.19226129720701493,
"grad_norm": 0.4089961051940918,
"learning_rate": 3.9688158664306333e-05,
"loss": 0.8458,
"step": 259
},
{
"epoch": 0.19300361881785283,
"grad_norm": 0.43871837854385376,
"learning_rate": 3.968388356310618e-05,
"loss": 0.8475,
"step": 260
},
{
"epoch": 0.19374594042869073,
"grad_norm": 0.4958783686161041,
"learning_rate": 3.967957959032703e-05,
"loss": 0.9589,
"step": 261
},
{
"epoch": 0.19448826203952863,
"grad_norm": 0.47255924344062805,
"learning_rate": 3.96752467522818e-05,
"loss": 0.8113,
"step": 262
},
{
"epoch": 0.19523058365036652,
"grad_norm": 0.39204686880111694,
"learning_rate": 3.967088505532572e-05,
"loss": 0.8786,
"step": 263
},
{
"epoch": 0.19597290526120442,
"grad_norm": 0.4337204396724701,
"learning_rate": 3.966649450585637e-05,
"loss": 0.8074,
"step": 264
},
{
"epoch": 0.19671522687204232,
"grad_norm": 0.44740915298461914,
"learning_rate": 3.966207511031365e-05,
"loss": 0.9015,
"step": 265
},
{
"epoch": 0.19745754848288022,
"grad_norm": 0.4619744122028351,
"learning_rate": 3.9657626875179746e-05,
"loss": 0.8584,
"step": 266
},
{
"epoch": 0.19819987009371812,
"grad_norm": 0.4195767641067505,
"learning_rate": 3.9653149806979174e-05,
"loss": 0.9042,
"step": 267
},
{
"epoch": 0.198942191704556,
"grad_norm": 0.3937821388244629,
"learning_rate": 3.9648643912278726e-05,
"loss": 0.9548,
"step": 268
},
{
"epoch": 0.19968451331539389,
"grad_norm": 0.4471302032470703,
"learning_rate": 3.9644109197687475e-05,
"loss": 0.9643,
"step": 269
},
{
"epoch": 0.20042683492623178,
"grad_norm": 0.48815852403640747,
"learning_rate": 3.963954566985678e-05,
"loss": 1.0094,
"step": 270
},
{
"epoch": 0.20116915653706968,
"grad_norm": 0.43544676899909973,
"learning_rate": 3.963495333548024e-05,
"loss": 0.7804,
"step": 271
},
{
"epoch": 0.20191147814790758,
"grad_norm": 0.4542047679424286,
"learning_rate": 3.963033220129372e-05,
"loss": 0.7921,
"step": 272
},
{
"epoch": 0.20265379975874548,
"grad_norm": 0.39049360156059265,
"learning_rate": 3.962568227407533e-05,
"loss": 0.9492,
"step": 273
},
{
"epoch": 0.20339612136958338,
"grad_norm": 0.4454337954521179,
"learning_rate": 3.962100356064541e-05,
"loss": 0.9064,
"step": 274
},
{
"epoch": 0.20413844298042128,
"grad_norm": 0.41931623220443726,
"learning_rate": 3.961629606786652e-05,
"loss": 0.7879,
"step": 275
},
{
"epoch": 0.20488076459125917,
"grad_norm": 0.4287397265434265,
"learning_rate": 3.9611559802643427e-05,
"loss": 0.846,
"step": 276
},
{
"epoch": 0.20562308620209707,
"grad_norm": 0.5089631676673889,
"learning_rate": 3.9606794771923106e-05,
"loss": 0.864,
"step": 277
},
{
"epoch": 0.20636540781293494,
"grad_norm": 0.4220036268234253,
"learning_rate": 3.960200098269473e-05,
"loss": 0.8548,
"step": 278
},
{
"epoch": 0.20710772942377284,
"grad_norm": 0.4368970990180969,
"learning_rate": 3.9597178441989646e-05,
"loss": 0.8912,
"step": 279
},
{
"epoch": 0.20785005103461074,
"grad_norm": 0.4061053395271301,
"learning_rate": 3.9592327156881375e-05,
"loss": 0.8345,
"step": 280
},
{
"epoch": 0.20859237264544864,
"grad_norm": 0.40495994687080383,
"learning_rate": 3.9587447134485605e-05,
"loss": 0.9221,
"step": 281
},
{
"epoch": 0.20933469425628654,
"grad_norm": 0.4269440472126007,
"learning_rate": 3.958253838196016e-05,
"loss": 0.8588,
"step": 282
},
{
"epoch": 0.21007701586712443,
"grad_norm": 0.5028578042984009,
"learning_rate": 3.957760090650501e-05,
"loss": 0.9026,
"step": 283
},
{
"epoch": 0.21081933747796233,
"grad_norm": 0.40312886238098145,
"learning_rate": 3.957263471536227e-05,
"loss": 0.9118,
"step": 284
},
{
"epoch": 0.21156165908880023,
"grad_norm": 0.4545218050479889,
"learning_rate": 3.956763981581617e-05,
"loss": 0.8419,
"step": 285
},
{
"epoch": 0.21230398069963813,
"grad_norm": 0.37066158652305603,
"learning_rate": 3.956261621519302e-05,
"loss": 0.8473,
"step": 286
},
{
"epoch": 0.21304630231047603,
"grad_norm": 0.45726296305656433,
"learning_rate": 3.955756392086125e-05,
"loss": 0.8868,
"step": 287
},
{
"epoch": 0.2137886239213139,
"grad_norm": 0.38632458448410034,
"learning_rate": 3.955248294023139e-05,
"loss": 0.7743,
"step": 288
},
{
"epoch": 0.2145309455321518,
"grad_norm": 0.399452269077301,
"learning_rate": 3.9547373280756016e-05,
"loss": 0.8737,
"step": 289
},
{
"epoch": 0.2152732671429897,
"grad_norm": 0.42522764205932617,
"learning_rate": 3.9542234949929793e-05,
"loss": 0.9165,
"step": 290
},
{
"epoch": 0.2160155887538276,
"grad_norm": 0.4133375883102417,
"learning_rate": 3.9537067955289424e-05,
"loss": 0.8659,
"step": 291
},
{
"epoch": 0.2167579103646655,
"grad_norm": 0.3964545428752899,
"learning_rate": 3.953187230441367e-05,
"loss": 0.8724,
"step": 292
},
{
"epoch": 0.2175002319755034,
"grad_norm": 0.41253530979156494,
"learning_rate": 3.952664800492331e-05,
"loss": 0.8626,
"step": 293
},
{
"epoch": 0.2182425535863413,
"grad_norm": 0.42898398637771606,
"learning_rate": 3.952139506448116e-05,
"loss": 0.8405,
"step": 294
},
{
"epoch": 0.21898487519717919,
"grad_norm": 0.4509212374687195,
"learning_rate": 3.951611349079202e-05,
"loss": 0.9459,
"step": 295
},
{
"epoch": 0.21972719680801708,
"grad_norm": 0.42532891035079956,
"learning_rate": 3.951080329160271e-05,
"loss": 0.8979,
"step": 296
},
{
"epoch": 0.22046951841885498,
"grad_norm": 0.40851664543151855,
"learning_rate": 3.950546447470203e-05,
"loss": 0.7875,
"step": 297
},
{
"epoch": 0.22121184002969285,
"grad_norm": 0.38995298743247986,
"learning_rate": 3.9500097047920764e-05,
"loss": 0.7943,
"step": 298
},
{
"epoch": 0.22195416164053075,
"grad_norm": 0.4160335063934326,
"learning_rate": 3.949470101913163e-05,
"loss": 0.8773,
"step": 299
},
{
"epoch": 0.22269648325136865,
"grad_norm": 0.4847520887851715,
"learning_rate": 3.948927639624935e-05,
"loss": 0.9078,
"step": 300
},
{
"epoch": 0.22343880486220655,
"grad_norm": 0.4614812135696411,
"learning_rate": 3.9483823187230534e-05,
"loss": 0.9538,
"step": 301
},
{
"epoch": 0.22418112647304445,
"grad_norm": 0.478161096572876,
"learning_rate": 3.947834140007375e-05,
"loss": 0.9021,
"step": 302
},
{
"epoch": 0.22492344808388234,
"grad_norm": 0.4866395890712738,
"learning_rate": 3.947283104281947e-05,
"loss": 0.9058,
"step": 303
},
{
"epoch": 0.22566576969472024,
"grad_norm": 0.47090503573417664,
"learning_rate": 3.946729212355009e-05,
"loss": 0.805,
"step": 304
},
{
"epoch": 0.22640809130555814,
"grad_norm": 0.44924086332321167,
"learning_rate": 3.9461724650389886e-05,
"loss": 0.9648,
"step": 305
},
{
"epoch": 0.22715041291639604,
"grad_norm": 0.43780702352523804,
"learning_rate": 3.9456128631505014e-05,
"loss": 0.9912,
"step": 306
},
{
"epoch": 0.22789273452723394,
"grad_norm": 0.4452776312828064,
"learning_rate": 3.9450504075103507e-05,
"loss": 0.9099,
"step": 307
},
{
"epoch": 0.2286350561380718,
"grad_norm": 0.41327065229415894,
"learning_rate": 3.944485098943524e-05,
"loss": 0.8384,
"step": 308
},
{
"epoch": 0.2293773777489097,
"grad_norm": 0.43018415570259094,
"learning_rate": 3.9439169382791965e-05,
"loss": 0.9611,
"step": 309
},
{
"epoch": 0.2301196993597476,
"grad_norm": 0.414542555809021,
"learning_rate": 3.9433459263507236e-05,
"loss": 0.8249,
"step": 310
},
{
"epoch": 0.2308620209705855,
"grad_norm": 0.4167710542678833,
"learning_rate": 3.942772063995645e-05,
"loss": 0.8588,
"step": 311
},
{
"epoch": 0.2316043425814234,
"grad_norm": 0.5119878649711609,
"learning_rate": 3.94219535205568e-05,
"loss": 0.9008,
"step": 312
},
{
"epoch": 0.2323466641922613,
"grad_norm": 0.4627315402030945,
"learning_rate": 3.941615791376727e-05,
"loss": 0.7732,
"step": 313
},
{
"epoch": 0.2330889858030992,
"grad_norm": 0.3760371208190918,
"learning_rate": 3.941033382808865e-05,
"loss": 0.9079,
"step": 314
},
{
"epoch": 0.2338313074139371,
"grad_norm": 0.38762086629867554,
"learning_rate": 3.9404481272063486e-05,
"loss": 0.8229,
"step": 315
},
{
"epoch": 0.234573629024775,
"grad_norm": 0.38175562024116516,
"learning_rate": 3.9398600254276085e-05,
"loss": 0.8135,
"step": 316
},
{
"epoch": 0.2353159506356129,
"grad_norm": 0.4348665177822113,
"learning_rate": 3.939269078335251e-05,
"loss": 0.8682,
"step": 317
},
{
"epoch": 0.23605827224645076,
"grad_norm": 0.3956005871295929,
"learning_rate": 3.938675286796054e-05,
"loss": 0.7935,
"step": 318
},
{
"epoch": 0.23680059385728866,
"grad_norm": 0.402208149433136,
"learning_rate": 3.938078651680969e-05,
"loss": 0.9668,
"step": 319
},
{
"epoch": 0.23754291546812656,
"grad_norm": 0.44131597876548767,
"learning_rate": 3.9374791738651175e-05,
"loss": 0.8008,
"step": 320
},
{
"epoch": 0.23828523707896446,
"grad_norm": 0.43316978216171265,
"learning_rate": 3.936876854227792e-05,
"loss": 0.8541,
"step": 321
},
{
"epoch": 0.23902755868980236,
"grad_norm": 0.4488168954849243,
"learning_rate": 3.936271693652451e-05,
"loss": 0.8773,
"step": 322
},
{
"epoch": 0.23976988030064025,
"grad_norm": 0.4164707362651825,
"learning_rate": 3.935663693026722e-05,
"loss": 0.9193,
"step": 323
},
{
"epoch": 0.24051220191147815,
"grad_norm": 0.4314461648464203,
"learning_rate": 3.935052853242398e-05,
"loss": 0.8206,
"step": 324
},
{
"epoch": 0.24125452352231605,
"grad_norm": 0.4680315852165222,
"learning_rate": 3.934439175195434e-05,
"loss": 0.9033,
"step": 325
},
{
"epoch": 0.24199684513315395,
"grad_norm": 0.4575926959514618,
"learning_rate": 3.933822659785951e-05,
"loss": 0.8849,
"step": 326
},
{
"epoch": 0.24273916674399185,
"grad_norm": 0.4294450879096985,
"learning_rate": 3.933203307918231e-05,
"loss": 0.7656,
"step": 327
},
{
"epoch": 0.24348148835482972,
"grad_norm": 0.4561205506324768,
"learning_rate": 3.932581120500715e-05,
"loss": 0.8032,
"step": 328
},
{
"epoch": 0.24422380996566762,
"grad_norm": 0.39646250009536743,
"learning_rate": 3.931956098446006e-05,
"loss": 0.7969,
"step": 329
},
{
"epoch": 0.2449661315765055,
"grad_norm": 0.41189101338386536,
"learning_rate": 3.9313282426708594e-05,
"loss": 0.8546,
"step": 330
},
{
"epoch": 0.2457084531873434,
"grad_norm": 0.4370476305484772,
"learning_rate": 3.9306975540961935e-05,
"loss": 0.8269,
"step": 331
},
{
"epoch": 0.2464507747981813,
"grad_norm": 0.44512081146240234,
"learning_rate": 3.930064033647077e-05,
"loss": 0.884,
"step": 332
},
{
"epoch": 0.2471930964090192,
"grad_norm": 0.3812503516674042,
"learning_rate": 3.9294276822527344e-05,
"loss": 0.8196,
"step": 333
},
{
"epoch": 0.2479354180198571,
"grad_norm": 0.39937490224838257,
"learning_rate": 3.9287885008465416e-05,
"loss": 0.8481,
"step": 334
},
{
"epoch": 0.248677739630695,
"grad_norm": 0.5105488300323486,
"learning_rate": 3.9281464903660266e-05,
"loss": 0.8751,
"step": 335
},
{
"epoch": 0.2494200612415329,
"grad_norm": 0.3878211975097656,
"learning_rate": 3.927501651752865e-05,
"loss": 0.8023,
"step": 336
},
{
"epoch": 0.2501623828523708,
"grad_norm": 0.413700133562088,
"learning_rate": 3.926853985952883e-05,
"loss": 0.8475,
"step": 337
},
{
"epoch": 0.25090470446320867,
"grad_norm": 0.3983979821205139,
"learning_rate": 3.926203493916051e-05,
"loss": 0.7672,
"step": 338
},
{
"epoch": 0.2516470260740466,
"grad_norm": 0.41137751936912537,
"learning_rate": 3.9255501765964874e-05,
"loss": 0.8201,
"step": 339
},
{
"epoch": 0.25238934768488447,
"grad_norm": 0.46660321950912476,
"learning_rate": 3.9248940349524526e-05,
"loss": 0.8046,
"step": 340
},
{
"epoch": 0.2531316692957224,
"grad_norm": 0.42749106884002686,
"learning_rate": 3.9242350699463516e-05,
"loss": 0.8447,
"step": 341
},
{
"epoch": 0.25387399090656027,
"grad_norm": 0.3725341856479645,
"learning_rate": 3.9235732825447284e-05,
"loss": 0.8874,
"step": 342
},
{
"epoch": 0.25461631251739814,
"grad_norm": 0.36658021807670593,
"learning_rate": 3.9229086737182676e-05,
"loss": 0.7429,
"step": 343
},
{
"epoch": 0.25535863412823606,
"grad_norm": 0.4336754381656647,
"learning_rate": 3.922241244441794e-05,
"loss": 0.8506,
"step": 344
},
{
"epoch": 0.25610095573907393,
"grad_norm": 0.4874718487262726,
"learning_rate": 3.921570995694266e-05,
"loss": 0.9424,
"step": 345
},
{
"epoch": 0.25684327734991186,
"grad_norm": 0.4024730920791626,
"learning_rate": 3.92089792845878e-05,
"loss": 0.8936,
"step": 346
},
{
"epoch": 0.25758559896074973,
"grad_norm": 0.4135879874229431,
"learning_rate": 3.9202220437225665e-05,
"loss": 0.8154,
"step": 347
},
{
"epoch": 0.25832792057158765,
"grad_norm": 0.3991898000240326,
"learning_rate": 3.9195433424769857e-05,
"loss": 0.8992,
"step": 348
},
{
"epoch": 0.2590702421824255,
"grad_norm": 0.4020022749900818,
"learning_rate": 3.9188618257175326e-05,
"loss": 0.9374,
"step": 349
},
{
"epoch": 0.25981256379326345,
"grad_norm": 0.4002429246902466,
"learning_rate": 3.9181774944438294e-05,
"loss": 0.8036,
"step": 350
},
{
"epoch": 0.2605548854041013,
"grad_norm": 0.4645288288593292,
"learning_rate": 3.9174903496596286e-05,
"loss": 0.9058,
"step": 351
},
{
"epoch": 0.26129720701493925,
"grad_norm": 0.5104494690895081,
"learning_rate": 3.916800392372807e-05,
"loss": 0.858,
"step": 352
},
{
"epoch": 0.2620395286257771,
"grad_norm": 0.42878013849258423,
"learning_rate": 3.9161076235953693e-05,
"loss": 0.8975,
"step": 353
},
{
"epoch": 0.262781850236615,
"grad_norm": 0.47523993253707886,
"learning_rate": 3.915412044343441e-05,
"loss": 0.8762,
"step": 354
},
{
"epoch": 0.2635241718474529,
"grad_norm": 0.4574279189109802,
"learning_rate": 3.914713655637273e-05,
"loss": 0.9237,
"step": 355
},
{
"epoch": 0.2642664934582908,
"grad_norm": 0.4427787959575653,
"learning_rate": 3.914012458501235e-05,
"loss": 0.9013,
"step": 356
},
{
"epoch": 0.2650088150691287,
"grad_norm": 0.4818330407142639,
"learning_rate": 3.913308453963817e-05,
"loss": 0.8158,
"step": 357
},
{
"epoch": 0.2657511366799666,
"grad_norm": 0.3826821744441986,
"learning_rate": 3.912601643057625e-05,
"loss": 0.8757,
"step": 358
},
{
"epoch": 0.2664934582908045,
"grad_norm": 0.447723388671875,
"learning_rate": 3.9118920268193844e-05,
"loss": 0.8342,
"step": 359
},
{
"epoch": 0.2672357799016424,
"grad_norm": 0.42185550928115845,
"learning_rate": 3.911179606289932e-05,
"loss": 0.9412,
"step": 360
},
{
"epoch": 0.2679781015124803,
"grad_norm": 0.471258282661438,
"learning_rate": 3.91046438251422e-05,
"loss": 0.85,
"step": 361
},
{
"epoch": 0.2687204231233182,
"grad_norm": 0.42661038041114807,
"learning_rate": 3.909746356541312e-05,
"loss": 0.7643,
"step": 362
},
{
"epoch": 0.26946274473415605,
"grad_norm": 0.4222320020198822,
"learning_rate": 3.909025529424382e-05,
"loss": 0.83,
"step": 363
},
{
"epoch": 0.27020506634499397,
"grad_norm": 0.388431578874588,
"learning_rate": 3.90830190222071e-05,
"loss": 0.7993,
"step": 364
},
{
"epoch": 0.27094738795583184,
"grad_norm": 0.41494980454444885,
"learning_rate": 3.907575475991687e-05,
"loss": 0.9685,
"step": 365
},
{
"epoch": 0.27168970956666977,
"grad_norm": 0.4347587823867798,
"learning_rate": 3.9068462518028074e-05,
"loss": 0.8882,
"step": 366
},
{
"epoch": 0.27243203117750764,
"grad_norm": 0.38997599482536316,
"learning_rate": 3.906114230723669e-05,
"loss": 0.9071,
"step": 367
},
{
"epoch": 0.27317435278834556,
"grad_norm": 0.413625031709671,
"learning_rate": 3.9053794138279734e-05,
"loss": 0.8533,
"step": 368
},
{
"epoch": 0.27391667439918344,
"grad_norm": 0.397564560174942,
"learning_rate": 3.9046418021935214e-05,
"loss": 0.8478,
"step": 369
},
{
"epoch": 0.27465899601002136,
"grad_norm": 0.38332417607307434,
"learning_rate": 3.903901396902216e-05,
"loss": 0.884,
"step": 370
},
{
"epoch": 0.27540131762085923,
"grad_norm": 0.4055768847465515,
"learning_rate": 3.903158199040053e-05,
"loss": 0.81,
"step": 371
},
{
"epoch": 0.27614363923169716,
"grad_norm": 0.41119036078453064,
"learning_rate": 3.902412209697129e-05,
"loss": 0.8914,
"step": 372
},
{
"epoch": 0.27688596084253503,
"grad_norm": 0.4107328951358795,
"learning_rate": 3.901663429967633e-05,
"loss": 0.8504,
"step": 373
},
{
"epoch": 0.2776282824533729,
"grad_norm": 0.46609365940093994,
"learning_rate": 3.900911860949847e-05,
"loss": 0.842,
"step": 374
},
{
"epoch": 0.2783706040642108,
"grad_norm": 0.4163331687450409,
"learning_rate": 3.900157503746142e-05,
"loss": 0.7783,
"step": 375
},
{
"epoch": 0.2791129256750487,
"grad_norm": 0.4238855838775635,
"learning_rate": 3.899400359462983e-05,
"loss": 0.9216,
"step": 376
},
{
"epoch": 0.2798552472858866,
"grad_norm": 0.4086857736110687,
"learning_rate": 3.8986404292109206e-05,
"loss": 0.836,
"step": 377
},
{
"epoch": 0.2805975688967245,
"grad_norm": 0.47745761275291443,
"learning_rate": 3.8978777141045904e-05,
"loss": 0.7913,
"step": 378
},
{
"epoch": 0.2813398905075624,
"grad_norm": 0.39942634105682373,
"learning_rate": 3.897112215262716e-05,
"loss": 0.9627,
"step": 379
},
{
"epoch": 0.2820822121184003,
"grad_norm": 0.38598352670669556,
"learning_rate": 3.896343933808101e-05,
"loss": 0.9239,
"step": 380
},
{
"epoch": 0.2828245337292382,
"grad_norm": 0.49593856930732727,
"learning_rate": 3.895572870867632e-05,
"loss": 0.898,
"step": 381
},
{
"epoch": 0.2835668553400761,
"grad_norm": 0.40926507115364075,
"learning_rate": 3.8947990275722756e-05,
"loss": 0.8174,
"step": 382
},
{
"epoch": 0.284309176950914,
"grad_norm": 0.37032049894332886,
"learning_rate": 3.894022405057075e-05,
"loss": 0.8367,
"step": 383
},
{
"epoch": 0.2850514985617519,
"grad_norm": 0.3458698093891144,
"learning_rate": 3.893243004461151e-05,
"loss": 0.7966,
"step": 384
},
{
"epoch": 0.28579382017258975,
"grad_norm": 0.47391876578330994,
"learning_rate": 3.8924608269277004e-05,
"loss": 0.943,
"step": 385
},
{
"epoch": 0.2865361417834277,
"grad_norm": 0.4300226867198944,
"learning_rate": 3.89167587360399e-05,
"loss": 0.8706,
"step": 386
},
{
"epoch": 0.28727846339426555,
"grad_norm": 0.37660089135169983,
"learning_rate": 3.890888145641361e-05,
"loss": 0.8721,
"step": 387
},
{
"epoch": 0.2880207850051035,
"grad_norm": 0.35884180665016174,
"learning_rate": 3.890097644195223e-05,
"loss": 0.8307,
"step": 388
},
{
"epoch": 0.28876310661594135,
"grad_norm": 0.42722952365875244,
"learning_rate": 3.889304370425053e-05,
"loss": 0.9278,
"step": 389
},
{
"epoch": 0.28950542822677927,
"grad_norm": 0.4369671046733856,
"learning_rate": 3.888508325494395e-05,
"loss": 0.7752,
"step": 390
},
{
"epoch": 0.29024774983761714,
"grad_norm": 0.40296581387519836,
"learning_rate": 3.887709510570859e-05,
"loss": 0.8253,
"step": 391
},
{
"epoch": 0.29099007144845507,
"grad_norm": 0.37896981835365295,
"learning_rate": 3.8869079268261155e-05,
"loss": 0.758,
"step": 392
},
{
"epoch": 0.29173239305929294,
"grad_norm": 0.4341178834438324,
"learning_rate": 3.886103575435897e-05,
"loss": 0.8982,
"step": 393
},
{
"epoch": 0.2924747146701308,
"grad_norm": 0.36294007301330566,
"learning_rate": 3.885296457579998e-05,
"loss": 0.7998,
"step": 394
},
{
"epoch": 0.29321703628096873,
"grad_norm": 0.4262610971927643,
"learning_rate": 3.884486574442265e-05,
"loss": 0.8212,
"step": 395
},
{
"epoch": 0.2939593578918066,
"grad_norm": 0.41790515184402466,
"learning_rate": 3.883673927210608e-05,
"loss": 0.8739,
"step": 396
},
{
"epoch": 0.29470167950264453,
"grad_norm": 0.4500531554222107,
"learning_rate": 3.8828585170769854e-05,
"loss": 0.8168,
"step": 397
},
{
"epoch": 0.2954440011134824,
"grad_norm": 0.424883633852005,
"learning_rate": 3.8820403452374093e-05,
"loss": 0.8356,
"step": 398
},
{
"epoch": 0.29618632272432033,
"grad_norm": 0.4678550958633423,
"learning_rate": 3.881219412891945e-05,
"loss": 0.9249,
"step": 399
},
{
"epoch": 0.2969286443351582,
"grad_norm": 0.4226338565349579,
"learning_rate": 3.880395721244704e-05,
"loss": 0.8317,
"step": 400
},
{
"epoch": 0.2976709659459961,
"grad_norm": 0.44711941480636597,
"learning_rate": 3.8795692715038465e-05,
"loss": 0.8585,
"step": 401
},
{
"epoch": 0.298413287556834,
"grad_norm": 0.3793635666370392,
"learning_rate": 3.878740064881578e-05,
"loss": 0.8116,
"step": 402
},
{
"epoch": 0.2991556091676719,
"grad_norm": 0.41916918754577637,
"learning_rate": 3.877908102594147e-05,
"loss": 0.838,
"step": 403
},
{
"epoch": 0.2998979307785098,
"grad_norm": 0.44144803285598755,
"learning_rate": 3.877073385861846e-05,
"loss": 0.8794,
"step": 404
},
{
"epoch": 0.30064025238934766,
"grad_norm": 0.41615739464759827,
"learning_rate": 3.876235915909004e-05,
"loss": 0.8119,
"step": 405
},
{
"epoch": 0.3013825740001856,
"grad_norm": 0.38933223485946655,
"learning_rate": 3.8753956939639915e-05,
"loss": 0.9161,
"step": 406
},
{
"epoch": 0.30212489561102346,
"grad_norm": 0.38489091396331787,
"learning_rate": 3.874552721259215e-05,
"loss": 0.7344,
"step": 407
},
{
"epoch": 0.3028672172218614,
"grad_norm": 0.405105859041214,
"learning_rate": 3.873706999031113e-05,
"loss": 0.9039,
"step": 408
},
{
"epoch": 0.30360953883269926,
"grad_norm": 0.4045158624649048,
"learning_rate": 3.872858528520161e-05,
"loss": 0.883,
"step": 409
},
{
"epoch": 0.3043518604435372,
"grad_norm": 0.4408639967441559,
"learning_rate": 3.872007310970864e-05,
"loss": 0.8605,
"step": 410
},
{
"epoch": 0.30509418205437505,
"grad_norm": 0.39430099725723267,
"learning_rate": 3.871153347631753e-05,
"loss": 0.7924,
"step": 411
},
{
"epoch": 0.305836503665213,
"grad_norm": 0.4196453392505646,
"learning_rate": 3.8702966397553917e-05,
"loss": 0.8715,
"step": 412
},
{
"epoch": 0.30657882527605085,
"grad_norm": 0.3732720613479614,
"learning_rate": 3.869437188598366e-05,
"loss": 0.8603,
"step": 413
},
{
"epoch": 0.3073211468868887,
"grad_norm": 0.4107831120491028,
"learning_rate": 3.868574995421288e-05,
"loss": 0.7688,
"step": 414
},
{
"epoch": 0.30806346849772664,
"grad_norm": 0.43467453122138977,
"learning_rate": 3.867710061488788e-05,
"loss": 0.8764,
"step": 415
},
{
"epoch": 0.3088057901085645,
"grad_norm": 0.3707616329193115,
"learning_rate": 3.866842388069519e-05,
"loss": 0.8257,
"step": 416
},
{
"epoch": 0.30954811171940244,
"grad_norm": 0.4145166575908661,
"learning_rate": 3.8659719764361526e-05,
"loss": 0.8701,
"step": 417
},
{
"epoch": 0.3102904333302403,
"grad_norm": 0.33687037229537964,
"learning_rate": 3.8650988278653755e-05,
"loss": 0.8158,
"step": 418
},
{
"epoch": 0.31103275494107824,
"grad_norm": 0.40490809082984924,
"learning_rate": 3.864222943637889e-05,
"loss": 0.7536,
"step": 419
},
{
"epoch": 0.3117750765519161,
"grad_norm": 0.4449961483478546,
"learning_rate": 3.863344325038407e-05,
"loss": 0.8911,
"step": 420
},
{
"epoch": 0.31251739816275403,
"grad_norm": 0.47596439719200134,
"learning_rate": 3.862462973355654e-05,
"loss": 0.869,
"step": 421
},
{
"epoch": 0.3132597197735919,
"grad_norm": 0.3893236517906189,
"learning_rate": 3.861578889882364e-05,
"loss": 0.8048,
"step": 422
},
{
"epoch": 0.31400204138442983,
"grad_norm": 0.4097529351711273,
"learning_rate": 3.860692075915277e-05,
"loss": 0.8832,
"step": 423
},
{
"epoch": 0.3147443629952677,
"grad_norm": 0.4777495563030243,
"learning_rate": 3.859802532755139e-05,
"loss": 0.8201,
"step": 424
},
{
"epoch": 0.31548668460610557,
"grad_norm": 0.40158170461654663,
"learning_rate": 3.858910261706696e-05,
"loss": 0.9073,
"step": 425
},
{
"epoch": 0.3162290062169435,
"grad_norm": 0.4094659090042114,
"learning_rate": 3.8580152640787014e-05,
"loss": 0.8032,
"step": 426
},
{
"epoch": 0.31697132782778137,
"grad_norm": 0.4358592629432678,
"learning_rate": 3.8571175411839006e-05,
"loss": 0.8377,
"step": 427
},
{
"epoch": 0.3177136494386193,
"grad_norm": 0.45116010308265686,
"learning_rate": 3.856217094339041e-05,
"loss": 0.7675,
"step": 428
},
{
"epoch": 0.31845597104945716,
"grad_norm": 0.3857732117176056,
"learning_rate": 3.8553139248648645e-05,
"loss": 0.8392,
"step": 429
},
{
"epoch": 0.3191982926602951,
"grad_norm": 0.39530789852142334,
"learning_rate": 3.854408034086106e-05,
"loss": 0.8753,
"step": 430
},
{
"epoch": 0.31994061427113296,
"grad_norm": 0.3955869972705841,
"learning_rate": 3.853499423331492e-05,
"loss": 0.8537,
"step": 431
},
{
"epoch": 0.3206829358819709,
"grad_norm": 0.40758398175239563,
"learning_rate": 3.8525880939337375e-05,
"loss": 0.9214,
"step": 432
},
{
"epoch": 0.32142525749280876,
"grad_norm": 0.4302966892719269,
"learning_rate": 3.8516740472295474e-05,
"loss": 0.9284,
"step": 433
},
{
"epoch": 0.32216757910364663,
"grad_norm": 0.3887743651866913,
"learning_rate": 3.8507572845596095e-05,
"loss": 0.8329,
"step": 434
},
{
"epoch": 0.32290990071448455,
"grad_norm": 0.38320812582969666,
"learning_rate": 3.849837807268599e-05,
"loss": 0.8195,
"step": 435
},
{
"epoch": 0.3236522223253224,
"grad_norm": 0.4431188404560089,
"learning_rate": 3.8489156167051686e-05,
"loss": 0.9228,
"step": 436
},
{
"epoch": 0.32439454393616035,
"grad_norm": 0.4172087609767914,
"learning_rate": 3.847990714221953e-05,
"loss": 0.9269,
"step": 437
},
{
"epoch": 0.3251368655469982,
"grad_norm": 0.41571882367134094,
"learning_rate": 3.8470631011755654e-05,
"loss": 0.8732,
"step": 438
},
{
"epoch": 0.32587918715783615,
"grad_norm": 0.4024905860424042,
"learning_rate": 3.846132778926593e-05,
"loss": 0.7738,
"step": 439
},
{
"epoch": 0.326621508768674,
"grad_norm": 0.4435541331768036,
"learning_rate": 3.845199748839597e-05,
"loss": 0.9251,
"step": 440
},
{
"epoch": 0.32736383037951194,
"grad_norm": 0.38982489705085754,
"learning_rate": 3.844264012283111e-05,
"loss": 0.7196,
"step": 441
},
{
"epoch": 0.3281061519903498,
"grad_norm": 0.37881848216056824,
"learning_rate": 3.8433255706296384e-05,
"loss": 0.8207,
"step": 442
},
{
"epoch": 0.32884847360118774,
"grad_norm": 0.35831478238105774,
"learning_rate": 3.8423844252556504e-05,
"loss": 0.8365,
"step": 443
},
{
"epoch": 0.3295907952120256,
"grad_norm": 0.4337356984615326,
"learning_rate": 3.841440577541583e-05,
"loss": 0.8655,
"step": 444
},
{
"epoch": 0.3303331168228635,
"grad_norm": 0.41039177775382996,
"learning_rate": 3.840494028871836e-05,
"loss": 0.818,
"step": 445
},
{
"epoch": 0.3310754384337014,
"grad_norm": 0.3991888165473938,
"learning_rate": 3.839544780634772e-05,
"loss": 0.7963,
"step": 446
},
{
"epoch": 0.3318177600445393,
"grad_norm": 0.34694334864616394,
"learning_rate": 3.838592834222714e-05,
"loss": 0.8754,
"step": 447
},
{
"epoch": 0.3325600816553772,
"grad_norm": 0.3922919034957886,
"learning_rate": 3.837638191031938e-05,
"loss": 0.9201,
"step": 448
},
{
"epoch": 0.3333024032662151,
"grad_norm": 0.3862178921699524,
"learning_rate": 3.836680852462681e-05,
"loss": 0.7506,
"step": 449
},
{
"epoch": 0.334044724877053,
"grad_norm": 0.4066505432128906,
"learning_rate": 3.835720819919131e-05,
"loss": 0.8283,
"step": 450
},
{
"epoch": 0.33478704648789087,
"grad_norm": 0.4454757273197174,
"learning_rate": 3.834758094809426e-05,
"loss": 0.8575,
"step": 451
},
{
"epoch": 0.3355293680987288,
"grad_norm": 0.49195176362991333,
"learning_rate": 3.8337926785456575e-05,
"loss": 0.8578,
"step": 452
},
{
"epoch": 0.33627168970956667,
"grad_norm": 0.44131457805633545,
"learning_rate": 3.832824572543859e-05,
"loss": 0.7825,
"step": 453
},
{
"epoch": 0.33701401132040454,
"grad_norm": 0.45455294847488403,
"learning_rate": 3.8318537782240135e-05,
"loss": 0.8687,
"step": 454
},
{
"epoch": 0.33775633293124246,
"grad_norm": 0.40335503220558167,
"learning_rate": 3.830880297010045e-05,
"loss": 0.7981,
"step": 455
},
{
"epoch": 0.33849865454208033,
"grad_norm": 0.3739373981952667,
"learning_rate": 3.829904130329819e-05,
"loss": 0.8766,
"step": 456
},
{
"epoch": 0.33924097615291826,
"grad_norm": 0.4278407692909241,
"learning_rate": 3.828925279615141e-05,
"loss": 0.8353,
"step": 457
},
{
"epoch": 0.33998329776375613,
"grad_norm": 0.4283519387245178,
"learning_rate": 3.8279437463017516e-05,
"loss": 0.9147,
"step": 458
},
{
"epoch": 0.34072561937459406,
"grad_norm": 0.3937970995903015,
"learning_rate": 3.826959531829326e-05,
"loss": 0.885,
"step": 459
},
{
"epoch": 0.34146794098543193,
"grad_norm": 0.40536752343177795,
"learning_rate": 3.8259726376414754e-05,
"loss": 0.9063,
"step": 460
},
{
"epoch": 0.34221026259626985,
"grad_norm": 0.3611074388027191,
"learning_rate": 3.8249830651857355e-05,
"loss": 0.8843,
"step": 461
},
{
"epoch": 0.3429525842071077,
"grad_norm": 0.3790298402309418,
"learning_rate": 3.8239908159135774e-05,
"loss": 0.912,
"step": 462
},
{
"epoch": 0.34369490581794565,
"grad_norm": 0.4086690843105316,
"learning_rate": 3.822995891280393e-05,
"loss": 0.8264,
"step": 463
},
{
"epoch": 0.3444372274287835,
"grad_norm": 0.3959694802761078,
"learning_rate": 3.8219982927455006e-05,
"loss": 0.8477,
"step": 464
},
{
"epoch": 0.3451795490396214,
"grad_norm": 0.35375314950942993,
"learning_rate": 3.82099802177214e-05,
"loss": 0.9514,
"step": 465
},
{
"epoch": 0.3459218706504593,
"grad_norm": 0.38223370909690857,
"learning_rate": 3.819995079827472e-05,
"loss": 0.7945,
"step": 466
},
{
"epoch": 0.3466641922612972,
"grad_norm": 0.46122410893440247,
"learning_rate": 3.8189894683825726e-05,
"loss": 0.9686,
"step": 467
},
{
"epoch": 0.3474065138721351,
"grad_norm": 0.4506250023841858,
"learning_rate": 3.817981188912436e-05,
"loss": 0.8485,
"step": 468
},
{
"epoch": 0.348148835482973,
"grad_norm": 0.4149094223976135,
"learning_rate": 3.816970242895968e-05,
"loss": 0.8154,
"step": 469
},
{
"epoch": 0.3488911570938109,
"grad_norm": 0.40498077869415283,
"learning_rate": 3.815956631815985e-05,
"loss": 0.8611,
"step": 470
},
{
"epoch": 0.3496334787046488,
"grad_norm": 0.3880930542945862,
"learning_rate": 3.8149403571592164e-05,
"loss": 0.8212,
"step": 471
},
{
"epoch": 0.3503758003154867,
"grad_norm": 0.43260103464126587,
"learning_rate": 3.8139214204162934e-05,
"loss": 0.8884,
"step": 472
},
{
"epoch": 0.3511181219263246,
"grad_norm": 0.4028635621070862,
"learning_rate": 3.8128998230817544e-05,
"loss": 0.8843,
"step": 473
},
{
"epoch": 0.35186044353716245,
"grad_norm": 0.3990902304649353,
"learning_rate": 3.81187556665404e-05,
"loss": 0.8498,
"step": 474
},
{
"epoch": 0.3526027651480004,
"grad_norm": 0.3879133462905884,
"learning_rate": 3.810848652635491e-05,
"loss": 0.7933,
"step": 475
},
{
"epoch": 0.35334508675883824,
"grad_norm": 0.3822309076786041,
"learning_rate": 3.8098190825323466e-05,
"loss": 0.8271,
"step": 476
},
{
"epoch": 0.35408740836967617,
"grad_norm": 0.3867788314819336,
"learning_rate": 3.808786857854741e-05,
"loss": 0.761,
"step": 477
},
{
"epoch": 0.35482972998051404,
"grad_norm": 0.4349990487098694,
"learning_rate": 3.807751980116702e-05,
"loss": 0.9026,
"step": 478
},
{
"epoch": 0.35557205159135197,
"grad_norm": 0.4421882927417755,
"learning_rate": 3.80671445083615e-05,
"loss": 0.8214,
"step": 479
},
{
"epoch": 0.35631437320218984,
"grad_norm": 0.4466465711593628,
"learning_rate": 3.805674271534894e-05,
"loss": 0.9893,
"step": 480
},
{
"epoch": 0.35705669481302776,
"grad_norm": 0.4131970703601837,
"learning_rate": 3.8046314437386286e-05,
"loss": 0.8921,
"step": 481
},
{
"epoch": 0.35779901642386563,
"grad_norm": 0.36232057213783264,
"learning_rate": 3.803585968976936e-05,
"loss": 0.816,
"step": 482
},
{
"epoch": 0.35854133803470356,
"grad_norm": 0.36747488379478455,
"learning_rate": 3.8025378487832786e-05,
"loss": 0.863,
"step": 483
},
{
"epoch": 0.35928365964554143,
"grad_norm": 0.4169939160346985,
"learning_rate": 3.801487084695e-05,
"loss": 0.8733,
"step": 484
},
{
"epoch": 0.3600259812563793,
"grad_norm": 0.40697571635246277,
"learning_rate": 3.8004336782533194e-05,
"loss": 0.8453,
"step": 485
},
{
"epoch": 0.3607683028672172,
"grad_norm": 0.4079035818576813,
"learning_rate": 3.799377631003336e-05,
"loss": 0.7795,
"step": 486
},
{
"epoch": 0.3615106244780551,
"grad_norm": 0.35980504751205444,
"learning_rate": 3.7983189444940196e-05,
"loss": 0.8452,
"step": 487
},
{
"epoch": 0.362252946088893,
"grad_norm": 0.3967490494251251,
"learning_rate": 3.7972576202782104e-05,
"loss": 0.8732,
"step": 488
},
{
"epoch": 0.3629952676997309,
"grad_norm": 0.3711526393890381,
"learning_rate": 3.796193659912621e-05,
"loss": 0.8941,
"step": 489
},
{
"epoch": 0.3637375893105688,
"grad_norm": 0.358286589384079,
"learning_rate": 3.795127064957825e-05,
"loss": 0.8332,
"step": 490
},
{
"epoch": 0.3644799109214067,
"grad_norm": 0.3899291455745697,
"learning_rate": 3.7940578369782676e-05,
"loss": 0.8518,
"step": 491
},
{
"epoch": 0.3652222325322446,
"grad_norm": 0.4205942451953888,
"learning_rate": 3.79298597754225e-05,
"loss": 0.7851,
"step": 492
},
{
"epoch": 0.3659645541430825,
"grad_norm": 0.443820059299469,
"learning_rate": 3.791911488221936e-05,
"loss": 0.847,
"step": 493
},
{
"epoch": 0.36670687575392036,
"grad_norm": 0.3978535234928131,
"learning_rate": 3.790834370593345e-05,
"loss": 0.8564,
"step": 494
},
{
"epoch": 0.3674491973647583,
"grad_norm": 0.4479997754096985,
"learning_rate": 3.789754626236353e-05,
"loss": 0.8226,
"step": 495
},
{
"epoch": 0.36819151897559615,
"grad_norm": 0.3979516923427582,
"learning_rate": 3.78867225673469e-05,
"loss": 0.7801,
"step": 496
},
{
"epoch": 0.3689338405864341,
"grad_norm": 0.4172329306602478,
"learning_rate": 3.7875872636759327e-05,
"loss": 0.7681,
"step": 497
},
{
"epoch": 0.36967616219727195,
"grad_norm": 0.3862856924533844,
"learning_rate": 3.786499648651508e-05,
"loss": 0.7748,
"step": 498
},
{
"epoch": 0.3704184838081099,
"grad_norm": 0.3918432295322418,
"learning_rate": 3.78540941325669e-05,
"loss": 0.9062,
"step": 499
},
{
"epoch": 0.37116080541894775,
"grad_norm": 0.4164000153541565,
"learning_rate": 3.784316559090594e-05,
"loss": 0.8612,
"step": 500
},
{
"epoch": 0.3719031270297857,
"grad_norm": 0.40305987000465393,
"learning_rate": 3.783221087756178e-05,
"loss": 0.9142,
"step": 501
},
{
"epoch": 0.37264544864062354,
"grad_norm": 0.4271204173564911,
"learning_rate": 3.782123000860238e-05,
"loss": 0.795,
"step": 502
},
{
"epoch": 0.37338777025146147,
"grad_norm": 0.4040716290473938,
"learning_rate": 3.781022300013406e-05,
"loss": 0.8365,
"step": 503
},
{
"epoch": 0.37413009186229934,
"grad_norm": 0.41757476329803467,
"learning_rate": 3.779918986830148e-05,
"loss": 0.8417,
"step": 504
},
{
"epoch": 0.3748724134731372,
"grad_norm": 0.36810386180877686,
"learning_rate": 3.7788130629287645e-05,
"loss": 0.8355,
"step": 505
},
{
"epoch": 0.37561473508397514,
"grad_norm": 0.42814144492149353,
"learning_rate": 3.777704529931381e-05,
"loss": 0.8722,
"step": 506
},
{
"epoch": 0.376357056694813,
"grad_norm": 0.44470280408859253,
"learning_rate": 3.776593389463952e-05,
"loss": 0.8156,
"step": 507
},
{
"epoch": 0.37709937830565093,
"grad_norm": 0.3996254801750183,
"learning_rate": 3.775479643156257e-05,
"loss": 0.8909,
"step": 508
},
{
"epoch": 0.3778416999164888,
"grad_norm": 0.4491524398326874,
"learning_rate": 3.774363292641897e-05,
"loss": 0.8144,
"step": 509
},
{
"epoch": 0.37858402152732673,
"grad_norm": 0.4341849088668823,
"learning_rate": 3.7732443395582935e-05,
"loss": 0.9167,
"step": 510
},
{
"epoch": 0.3793263431381646,
"grad_norm": 0.38755953311920166,
"learning_rate": 3.772122785546684e-05,
"loss": 0.8664,
"step": 511
},
{
"epoch": 0.3800686647490025,
"grad_norm": 0.39869973063468933,
"learning_rate": 3.7709986322521217e-05,
"loss": 0.8358,
"step": 512
},
{
"epoch": 0.3808109863598404,
"grad_norm": 0.3752814531326294,
"learning_rate": 3.769871881323473e-05,
"loss": 0.8321,
"step": 513
},
{
"epoch": 0.3815533079706783,
"grad_norm": 0.4040491282939911,
"learning_rate": 3.768742534413413e-05,
"loss": 0.8743,
"step": 514
},
{
"epoch": 0.3822956295815162,
"grad_norm": 0.4491027295589447,
"learning_rate": 3.7676105931784256e-05,
"loss": 0.8705,
"step": 515
},
{
"epoch": 0.38303795119235406,
"grad_norm": 0.4306964874267578,
"learning_rate": 3.766476059278799e-05,
"loss": 0.892,
"step": 516
},
{
"epoch": 0.383780272803192,
"grad_norm": 0.43329861760139465,
"learning_rate": 3.765338934378625e-05,
"loss": 0.8469,
"step": 517
},
{
"epoch": 0.38452259441402986,
"grad_norm": 0.36683329939842224,
"learning_rate": 3.764199220145796e-05,
"loss": 0.8179,
"step": 518
},
{
"epoch": 0.3852649160248678,
"grad_norm": 0.3505510985851288,
"learning_rate": 3.7630569182520015e-05,
"loss": 0.797,
"step": 519
},
{
"epoch": 0.38600723763570566,
"grad_norm": 0.4275372624397278,
"learning_rate": 3.7619120303727274e-05,
"loss": 0.8307,
"step": 520
},
{
"epoch": 0.3867495592465436,
"grad_norm": 0.3772055208683014,
"learning_rate": 3.760764558187252e-05,
"loss": 0.8697,
"step": 521
},
{
"epoch": 0.38749188085738145,
"grad_norm": 0.348351389169693,
"learning_rate": 3.759614503378644e-05,
"loss": 0.7987,
"step": 522
},
{
"epoch": 0.3882342024682194,
"grad_norm": 0.40751683712005615,
"learning_rate": 3.75846186763376e-05,
"loss": 0.7577,
"step": 523
},
{
"epoch": 0.38897652407905725,
"grad_norm": 0.413409948348999,
"learning_rate": 3.7573066526432435e-05,
"loss": 0.9066,
"step": 524
},
{
"epoch": 0.3897188456898951,
"grad_norm": 0.39468032121658325,
"learning_rate": 3.7561488601015206e-05,
"loss": 0.8524,
"step": 525
},
{
"epoch": 0.39046116730073305,
"grad_norm": 0.38360586762428284,
"learning_rate": 3.754988491706796e-05,
"loss": 0.8282,
"step": 526
},
{
"epoch": 0.3912034889115709,
"grad_norm": 0.38212329149246216,
"learning_rate": 3.7538255491610567e-05,
"loss": 0.8489,
"step": 527
},
{
"epoch": 0.39194581052240884,
"grad_norm": 0.37275058031082153,
"learning_rate": 3.752660034170062e-05,
"loss": 0.8327,
"step": 528
},
{
"epoch": 0.3926881321332467,
"grad_norm": 0.39423465728759766,
"learning_rate": 3.7514919484433444e-05,
"loss": 0.8281,
"step": 529
},
{
"epoch": 0.39343045374408464,
"grad_norm": 0.40932732820510864,
"learning_rate": 3.75032129369421e-05,
"loss": 0.8683,
"step": 530
},
{
"epoch": 0.3941727753549225,
"grad_norm": 0.41477513313293457,
"learning_rate": 3.7491480716397296e-05,
"loss": 0.8427,
"step": 531
},
{
"epoch": 0.39491509696576044,
"grad_norm": 0.38719847798347473,
"learning_rate": 3.747972284000741e-05,
"loss": 0.8367,
"step": 532
},
{
"epoch": 0.3956574185765983,
"grad_norm": 0.39567679166793823,
"learning_rate": 3.746793932501847e-05,
"loss": 0.9394,
"step": 533
},
{
"epoch": 0.39639974018743623,
"grad_norm": 0.42474374175071716,
"learning_rate": 3.745613018871407e-05,
"loss": 0.8696,
"step": 534
},
{
"epoch": 0.3971420617982741,
"grad_norm": 0.39733052253723145,
"learning_rate": 3.7444295448415423e-05,
"loss": 0.7553,
"step": 535
},
{
"epoch": 0.397884383409112,
"grad_norm": 0.4539112150669098,
"learning_rate": 3.743243512148127e-05,
"loss": 0.791,
"step": 536
},
{
"epoch": 0.3986267050199499,
"grad_norm": 0.40971097350120544,
"learning_rate": 3.74205492253079e-05,
"loss": 0.9272,
"step": 537
},
{
"epoch": 0.39936902663078777,
"grad_norm": 0.3799310624599457,
"learning_rate": 3.740863777732909e-05,
"loss": 0.8347,
"step": 538
},
{
"epoch": 0.4001113482416257,
"grad_norm": 0.3867487609386444,
"learning_rate": 3.7396700795016106e-05,
"loss": 0.8936,
"step": 539
},
{
"epoch": 0.40085366985246357,
"grad_norm": 0.39778730273246765,
"learning_rate": 3.738473829587766e-05,
"loss": 0.8159,
"step": 540
},
{
"epoch": 0.4015959914633015,
"grad_norm": 0.3948611915111542,
"learning_rate": 3.7372750297459904e-05,
"loss": 0.7943,
"step": 541
},
{
"epoch": 0.40233831307413936,
"grad_norm": 0.3824806809425354,
"learning_rate": 3.736073681734638e-05,
"loss": 0.8733,
"step": 542
},
{
"epoch": 0.4030806346849773,
"grad_norm": 0.413933664560318,
"learning_rate": 3.7348697873158e-05,
"loss": 0.8837,
"step": 543
},
{
"epoch": 0.40382295629581516,
"grad_norm": 0.4327161908149719,
"learning_rate": 3.733663348255305e-05,
"loss": 0.8503,
"step": 544
},
{
"epoch": 0.40456527790665303,
"grad_norm": 0.4562986195087433,
"learning_rate": 3.7324543663227105e-05,
"loss": 0.7086,
"step": 545
},
{
"epoch": 0.40530759951749096,
"grad_norm": 0.40474218130111694,
"learning_rate": 3.731242843291307e-05,
"loss": 0.8736,
"step": 546
},
{
"epoch": 0.40604992112832883,
"grad_norm": 0.4206937849521637,
"learning_rate": 3.73002878093811e-05,
"loss": 0.9176,
"step": 547
},
{
"epoch": 0.40679224273916675,
"grad_norm": 0.40525129437446594,
"learning_rate": 3.7288121810438614e-05,
"loss": 0.9027,
"step": 548
},
{
"epoch": 0.4075345643500046,
"grad_norm": 0.39779412746429443,
"learning_rate": 3.727593045393024e-05,
"loss": 0.8264,
"step": 549
},
{
"epoch": 0.40827688596084255,
"grad_norm": 0.3574381470680237,
"learning_rate": 3.726371375773778e-05,
"loss": 0.8602,
"step": 550
},
{
"epoch": 0.4090192075716804,
"grad_norm": 0.37428656220436096,
"learning_rate": 3.725147173978025e-05,
"loss": 0.9414,
"step": 551
},
{
"epoch": 0.40976152918251835,
"grad_norm": 0.3972627818584442,
"learning_rate": 3.7239204418013767e-05,
"loss": 0.9304,
"step": 552
},
{
"epoch": 0.4105038507933562,
"grad_norm": 0.43916741013526917,
"learning_rate": 3.722691181043158e-05,
"loss": 0.8894,
"step": 553
},
{
"epoch": 0.41124617240419414,
"grad_norm": 0.42011746764183044,
"learning_rate": 3.7214593935064026e-05,
"loss": 0.7993,
"step": 554
},
{
"epoch": 0.411988494015032,
"grad_norm": 0.4282603859901428,
"learning_rate": 3.72022508099785e-05,
"loss": 0.8025,
"step": 555
},
{
"epoch": 0.4127308156258699,
"grad_norm": 0.4019373953342438,
"learning_rate": 3.718988245327942e-05,
"loss": 0.7938,
"step": 556
},
{
"epoch": 0.4134731372367078,
"grad_norm": 0.3733516335487366,
"learning_rate": 3.7177488883108245e-05,
"loss": 0.8268,
"step": 557
},
{
"epoch": 0.4142154588475457,
"grad_norm": 0.41670674085617065,
"learning_rate": 3.716507011764338e-05,
"loss": 0.8477,
"step": 558
},
{
"epoch": 0.4149577804583836,
"grad_norm": 0.40059223771095276,
"learning_rate": 3.715262617510022e-05,
"loss": 0.9232,
"step": 559
},
{
"epoch": 0.4157001020692215,
"grad_norm": 0.3981556296348572,
"learning_rate": 3.714015707373106e-05,
"loss": 0.7917,
"step": 560
},
{
"epoch": 0.4164424236800594,
"grad_norm": 0.42250776290893555,
"learning_rate": 3.7127662831825104e-05,
"loss": 0.831,
"step": 561
},
{
"epoch": 0.4171847452908973,
"grad_norm": 0.3976747393608093,
"learning_rate": 3.711514346770844e-05,
"loss": 0.9161,
"step": 562
},
{
"epoch": 0.4179270669017352,
"grad_norm": 0.45880788564682007,
"learning_rate": 3.710259899974401e-05,
"loss": 0.926,
"step": 563
},
{
"epoch": 0.41866938851257307,
"grad_norm": 0.4061499536037445,
"learning_rate": 3.7090029446331555e-05,
"loss": 0.9172,
"step": 564
},
{
"epoch": 0.41941171012341094,
"grad_norm": 0.5111361145973206,
"learning_rate": 3.707743482590762e-05,
"loss": 0.8568,
"step": 565
},
{
"epoch": 0.42015403173424887,
"grad_norm": 0.3791842758655548,
"learning_rate": 3.7064815156945534e-05,
"loss": 0.8478,
"step": 566
},
{
"epoch": 0.42089635334508674,
"grad_norm": 0.3697012960910797,
"learning_rate": 3.705217045795534e-05,
"loss": 0.807,
"step": 567
},
{
"epoch": 0.42163867495592466,
"grad_norm": 0.3967568874359131,
"learning_rate": 3.703950074748381e-05,
"loss": 0.7994,
"step": 568
},
{
"epoch": 0.42238099656676253,
"grad_norm": 0.379448801279068,
"learning_rate": 3.7026806044114396e-05,
"loss": 0.8293,
"step": 569
},
{
"epoch": 0.42312331817760046,
"grad_norm": 0.40183165669441223,
"learning_rate": 3.701408636646721e-05,
"loss": 0.9044,
"step": 570
},
{
"epoch": 0.42386563978843833,
"grad_norm": 0.4314356744289398,
"learning_rate": 3.7001341733199003e-05,
"loss": 0.782,
"step": 571
},
{
"epoch": 0.42460796139927626,
"grad_norm": 0.4023379385471344,
"learning_rate": 3.698857216300312e-05,
"loss": 0.9449,
"step": 572
},
{
"epoch": 0.4253502830101141,
"grad_norm": 0.37457001209259033,
"learning_rate": 3.6975777674609473e-05,
"loss": 0.8054,
"step": 573
},
{
"epoch": 0.42609260462095205,
"grad_norm": 0.3759816288948059,
"learning_rate": 3.696295828678456e-05,
"loss": 0.8421,
"step": 574
},
{
"epoch": 0.4268349262317899,
"grad_norm": 0.383489727973938,
"learning_rate": 3.695011401833136e-05,
"loss": 0.901,
"step": 575
},
{
"epoch": 0.4275772478426278,
"grad_norm": 0.41077226400375366,
"learning_rate": 3.693724488808936e-05,
"loss": 0.8094,
"step": 576
},
{
"epoch": 0.4283195694534657,
"grad_norm": 0.36784303188323975,
"learning_rate": 3.692435091493453e-05,
"loss": 0.7936,
"step": 577
},
{
"epoch": 0.4290618910643036,
"grad_norm": 0.39439964294433594,
"learning_rate": 3.691143211777925e-05,
"loss": 0.8754,
"step": 578
},
{
"epoch": 0.4298042126751415,
"grad_norm": 0.4088978171348572,
"learning_rate": 3.6898488515572326e-05,
"loss": 0.9492,
"step": 579
},
{
"epoch": 0.4305465342859794,
"grad_norm": 0.364327609539032,
"learning_rate": 3.6885520127298955e-05,
"loss": 0.9537,
"step": 580
},
{
"epoch": 0.4312888558968173,
"grad_norm": 0.4293886125087738,
"learning_rate": 3.6872526971980676e-05,
"loss": 0.8285,
"step": 581
},
{
"epoch": 0.4320311775076552,
"grad_norm": 0.3976035416126251,
"learning_rate": 3.685950906867535e-05,
"loss": 0.8095,
"step": 582
},
{
"epoch": 0.4327734991184931,
"grad_norm": 0.4648852050304413,
"learning_rate": 3.684646643647717e-05,
"loss": 0.897,
"step": 583
},
{
"epoch": 0.433515820729331,
"grad_norm": 0.36569103598594666,
"learning_rate": 3.683339909451656e-05,
"loss": 0.8027,
"step": 584
},
{
"epoch": 0.43425814234016885,
"grad_norm": 0.4245463013648987,
"learning_rate": 3.6820307061960206e-05,
"loss": 0.8928,
"step": 585
},
{
"epoch": 0.4350004639510068,
"grad_norm": 0.3979533910751343,
"learning_rate": 3.680719035801102e-05,
"loss": 0.9094,
"step": 586
},
{
"epoch": 0.43574278556184465,
"grad_norm": 0.3963554799556732,
"learning_rate": 3.679404900190807e-05,
"loss": 0.7871,
"step": 587
},
{
"epoch": 0.4364851071726826,
"grad_norm": 0.38191401958465576,
"learning_rate": 3.678088301292662e-05,
"loss": 0.7979,
"step": 588
},
{
"epoch": 0.43722742878352044,
"grad_norm": 0.3779836893081665,
"learning_rate": 3.676769241037803e-05,
"loss": 0.76,
"step": 589
},
{
"epoch": 0.43796975039435837,
"grad_norm": 0.4065380394458771,
"learning_rate": 3.67544772136098e-05,
"loss": 0.8759,
"step": 590
},
{
"epoch": 0.43871207200519624,
"grad_norm": 0.43756797909736633,
"learning_rate": 3.674123744200547e-05,
"loss": 0.9037,
"step": 591
},
{
"epoch": 0.43945439361603417,
"grad_norm": 0.36865559220314026,
"learning_rate": 3.672797311498464e-05,
"loss": 0.7835,
"step": 592
},
{
"epoch": 0.44019671522687204,
"grad_norm": 0.45453381538391113,
"learning_rate": 3.671468425200292e-05,
"loss": 0.8964,
"step": 593
},
{
"epoch": 0.44093903683770996,
"grad_norm": 0.35778266191482544,
"learning_rate": 3.670137087255192e-05,
"loss": 0.8124,
"step": 594
},
{
"epoch": 0.44168135844854783,
"grad_norm": 0.41455769538879395,
"learning_rate": 3.66880329961592e-05,
"loss": 0.8156,
"step": 595
},
{
"epoch": 0.4424236800593857,
"grad_norm": 0.38713696599006653,
"learning_rate": 3.6674670642388264e-05,
"loss": 0.9086,
"step": 596
},
{
"epoch": 0.44316600167022363,
"grad_norm": 0.36686089634895325,
"learning_rate": 3.66612838308385e-05,
"loss": 0.825,
"step": 597
},
{
"epoch": 0.4439083232810615,
"grad_norm": 0.4371680021286011,
"learning_rate": 3.664787258114518e-05,
"loss": 0.8598,
"step": 598
},
{
"epoch": 0.4446506448918994,
"grad_norm": 0.40336138010025024,
"learning_rate": 3.663443691297942e-05,
"loss": 0.8545,
"step": 599
},
{
"epoch": 0.4453929665027373,
"grad_norm": 0.4215971827507019,
"learning_rate": 3.662097684604815e-05,
"loss": 0.905,
"step": 600
},
{
"epoch": 0.4461352881135752,
"grad_norm": 0.42287206649780273,
"learning_rate": 3.66074924000941e-05,
"loss": 0.9016,
"step": 601
},
{
"epoch": 0.4468776097244131,
"grad_norm": 0.4127437472343445,
"learning_rate": 3.659398359489574e-05,
"loss": 0.8348,
"step": 602
},
{
"epoch": 0.447619931335251,
"grad_norm": 0.42018598318099976,
"learning_rate": 3.658045045026727e-05,
"loss": 0.8192,
"step": 603
},
{
"epoch": 0.4483622529460889,
"grad_norm": 0.429340660572052,
"learning_rate": 3.65668929860586e-05,
"loss": 0.8346,
"step": 604
},
{
"epoch": 0.44910457455692676,
"grad_norm": 0.4273694157600403,
"learning_rate": 3.6553311222155313e-05,
"loss": 0.8744,
"step": 605
},
{
"epoch": 0.4498468961677647,
"grad_norm": 0.38480550050735474,
"learning_rate": 3.6539705178478636e-05,
"loss": 0.8466,
"step": 606
},
{
"epoch": 0.45058921777860256,
"grad_norm": 0.38357681035995483,
"learning_rate": 3.6526074874985385e-05,
"loss": 0.7869,
"step": 607
},
{
"epoch": 0.4513315393894405,
"grad_norm": 0.4218459129333496,
"learning_rate": 3.651242033166798e-05,
"loss": 0.8489,
"step": 608
},
{
"epoch": 0.45207386100027835,
"grad_norm": 0.3404082953929901,
"learning_rate": 3.64987415685544e-05,
"loss": 0.8556,
"step": 609
},
{
"epoch": 0.4528161826111163,
"grad_norm": 0.38693150877952576,
"learning_rate": 3.648503860570813e-05,
"loss": 0.8234,
"step": 610
},
{
"epoch": 0.45355850422195415,
"grad_norm": 0.39517828822135925,
"learning_rate": 3.6471311463228164e-05,
"loss": 0.9296,
"step": 611
},
{
"epoch": 0.4543008258327921,
"grad_norm": 0.4109130799770355,
"learning_rate": 3.6457560161248956e-05,
"loss": 0.8357,
"step": 612
},
{
"epoch": 0.45504314744362995,
"grad_norm": 0.39871203899383545,
"learning_rate": 3.64437847199404e-05,
"loss": 0.9087,
"step": 613
},
{
"epoch": 0.4557854690544679,
"grad_norm": 0.3984218239784241,
"learning_rate": 3.642998515950779e-05,
"loss": 0.7781,
"step": 614
},
{
"epoch": 0.45652779066530574,
"grad_norm": 0.37950950860977173,
"learning_rate": 3.6416161500191806e-05,
"loss": 0.8539,
"step": 615
},
{
"epoch": 0.4572701122761436,
"grad_norm": 0.3692931830883026,
"learning_rate": 3.640231376226847e-05,
"loss": 0.8544,
"step": 616
},
{
"epoch": 0.45801243388698154,
"grad_norm": 0.3967374265193939,
"learning_rate": 3.638844196604911e-05,
"loss": 0.8591,
"step": 617
},
{
"epoch": 0.4587547554978194,
"grad_norm": 0.3724336624145508,
"learning_rate": 3.6374546131880375e-05,
"loss": 0.8421,
"step": 618
},
{
"epoch": 0.45949707710865734,
"grad_norm": 0.398163765668869,
"learning_rate": 3.636062628014414e-05,
"loss": 0.8612,
"step": 619
},
{
"epoch": 0.4602393987194952,
"grad_norm": 0.3659161329269409,
"learning_rate": 3.634668243125752e-05,
"loss": 0.8387,
"step": 620
},
{
"epoch": 0.46098172033033313,
"grad_norm": 0.4477706551551819,
"learning_rate": 3.633271460567284e-05,
"loss": 0.9355,
"step": 621
},
{
"epoch": 0.461724041941171,
"grad_norm": 0.4127034842967987,
"learning_rate": 3.6318722823877555e-05,
"loss": 0.8034,
"step": 622
},
{
"epoch": 0.46246636355200893,
"grad_norm": 0.4307912290096283,
"learning_rate": 3.6304707106394295e-05,
"loss": 0.874,
"step": 623
},
{
"epoch": 0.4632086851628468,
"grad_norm": 0.4227927029132843,
"learning_rate": 3.6290667473780796e-05,
"loss": 0.8189,
"step": 624
},
{
"epoch": 0.4639510067736847,
"grad_norm": 0.3703235983848572,
"learning_rate": 3.627660394662986e-05,
"loss": 0.8073,
"step": 625
},
{
"epoch": 0.4646933283845226,
"grad_norm": 0.46729329228401184,
"learning_rate": 3.6262516545569325e-05,
"loss": 0.8373,
"step": 626
},
{
"epoch": 0.46543564999536047,
"grad_norm": 0.41088923811912537,
"learning_rate": 3.624840529126207e-05,
"loss": 0.8126,
"step": 627
},
{
"epoch": 0.4661779716061984,
"grad_norm": 0.3681904673576355,
"learning_rate": 3.623427020440595e-05,
"loss": 0.8599,
"step": 628
},
{
"epoch": 0.46692029321703626,
"grad_norm": 0.3651503324508667,
"learning_rate": 3.622011130573378e-05,
"loss": 0.8385,
"step": 629
},
{
"epoch": 0.4676626148278742,
"grad_norm": 0.3684818148612976,
"learning_rate": 3.62059286160133e-05,
"loss": 0.8341,
"step": 630
},
{
"epoch": 0.46840493643871206,
"grad_norm": 0.37057459354400635,
"learning_rate": 3.619172215604714e-05,
"loss": 0.9151,
"step": 631
},
{
"epoch": 0.46914725804955,
"grad_norm": 0.4190065264701843,
"learning_rate": 3.61774919466728e-05,
"loss": 0.8517,
"step": 632
},
{
"epoch": 0.46988957966038786,
"grad_norm": 0.42590153217315674,
"learning_rate": 3.616323800876262e-05,
"loss": 0.7829,
"step": 633
},
{
"epoch": 0.4706319012712258,
"grad_norm": 0.407207190990448,
"learning_rate": 3.614896036322374e-05,
"loss": 0.7631,
"step": 634
},
{
"epoch": 0.47137422288206365,
"grad_norm": 0.4581393897533417,
"learning_rate": 3.613465903099807e-05,
"loss": 0.8206,
"step": 635
},
{
"epoch": 0.4721165444929015,
"grad_norm": 0.39499345421791077,
"learning_rate": 3.6120334033062264e-05,
"loss": 0.8448,
"step": 636
},
{
"epoch": 0.47285886610373945,
"grad_norm": 0.35499414801597595,
"learning_rate": 3.6105985390427696e-05,
"loss": 0.7845,
"step": 637
},
{
"epoch": 0.4736011877145773,
"grad_norm": 0.38710108399391174,
"learning_rate": 3.6091613124140404e-05,
"loss": 0.901,
"step": 638
},
{
"epoch": 0.47434350932541525,
"grad_norm": 0.4199868142604828,
"learning_rate": 3.6077217255281094e-05,
"loss": 0.8268,
"step": 639
},
{
"epoch": 0.4750858309362531,
"grad_norm": 0.384318470954895,
"learning_rate": 3.606279780496508e-05,
"loss": 0.8498,
"step": 640
},
{
"epoch": 0.47582815254709104,
"grad_norm": 0.38041505217552185,
"learning_rate": 3.604835479434227e-05,
"loss": 0.7877,
"step": 641
},
{
"epoch": 0.4765704741579289,
"grad_norm": 0.3970247209072113,
"learning_rate": 3.6033888244597136e-05,
"loss": 0.8143,
"step": 642
},
{
"epoch": 0.47731279576876684,
"grad_norm": 0.40868648886680603,
"learning_rate": 3.6019398176948656e-05,
"loss": 0.8718,
"step": 643
},
{
"epoch": 0.4780551173796047,
"grad_norm": 0.3927657902240753,
"learning_rate": 3.6004884612650326e-05,
"loss": 0.8719,
"step": 644
},
{
"epoch": 0.47879743899044264,
"grad_norm": 0.37920475006103516,
"learning_rate": 3.599034757299009e-05,
"loss": 0.8496,
"step": 645
},
{
"epoch": 0.4795397606012805,
"grad_norm": 0.3785390257835388,
"learning_rate": 3.597578707929033e-05,
"loss": 0.7623,
"step": 646
},
{
"epoch": 0.4802820822121184,
"grad_norm": 0.3962065279483795,
"learning_rate": 3.596120315290782e-05,
"loss": 0.7983,
"step": 647
},
{
"epoch": 0.4810244038229563,
"grad_norm": 0.40997880697250366,
"learning_rate": 3.594659581523374e-05,
"loss": 0.7733,
"step": 648
},
{
"epoch": 0.4817667254337942,
"grad_norm": 0.3517996072769165,
"learning_rate": 3.593196508769355e-05,
"loss": 0.85,
"step": 649
},
{
"epoch": 0.4825090470446321,
"grad_norm": 0.3954550325870514,
"learning_rate": 3.591731099174708e-05,
"loss": 0.837,
"step": 650
},
{
"epoch": 0.48325136865546997,
"grad_norm": 0.3854595124721527,
"learning_rate": 3.590263354888839e-05,
"loss": 0.8456,
"step": 651
},
{
"epoch": 0.4839936902663079,
"grad_norm": 0.3883722126483917,
"learning_rate": 3.58879327806458e-05,
"loss": 0.7988,
"step": 652
},
{
"epoch": 0.48473601187714577,
"grad_norm": 0.432900607585907,
"learning_rate": 3.5873208708581844e-05,
"loss": 0.8776,
"step": 653
},
{
"epoch": 0.4854783334879837,
"grad_norm": 0.37475481629371643,
"learning_rate": 3.5858461354293244e-05,
"loss": 0.8021,
"step": 654
},
{
"epoch": 0.48622065509882156,
"grad_norm": 0.4133627116680145,
"learning_rate": 3.584369073941086e-05,
"loss": 0.8003,
"step": 655
},
{
"epoch": 0.48696297670965943,
"grad_norm": 0.3836146891117096,
"learning_rate": 3.5828896885599666e-05,
"loss": 0.8419,
"step": 656
},
{
"epoch": 0.48770529832049736,
"grad_norm": 0.40715518593788147,
"learning_rate": 3.5814079814558737e-05,
"loss": 0.8652,
"step": 657
},
{
"epoch": 0.48844761993133523,
"grad_norm": 0.4351827800273895,
"learning_rate": 3.57992395480212e-05,
"loss": 0.9617,
"step": 658
},
{
"epoch": 0.48918994154217316,
"grad_norm": 0.3888394832611084,
"learning_rate": 3.578437610775418e-05,
"loss": 0.8957,
"step": 659
},
{
"epoch": 0.489932263153011,
"grad_norm": 0.3931278586387634,
"learning_rate": 3.5769489515558835e-05,
"loss": 0.9177,
"step": 660
},
{
"epoch": 0.49067458476384895,
"grad_norm": 0.3771383464336395,
"learning_rate": 3.575457979327024e-05,
"loss": 0.8871,
"step": 661
},
{
"epoch": 0.4914169063746868,
"grad_norm": 0.43624410033226013,
"learning_rate": 3.5739646962757426e-05,
"loss": 0.8575,
"step": 662
},
{
"epoch": 0.49215922798552475,
"grad_norm": 0.38663193583488464,
"learning_rate": 3.5724691045923296e-05,
"loss": 0.7904,
"step": 663
},
{
"epoch": 0.4929015495963626,
"grad_norm": 0.40553534030914307,
"learning_rate": 3.570971206470463e-05,
"loss": 0.8474,
"step": 664
},
{
"epoch": 0.49364387120720055,
"grad_norm": 0.37837696075439453,
"learning_rate": 3.5694710041072034e-05,
"loss": 0.7521,
"step": 665
},
{
"epoch": 0.4943861928180384,
"grad_norm": 0.3879217505455017,
"learning_rate": 3.567968499702991e-05,
"loss": 0.9081,
"step": 666
},
{
"epoch": 0.4951285144288763,
"grad_norm": 0.32567813992500305,
"learning_rate": 3.566463695461643e-05,
"loss": 0.7584,
"step": 667
},
{
"epoch": 0.4958708360397142,
"grad_norm": 0.40887537598609924,
"learning_rate": 3.564956593590349e-05,
"loss": 0.7644,
"step": 668
},
{
"epoch": 0.4966131576505521,
"grad_norm": 0.3539418876171112,
"learning_rate": 3.5634471962996704e-05,
"loss": 0.7637,
"step": 669
},
{
"epoch": 0.49735547926139,
"grad_norm": 0.43159979581832886,
"learning_rate": 3.5619355058035326e-05,
"loss": 0.8638,
"step": 670
},
{
"epoch": 0.4980978008722279,
"grad_norm": 0.3916085958480835,
"learning_rate": 3.560421524319227e-05,
"loss": 0.8409,
"step": 671
},
{
"epoch": 0.4988401224830658,
"grad_norm": 0.36610978841781616,
"learning_rate": 3.558905254067405e-05,
"loss": 0.7831,
"step": 672
},
{
"epoch": 0.4995824440939037,
"grad_norm": 0.3962661623954773,
"learning_rate": 3.5573866972720746e-05,
"loss": 0.8764,
"step": 673
},
{
"epoch": 0.5003247657047416,
"grad_norm": 0.5491713881492615,
"learning_rate": 3.555865856160598e-05,
"loss": 0.8311,
"step": 674
},
{
"epoch": 0.5010670873155795,
"grad_norm": 0.34136977791786194,
"learning_rate": 3.554342732963687e-05,
"loss": 0.7774,
"step": 675
},
{
"epoch": 0.5018094089264173,
"grad_norm": 0.3658745586872101,
"learning_rate": 3.5528173299154015e-05,
"loss": 0.7664,
"step": 676
},
{
"epoch": 0.5025517305372552,
"grad_norm": 0.40912461280822754,
"learning_rate": 3.551289649253144e-05,
"loss": 0.7996,
"step": 677
},
{
"epoch": 0.5032940521480932,
"grad_norm": 0.4073297083377838,
"learning_rate": 3.5497596932176624e-05,
"loss": 0.8359,
"step": 678
},
{
"epoch": 0.5040363737589311,
"grad_norm": 0.3738069534301758,
"learning_rate": 3.548227464053035e-05,
"loss": 0.928,
"step": 679
},
{
"epoch": 0.5047786953697689,
"grad_norm": 0.37688982486724854,
"learning_rate": 3.54669296400668e-05,
"loss": 0.8951,
"step": 680
},
{
"epoch": 0.5055210169806068,
"grad_norm": 0.3885107934474945,
"learning_rate": 3.545156195329343e-05,
"loss": 0.8279,
"step": 681
},
{
"epoch": 0.5062633385914448,
"grad_norm": 0.35046374797821045,
"learning_rate": 3.5436171602750995e-05,
"loss": 0.7523,
"step": 682
},
{
"epoch": 0.5070056602022827,
"grad_norm": 0.3633882403373718,
"learning_rate": 3.542075861101347e-05,
"loss": 0.8077,
"step": 683
},
{
"epoch": 0.5077479818131205,
"grad_norm": 0.3783382475376129,
"learning_rate": 3.5405323000688056e-05,
"loss": 0.7674,
"step": 684
},
{
"epoch": 0.5084903034239584,
"grad_norm": 0.38828450441360474,
"learning_rate": 3.538986479441513e-05,
"loss": 0.7737,
"step": 685
},
{
"epoch": 0.5092326250347963,
"grad_norm": 0.4310080409049988,
"learning_rate": 3.53743840148682e-05,
"loss": 0.8116,
"step": 686
},
{
"epoch": 0.5099749466456343,
"grad_norm": 0.38372164964675903,
"learning_rate": 3.53588806847539e-05,
"loss": 0.7766,
"step": 687
},
{
"epoch": 0.5107172682564721,
"grad_norm": 0.3972887098789215,
"learning_rate": 3.534335482681192e-05,
"loss": 0.7485,
"step": 688
},
{
"epoch": 0.51145958986731,
"grad_norm": 0.3987019956111908,
"learning_rate": 3.5327806463815e-05,
"loss": 0.7545,
"step": 689
},
{
"epoch": 0.5122019114781479,
"grad_norm": 0.36997538805007935,
"learning_rate": 3.53122356185689e-05,
"loss": 0.8385,
"step": 690
},
{
"epoch": 0.5129442330889858,
"grad_norm": 0.40917664766311646,
"learning_rate": 3.529664231391236e-05,
"loss": 0.9488,
"step": 691
},
{
"epoch": 0.5136865546998237,
"grad_norm": 0.3602358102798462,
"learning_rate": 3.5281026572717025e-05,
"loss": 0.7981,
"step": 692
},
{
"epoch": 0.5144288763106616,
"grad_norm": 0.44117608666419983,
"learning_rate": 3.52653884178875e-05,
"loss": 0.7832,
"step": 693
},
{
"epoch": 0.5151711979214995,
"grad_norm": 0.3614718019962311,
"learning_rate": 3.524972787236124e-05,
"loss": 0.8741,
"step": 694
},
{
"epoch": 0.5159135195323374,
"grad_norm": 0.3987668752670288,
"learning_rate": 3.5234044959108534e-05,
"loss": 0.8139,
"step": 695
},
{
"epoch": 0.5166558411431753,
"grad_norm": 0.37257644534111023,
"learning_rate": 3.5218339701132486e-05,
"loss": 0.8719,
"step": 696
},
{
"epoch": 0.5173981627540132,
"grad_norm": 0.43900442123413086,
"learning_rate": 3.520261212146899e-05,
"loss": 0.797,
"step": 697
},
{
"epoch": 0.518140484364851,
"grad_norm": 0.3919014632701874,
"learning_rate": 3.518686224318665e-05,
"loss": 0.8669,
"step": 698
},
{
"epoch": 0.5188828059756889,
"grad_norm": 0.3852793574333191,
"learning_rate": 3.5171090089386816e-05,
"loss": 0.8776,
"step": 699
},
{
"epoch": 0.5196251275865269,
"grad_norm": 0.3601354956626892,
"learning_rate": 3.515529568320347e-05,
"loss": 0.8386,
"step": 700
},
{
"epoch": 0.5203674491973648,
"grad_norm": 0.4443444609642029,
"learning_rate": 3.5139479047803254e-05,
"loss": 0.8605,
"step": 701
},
{
"epoch": 0.5211097708082026,
"grad_norm": 0.3493961989879608,
"learning_rate": 3.5123640206385425e-05,
"loss": 0.7663,
"step": 702
},
{
"epoch": 0.5218520924190405,
"grad_norm": 0.4082690179347992,
"learning_rate": 3.510777918218179e-05,
"loss": 0.7387,
"step": 703
},
{
"epoch": 0.5225944140298785,
"grad_norm": 0.41370537877082825,
"learning_rate": 3.509189599845671e-05,
"loss": 0.8461,
"step": 704
},
{
"epoch": 0.5233367356407164,
"grad_norm": 0.39665308594703674,
"learning_rate": 3.507599067850703e-05,
"loss": 0.8814,
"step": 705
},
{
"epoch": 0.5240790572515542,
"grad_norm": 0.37258079648017883,
"learning_rate": 3.5060063245662085e-05,
"loss": 0.9054,
"step": 706
},
{
"epoch": 0.5248213788623921,
"grad_norm": 0.3601747751235962,
"learning_rate": 3.504411372328362e-05,
"loss": 0.8276,
"step": 707
},
{
"epoch": 0.52556370047323,
"grad_norm": 0.3841734826564789,
"learning_rate": 3.5028142134765794e-05,
"loss": 0.8039,
"step": 708
},
{
"epoch": 0.526306022084068,
"grad_norm": 0.432404100894928,
"learning_rate": 3.501214850353515e-05,
"loss": 0.8261,
"step": 709
},
{
"epoch": 0.5270483436949058,
"grad_norm": 0.42703214287757874,
"learning_rate": 3.499613285305053e-05,
"loss": 0.8143,
"step": 710
},
{
"epoch": 0.5277906653057437,
"grad_norm": 0.4320433735847473,
"learning_rate": 3.498009520680309e-05,
"loss": 0.9653,
"step": 711
},
{
"epoch": 0.5285329869165816,
"grad_norm": 0.37552034854888916,
"learning_rate": 3.496403558831625e-05,
"loss": 0.8471,
"step": 712
},
{
"epoch": 0.5292753085274196,
"grad_norm": 0.4038369953632355,
"learning_rate": 3.494795402114564e-05,
"loss": 0.9201,
"step": 713
},
{
"epoch": 0.5300176301382574,
"grad_norm": 0.41206666827201843,
"learning_rate": 3.4931850528879105e-05,
"loss": 0.9038,
"step": 714
},
{
"epoch": 0.5307599517490953,
"grad_norm": 0.4328741431236267,
"learning_rate": 3.491572513513664e-05,
"loss": 0.7807,
"step": 715
},
{
"epoch": 0.5315022733599332,
"grad_norm": 0.39441853761672974,
"learning_rate": 3.489957786357037e-05,
"loss": 0.8205,
"step": 716
},
{
"epoch": 0.532244594970771,
"grad_norm": 0.38545891642570496,
"learning_rate": 3.488340873786451e-05,
"loss": 0.8253,
"step": 717
},
{
"epoch": 0.532986916581609,
"grad_norm": 0.3683164417743683,
"learning_rate": 3.4867217781735296e-05,
"loss": 0.7755,
"step": 718
},
{
"epoch": 0.5337292381924469,
"grad_norm": 0.3887660503387451,
"learning_rate": 3.485100501893105e-05,
"loss": 0.8543,
"step": 719
},
{
"epoch": 0.5344715598032848,
"grad_norm": 0.44087541103363037,
"learning_rate": 3.4834770473232014e-05,
"loss": 0.8477,
"step": 720
},
{
"epoch": 0.5352138814141226,
"grad_norm": 0.3935564458370209,
"learning_rate": 3.481851416845042e-05,
"loss": 0.7372,
"step": 721
},
{
"epoch": 0.5359562030249606,
"grad_norm": 0.3968266546726227,
"learning_rate": 3.4802236128430404e-05,
"loss": 0.8384,
"step": 722
},
{
"epoch": 0.5366985246357985,
"grad_norm": 0.4122190773487091,
"learning_rate": 3.4785936377047975e-05,
"loss": 0.9467,
"step": 723
},
{
"epoch": 0.5374408462466364,
"grad_norm": 0.40589356422424316,
"learning_rate": 3.476961493821101e-05,
"loss": 0.849,
"step": 724
},
{
"epoch": 0.5381831678574742,
"grad_norm": 0.37581416964530945,
"learning_rate": 3.475327183585916e-05,
"loss": 0.8122,
"step": 725
},
{
"epoch": 0.5389254894683121,
"grad_norm": 0.41619259119033813,
"learning_rate": 3.473690709396389e-05,
"loss": 0.8647,
"step": 726
},
{
"epoch": 0.5396678110791501,
"grad_norm": 0.3226104974746704,
"learning_rate": 3.472052073652837e-05,
"loss": 0.7746,
"step": 727
},
{
"epoch": 0.5404101326899879,
"grad_norm": 0.37998801469802856,
"learning_rate": 3.4704112787587496e-05,
"loss": 0.869,
"step": 728
},
{
"epoch": 0.5411524543008258,
"grad_norm": 0.3882545530796051,
"learning_rate": 3.468768327120784e-05,
"loss": 0.8667,
"step": 729
},
{
"epoch": 0.5418947759116637,
"grad_norm": 0.3522816002368927,
"learning_rate": 3.4671232211487595e-05,
"loss": 0.7567,
"step": 730
},
{
"epoch": 0.5426370975225017,
"grad_norm": 0.4263498783111572,
"learning_rate": 3.4654759632556555e-05,
"loss": 0.8234,
"step": 731
},
{
"epoch": 0.5433794191333395,
"grad_norm": 0.34016892313957214,
"learning_rate": 3.463826555857607e-05,
"loss": 0.8479,
"step": 732
},
{
"epoch": 0.5441217407441774,
"grad_norm": 0.4033363461494446,
"learning_rate": 3.4621750013739035e-05,
"loss": 0.8393,
"step": 733
},
{
"epoch": 0.5448640623550153,
"grad_norm": 0.3858112394809723,
"learning_rate": 3.4605213022269825e-05,
"loss": 0.8389,
"step": 734
},
{
"epoch": 0.5456063839658533,
"grad_norm": 0.3489677608013153,
"learning_rate": 3.458865460842428e-05,
"loss": 0.7695,
"step": 735
},
{
"epoch": 0.5463487055766911,
"grad_norm": 0.39663171768188477,
"learning_rate": 3.457207479648965e-05,
"loss": 0.8842,
"step": 736
},
{
"epoch": 0.547091027187529,
"grad_norm": 0.3862421214580536,
"learning_rate": 3.455547361078459e-05,
"loss": 0.849,
"step": 737
},
{
"epoch": 0.5478333487983669,
"grad_norm": 0.3735657036304474,
"learning_rate": 3.453885107565908e-05,
"loss": 0.7996,
"step": 738
},
{
"epoch": 0.5485756704092047,
"grad_norm": 0.4164029359817505,
"learning_rate": 3.4522207215494444e-05,
"loss": 0.8688,
"step": 739
},
{
"epoch": 0.5493179920200427,
"grad_norm": 0.3550073802471161,
"learning_rate": 3.4505542054703256e-05,
"loss": 0.7474,
"step": 740
},
{
"epoch": 0.5500603136308806,
"grad_norm": 0.3763352036476135,
"learning_rate": 3.4488855617729356e-05,
"loss": 0.7542,
"step": 741
},
{
"epoch": 0.5508026352417185,
"grad_norm": 0.41175779700279236,
"learning_rate": 3.447214792904778e-05,
"loss": 0.7904,
"step": 742
},
{
"epoch": 0.5515449568525563,
"grad_norm": 0.40034011006355286,
"learning_rate": 3.445541901316473e-05,
"loss": 0.8647,
"step": 743
},
{
"epoch": 0.5522872784633943,
"grad_norm": 0.3602341115474701,
"learning_rate": 3.443866889461755e-05,
"loss": 0.8479,
"step": 744
},
{
"epoch": 0.5530296000742322,
"grad_norm": 0.4051341712474823,
"learning_rate": 3.442189759797469e-05,
"loss": 0.8418,
"step": 745
},
{
"epoch": 0.5537719216850701,
"grad_norm": 0.4291275441646576,
"learning_rate": 3.440510514783565e-05,
"loss": 0.8203,
"step": 746
},
{
"epoch": 0.5545142432959079,
"grad_norm": 0.38578981161117554,
"learning_rate": 3.4388291568830965e-05,
"loss": 0.8703,
"step": 747
},
{
"epoch": 0.5552565649067458,
"grad_norm": 0.48124009370803833,
"learning_rate": 3.437145688562216e-05,
"loss": 0.8084,
"step": 748
},
{
"epoch": 0.5559988865175838,
"grad_norm": 0.41740983724594116,
"learning_rate": 3.435460112290172e-05,
"loss": 0.7703,
"step": 749
},
{
"epoch": 0.5567412081284216,
"grad_norm": 0.4573703110218048,
"learning_rate": 3.433772430539303e-05,
"loss": 0.9169,
"step": 750
},
{
"epoch": 0.5574835297392595,
"grad_norm": 0.4573703110218048,
"learning_rate": 3.433772430539303e-05,
"loss": 0.7791,
"step": 751
},
{
"epoch": 0.5582258513500974,
"grad_norm": 0.4058649241924286,
"learning_rate": 3.432082645785038e-05,
"loss": 0.8604,
"step": 752
},
{
"epoch": 0.5589681729609354,
"grad_norm": 0.3486841917037964,
"learning_rate": 3.430390760505889e-05,
"loss": 0.7316,
"step": 753
},
{
"epoch": 0.5597104945717732,
"grad_norm": 0.36052748560905457,
"learning_rate": 3.428696777183451e-05,
"loss": 0.6917,
"step": 754
},
{
"epoch": 0.5604528161826111,
"grad_norm": 0.40812695026397705,
"learning_rate": 3.427000698302393e-05,
"loss": 0.9129,
"step": 755
},
{
"epoch": 0.561195137793449,
"grad_norm": 0.402180939912796,
"learning_rate": 3.4253025263504605e-05,
"loss": 0.8565,
"step": 756
},
{
"epoch": 0.5619374594042869,
"grad_norm": 0.3584100604057312,
"learning_rate": 3.4236022638184684e-05,
"loss": 0.926,
"step": 757
},
{
"epoch": 0.5626797810151248,
"grad_norm": 0.4021860957145691,
"learning_rate": 3.421899913200297e-05,
"loss": 0.8613,
"step": 758
},
{
"epoch": 0.5634221026259627,
"grad_norm": 0.4699723422527313,
"learning_rate": 3.420195476992892e-05,
"loss": 0.8572,
"step": 759
},
{
"epoch": 0.5641644242368006,
"grad_norm": 0.37108978629112244,
"learning_rate": 3.418488957696253e-05,
"loss": 0.8848,
"step": 760
},
{
"epoch": 0.5649067458476384,
"grad_norm": 0.4078989326953888,
"learning_rate": 3.4167803578134406e-05,
"loss": 0.9227,
"step": 761
},
{
"epoch": 0.5656490674584764,
"grad_norm": 0.399107426404953,
"learning_rate": 3.4150696798505644e-05,
"loss": 0.7975,
"step": 762
},
{
"epoch": 0.5663913890693143,
"grad_norm": 0.3846902549266815,
"learning_rate": 3.413356926316782e-05,
"loss": 0.7224,
"step": 763
},
{
"epoch": 0.5671337106801522,
"grad_norm": 0.37825313210487366,
"learning_rate": 3.4116420997242964e-05,
"loss": 0.8094,
"step": 764
},
{
"epoch": 0.56787603229099,
"grad_norm": 0.37176600098609924,
"learning_rate": 3.40992520258835e-05,
"loss": 0.8543,
"step": 765
},
{
"epoch": 0.568618353901828,
"grad_norm": 0.3635413646697998,
"learning_rate": 3.408206237427223e-05,
"loss": 0.8697,
"step": 766
},
{
"epoch": 0.5693606755126659,
"grad_norm": 0.4106957018375397,
"learning_rate": 3.406485206762229e-05,
"loss": 0.8703,
"step": 767
},
{
"epoch": 0.5701029971235038,
"grad_norm": 0.35687169432640076,
"learning_rate": 3.40476211311771e-05,
"loss": 0.9626,
"step": 768
},
{
"epoch": 0.5708453187343416,
"grad_norm": 0.4074583351612091,
"learning_rate": 3.403036959021036e-05,
"loss": 0.888,
"step": 769
},
{
"epoch": 0.5715876403451795,
"grad_norm": 0.33353114128112793,
"learning_rate": 3.401309747002598e-05,
"loss": 0.8315,
"step": 770
},
{
"epoch": 0.5723299619560175,
"grad_norm": 0.3577583432197571,
"learning_rate": 3.3995804795958055e-05,
"loss": 0.8436,
"step": 771
},
{
"epoch": 0.5730722835668554,
"grad_norm": 0.4142861068248749,
"learning_rate": 3.3978491593370814e-05,
"loss": 0.9741,
"step": 772
},
{
"epoch": 0.5738146051776932,
"grad_norm": 0.4031902253627777,
"learning_rate": 3.3961157887658624e-05,
"loss": 0.8151,
"step": 773
},
{
"epoch": 0.5745569267885311,
"grad_norm": 0.45672792196273804,
"learning_rate": 3.394380370424592e-05,
"loss": 0.8195,
"step": 774
},
{
"epoch": 0.5752992483993691,
"grad_norm": 0.3539574444293976,
"learning_rate": 3.392642906858714e-05,
"loss": 0.753,
"step": 775
},
{
"epoch": 0.576041570010207,
"grad_norm": 0.4349236488342285,
"learning_rate": 3.390903400616677e-05,
"loss": 0.7667,
"step": 776
},
{
"epoch": 0.5767838916210448,
"grad_norm": 0.42073139548301697,
"learning_rate": 3.389161854249921e-05,
"loss": 0.8512,
"step": 777
},
{
"epoch": 0.5775262132318827,
"grad_norm": 0.39800146222114563,
"learning_rate": 3.3874182703128825e-05,
"loss": 0.8022,
"step": 778
},
{
"epoch": 0.5782685348427206,
"grad_norm": 0.47772639989852905,
"learning_rate": 3.385672651362984e-05,
"loss": 0.9928,
"step": 779
},
{
"epoch": 0.5790108564535585,
"grad_norm": 0.35686033964157104,
"learning_rate": 3.383924999960633e-05,
"loss": 0.7766,
"step": 780
},
{
"epoch": 0.5797531780643964,
"grad_norm": 0.38933950662612915,
"learning_rate": 3.3821753186692194e-05,
"loss": 0.7371,
"step": 781
},
{
"epoch": 0.5804954996752343,
"grad_norm": 0.3493152856826782,
"learning_rate": 3.38042361005511e-05,
"loss": 0.8075,
"step": 782
},
{
"epoch": 0.5812378212860722,
"grad_norm": 0.3548669219017029,
"learning_rate": 3.378669876687645e-05,
"loss": 0.7585,
"step": 783
},
{
"epoch": 0.5819801428969101,
"grad_norm": 0.4344249665737152,
"learning_rate": 3.3769141211391336e-05,
"loss": 0.818,
"step": 784
},
{
"epoch": 0.582722464507748,
"grad_norm": 0.3780219852924347,
"learning_rate": 3.375156345984853e-05,
"loss": 0.8477,
"step": 785
},
{
"epoch": 0.5834647861185859,
"grad_norm": 0.3857625722885132,
"learning_rate": 3.373396553803041e-05,
"loss": 0.9338,
"step": 786
},
{
"epoch": 0.5842071077294237,
"grad_norm": 0.4386730194091797,
"learning_rate": 3.371634747174895e-05,
"loss": 0.8064,
"step": 787
},
{
"epoch": 0.5849494293402616,
"grad_norm": 0.39791983366012573,
"learning_rate": 3.3698709286845655e-05,
"loss": 0.9154,
"step": 788
},
{
"epoch": 0.5856917509510996,
"grad_norm": 0.3847380578517914,
"learning_rate": 3.368105100919156e-05,
"loss": 0.8217,
"step": 789
},
{
"epoch": 0.5864340725619375,
"grad_norm": 0.36979803442955017,
"learning_rate": 3.3663372664687154e-05,
"loss": 0.7634,
"step": 790
},
{
"epoch": 0.5871763941727753,
"grad_norm": 0.38267242908477783,
"learning_rate": 3.3645674279262376e-05,
"loss": 0.7904,
"step": 791
},
{
"epoch": 0.5879187157836132,
"grad_norm": 0.47703686356544495,
"learning_rate": 3.362795587887655e-05,
"loss": 0.8899,
"step": 792
},
{
"epoch": 0.5886610373944512,
"grad_norm": 0.3864832818508148,
"learning_rate": 3.361021748951835e-05,
"loss": 0.8206,
"step": 793
},
{
"epoch": 0.5894033590052891,
"grad_norm": 0.367291659116745,
"learning_rate": 3.359245913720578e-05,
"loss": 0.7678,
"step": 794
},
{
"epoch": 0.5901456806161269,
"grad_norm": 0.3677731454372406,
"learning_rate": 3.357468084798613e-05,
"loss": 0.7437,
"step": 795
},
{
"epoch": 0.5908880022269648,
"grad_norm": 0.3390463888645172,
"learning_rate": 3.355688264793592e-05,
"loss": 0.8462,
"step": 796
},
{
"epoch": 0.5916303238378027,
"grad_norm": 0.3929853141307831,
"learning_rate": 3.3539064563160874e-05,
"loss": 0.8486,
"step": 797
},
{
"epoch": 0.5923726454486407,
"grad_norm": 0.42393356561660767,
"learning_rate": 3.35212266197959e-05,
"loss": 0.9258,
"step": 798
},
{
"epoch": 0.5931149670594785,
"grad_norm": 0.4595308005809784,
"learning_rate": 3.350336884400501e-05,
"loss": 0.8199,
"step": 799
},
{
"epoch": 0.5938572886703164,
"grad_norm": 0.39049914479255676,
"learning_rate": 3.348549126198132e-05,
"loss": 0.8381,
"step": 800
},
{
"epoch": 0.5945996102811543,
"grad_norm": 0.3757772147655487,
"learning_rate": 3.346759389994699e-05,
"loss": 0.7859,
"step": 801
},
{
"epoch": 0.5953419318919922,
"grad_norm": 0.3846074938774109,
"learning_rate": 3.344967678415321e-05,
"loss": 0.9707,
"step": 802
},
{
"epoch": 0.5960842535028301,
"grad_norm": 0.3639870285987854,
"learning_rate": 3.343173994088012e-05,
"loss": 0.8917,
"step": 803
},
{
"epoch": 0.596826575113668,
"grad_norm": 0.4003622531890869,
"learning_rate": 3.3413783396436805e-05,
"loss": 0.8328,
"step": 804
},
{
"epoch": 0.5975688967245059,
"grad_norm": 0.37097591161727905,
"learning_rate": 3.339580717716126e-05,
"loss": 0.8419,
"step": 805
},
{
"epoch": 0.5983112183353438,
"grad_norm": 0.377591997385025,
"learning_rate": 3.337781130942031e-05,
"loss": 0.7994,
"step": 806
},
{
"epoch": 0.5990535399461817,
"grad_norm": 0.3893373906612396,
"learning_rate": 3.3359795819609624e-05,
"loss": 0.8612,
"step": 807
},
{
"epoch": 0.5997958615570196,
"grad_norm": 0.43286728858947754,
"learning_rate": 3.334176073415364e-05,
"loss": 0.804,
"step": 808
},
{
"epoch": 0.6005381831678575,
"grad_norm": 0.38128551840782166,
"learning_rate": 3.332370607950555e-05,
"loss": 0.8117,
"step": 809
},
{
"epoch": 0.6012805047786953,
"grad_norm": 0.42448368668556213,
"learning_rate": 3.330563188214724e-05,
"loss": 0.9025,
"step": 810
},
{
"epoch": 0.6020228263895333,
"grad_norm": 0.38770368695259094,
"learning_rate": 3.328753816858925e-05,
"loss": 0.8392,
"step": 811
},
{
"epoch": 0.6027651480003712,
"grad_norm": 0.39272114634513855,
"learning_rate": 3.326942496537077e-05,
"loss": 0.8368,
"step": 812
},
{
"epoch": 0.603507469611209,
"grad_norm": 0.49103134870529175,
"learning_rate": 3.325129229905956e-05,
"loss": 0.9017,
"step": 813
},
{
"epoch": 0.6042497912220469,
"grad_norm": 0.3955812454223633,
"learning_rate": 3.323314019625193e-05,
"loss": 0.8856,
"step": 814
},
{
"epoch": 0.6049921128328849,
"grad_norm": 0.34001222252845764,
"learning_rate": 3.321496868357272e-05,
"loss": 0.7656,
"step": 815
},
{
"epoch": 0.6057344344437228,
"grad_norm": 0.43722572922706604,
"learning_rate": 3.319677778767518e-05,
"loss": 0.8833,
"step": 816
},
{
"epoch": 0.6064767560545606,
"grad_norm": 0.42245838046073914,
"learning_rate": 3.3178567535241066e-05,
"loss": 0.8043,
"step": 817
},
{
"epoch": 0.6072190776653985,
"grad_norm": 0.3846532106399536,
"learning_rate": 3.3160337952980475e-05,
"loss": 0.8255,
"step": 818
},
{
"epoch": 0.6079613992762364,
"grad_norm": 0.34817036986351013,
"learning_rate": 3.314208906763188e-05,
"loss": 0.8635,
"step": 819
},
{
"epoch": 0.6087037208870744,
"grad_norm": 0.3706674575805664,
"learning_rate": 3.312382090596205e-05,
"loss": 0.7863,
"step": 820
},
{
"epoch": 0.6094460424979122,
"grad_norm": 0.39172613620758057,
"learning_rate": 3.310553349476604e-05,
"loss": 0.7935,
"step": 821
},
{
"epoch": 0.6101883641087501,
"grad_norm": 0.401347279548645,
"learning_rate": 3.308722686086714e-05,
"loss": 0.8842,
"step": 822
},
{
"epoch": 0.610930685719588,
"grad_norm": 0.384919136762619,
"learning_rate": 3.306890103111682e-05,
"loss": 0.8038,
"step": 823
},
{
"epoch": 0.611673007330426,
"grad_norm": 0.4191407859325409,
"learning_rate": 3.305055603239473e-05,
"loss": 0.9038,
"step": 824
},
{
"epoch": 0.6124153289412638,
"grad_norm": 0.41282200813293457,
"learning_rate": 3.303219189160862e-05,
"loss": 0.768,
"step": 825
},
{
"epoch": 0.6131576505521017,
"grad_norm": 0.3830862045288086,
"learning_rate": 3.301380863569431e-05,
"loss": 0.8179,
"step": 826
},
{
"epoch": 0.6138999721629396,
"grad_norm": 0.3940469026565552,
"learning_rate": 3.299540629161568e-05,
"loss": 0.8294,
"step": 827
},
{
"epoch": 0.6146422937737774,
"grad_norm": 0.38618114590644836,
"learning_rate": 3.2976984886364586e-05,
"loss": 0.8532,
"step": 828
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.394499272108078,
"learning_rate": 3.295854444696086e-05,
"loss": 0.8044,
"step": 829
},
{
"epoch": 0.6161269369954533,
"grad_norm": 0.4039349853992462,
"learning_rate": 3.294008500045223e-05,
"loss": 0.8388,
"step": 830
},
{
"epoch": 0.6168692586062912,
"grad_norm": 0.38985222578048706,
"learning_rate": 3.292160657391433e-05,
"loss": 0.8092,
"step": 831
},
{
"epoch": 0.617611580217129,
"grad_norm": 0.416485458612442,
"learning_rate": 3.290310919445062e-05,
"loss": 0.8467,
"step": 832
},
{
"epoch": 0.618353901827967,
"grad_norm": 0.4300270676612854,
"learning_rate": 3.288459288919236e-05,
"loss": 0.8217,
"step": 833
},
{
"epoch": 0.6190962234388049,
"grad_norm": 0.3902897238731384,
"learning_rate": 3.286605768529856e-05,
"loss": 0.8181,
"step": 834
},
{
"epoch": 0.6198385450496428,
"grad_norm": 0.4227379262447357,
"learning_rate": 3.2847503609955955e-05,
"loss": 0.8944,
"step": 835
},
{
"epoch": 0.6205808666604806,
"grad_norm": 0.4002430737018585,
"learning_rate": 3.2828930690378976e-05,
"loss": 0.7856,
"step": 836
},
{
"epoch": 0.6213231882713185,
"grad_norm": 0.42526379227638245,
"learning_rate": 3.281033895380969e-05,
"loss": 0.8948,
"step": 837
},
{
"epoch": 0.6220655098821565,
"grad_norm": 0.37326619029045105,
"learning_rate": 3.279172842751773e-05,
"loss": 0.8338,
"step": 838
},
{
"epoch": 0.6228078314929943,
"grad_norm": 0.3695979714393616,
"learning_rate": 3.277309913880033e-05,
"loss": 0.8823,
"step": 839
},
{
"epoch": 0.6235501531038322,
"grad_norm": 0.3828786015510559,
"learning_rate": 3.2754451114982235e-05,
"loss": 0.9321,
"step": 840
},
{
"epoch": 0.6242924747146701,
"grad_norm": 0.3889092803001404,
"learning_rate": 3.273578438341566e-05,
"loss": 0.8804,
"step": 841
},
{
"epoch": 0.6250347963255081,
"grad_norm": 0.35321757197380066,
"learning_rate": 3.271709897148025e-05,
"loss": 0.8862,
"step": 842
},
{
"epoch": 0.6257771179363459,
"grad_norm": 0.3559962213039398,
"learning_rate": 3.269839490658308e-05,
"loss": 0.8031,
"step": 843
},
{
"epoch": 0.6265194395471838,
"grad_norm": 0.3700624108314514,
"learning_rate": 3.267967221615856e-05,
"loss": 0.7905,
"step": 844
},
{
"epoch": 0.6272617611580217,
"grad_norm": 0.3585308790206909,
"learning_rate": 3.266093092766843e-05,
"loss": 0.7957,
"step": 845
},
{
"epoch": 0.6280040827688597,
"grad_norm": 0.36834481358528137,
"learning_rate": 3.2642171068601705e-05,
"loss": 0.8328,
"step": 846
},
{
"epoch": 0.6287464043796975,
"grad_norm": 0.38698065280914307,
"learning_rate": 3.262339266647464e-05,
"loss": 0.8505,
"step": 847
},
{
"epoch": 0.6294887259905354,
"grad_norm": 0.39008861780166626,
"learning_rate": 3.260459574883069e-05,
"loss": 0.8754,
"step": 848
},
{
"epoch": 0.6302310476013733,
"grad_norm": 0.3963302671909332,
"learning_rate": 3.258578034324046e-05,
"loss": 0.8419,
"step": 849
},
{
"epoch": 0.6309733692122111,
"grad_norm": 0.39984002709388733,
"learning_rate": 3.2566946477301684e-05,
"loss": 0.8469,
"step": 850
},
{
"epoch": 0.6317156908230491,
"grad_norm": 0.39819765090942383,
"learning_rate": 3.254809417863916e-05,
"loss": 0.8328,
"step": 851
},
{
"epoch": 0.632458012433887,
"grad_norm": 0.4232073128223419,
"learning_rate": 3.252922347490474e-05,
"loss": 0.8927,
"step": 852
},
{
"epoch": 0.6332003340447249,
"grad_norm": 0.3462536334991455,
"learning_rate": 3.2510334393777254e-05,
"loss": 0.8007,
"step": 853
},
{
"epoch": 0.6339426556555627,
"grad_norm": 0.3509646952152252,
"learning_rate": 3.24914269629625e-05,
"loss": 0.8285,
"step": 854
},
{
"epoch": 0.6346849772664007,
"grad_norm": 0.38063347339630127,
"learning_rate": 3.247250121019318e-05,
"loss": 0.8141,
"step": 855
},
{
"epoch": 0.6354272988772386,
"grad_norm": 0.4435429871082306,
"learning_rate": 3.245355716322887e-05,
"loss": 0.8522,
"step": 856
},
{
"epoch": 0.6361696204880765,
"grad_norm": 0.41526708006858826,
"learning_rate": 3.2434594849856006e-05,
"loss": 0.8535,
"step": 857
},
{
"epoch": 0.6369119420989143,
"grad_norm": 0.43962687253952026,
"learning_rate": 3.241561429788777e-05,
"loss": 0.8415,
"step": 858
},
{
"epoch": 0.6376542637097522,
"grad_norm": 0.3527955710887909,
"learning_rate": 3.239661553516414e-05,
"loss": 0.9008,
"step": 859
},
{
"epoch": 0.6383965853205902,
"grad_norm": 0.36493077874183655,
"learning_rate": 3.237759858955177e-05,
"loss": 0.7132,
"step": 860
},
{
"epoch": 0.639138906931428,
"grad_norm": 0.38288044929504395,
"learning_rate": 3.235856348894401e-05,
"loss": 0.8561,
"step": 861
},
{
"epoch": 0.6398812285422659,
"grad_norm": 0.4172011911869049,
"learning_rate": 3.2339510261260826e-05,
"loss": 0.8374,
"step": 862
},
{
"epoch": 0.6406235501531038,
"grad_norm": 0.4358551502227783,
"learning_rate": 3.2320438934448774e-05,
"loss": 0.9169,
"step": 863
},
{
"epoch": 0.6413658717639418,
"grad_norm": 0.35578832030296326,
"learning_rate": 3.230134953648096e-05,
"loss": 0.8731,
"step": 864
},
{
"epoch": 0.6421081933747796,
"grad_norm": 0.4214344620704651,
"learning_rate": 3.228224209535698e-05,
"loss": 0.8212,
"step": 865
},
{
"epoch": 0.6428505149856175,
"grad_norm": 0.40245211124420166,
"learning_rate": 3.226311663910293e-05,
"loss": 0.8055,
"step": 866
},
{
"epoch": 0.6435928365964554,
"grad_norm": 0.3829837739467621,
"learning_rate": 3.22439731957713e-05,
"loss": 0.8086,
"step": 867
},
{
"epoch": 0.6443351582072933,
"grad_norm": 0.43586400151252747,
"learning_rate": 3.222481179344096e-05,
"loss": 0.8499,
"step": 868
},
{
"epoch": 0.6450774798181312,
"grad_norm": 0.3507193326950073,
"learning_rate": 3.220563246021716e-05,
"loss": 0.8941,
"step": 869
},
{
"epoch": 0.6458198014289691,
"grad_norm": 0.3587462306022644,
"learning_rate": 3.21864352242314e-05,
"loss": 0.7786,
"step": 870
},
{
"epoch": 0.646562123039807,
"grad_norm": 0.37669169902801514,
"learning_rate": 3.216722011364147e-05,
"loss": 0.7859,
"step": 871
},
{
"epoch": 0.6473044446506449,
"grad_norm": 0.4303325414657593,
"learning_rate": 3.2147987156631377e-05,
"loss": 0.8269,
"step": 872
},
{
"epoch": 0.6480467662614828,
"grad_norm": 0.45145362615585327,
"learning_rate": 3.212873638141129e-05,
"loss": 0.8287,
"step": 873
},
{
"epoch": 0.6487890878723207,
"grad_norm": 0.4037097692489624,
"learning_rate": 3.210946781621752e-05,
"loss": 0.7869,
"step": 874
},
{
"epoch": 0.6495314094831586,
"grad_norm": 0.37830254435539246,
"learning_rate": 3.2090181489312476e-05,
"loss": 0.8967,
"step": 875
},
{
"epoch": 0.6502737310939964,
"grad_norm": 0.39029765129089355,
"learning_rate": 3.207087742898461e-05,
"loss": 0.7591,
"step": 876
},
{
"epoch": 0.6510160527048344,
"grad_norm": 0.35120412707328796,
"learning_rate": 3.20515556635484e-05,
"loss": 0.8334,
"step": 877
},
{
"epoch": 0.6517583743156723,
"grad_norm": 0.36138907074928284,
"learning_rate": 3.203221622134428e-05,
"loss": 0.7944,
"step": 878
},
{
"epoch": 0.6525006959265102,
"grad_norm": 0.3629201650619507,
"learning_rate": 3.2012859130738615e-05,
"loss": 0.852,
"step": 879
},
{
"epoch": 0.653243017537348,
"grad_norm": 0.4027819037437439,
"learning_rate": 3.199348442012366e-05,
"loss": 0.9275,
"step": 880
},
{
"epoch": 0.6539853391481859,
"grad_norm": 0.36106258630752563,
"learning_rate": 3.1974092117917504e-05,
"loss": 0.7495,
"step": 881
},
{
"epoch": 0.6547276607590239,
"grad_norm": 0.49002131819725037,
"learning_rate": 3.1954682252564054e-05,
"loss": 0.818,
"step": 882
},
{
"epoch": 0.6554699823698618,
"grad_norm": 0.36758434772491455,
"learning_rate": 3.1935254852532956e-05,
"loss": 0.8609,
"step": 883
},
{
"epoch": 0.6562123039806996,
"grad_norm": 0.4289117157459259,
"learning_rate": 3.19158099463196e-05,
"loss": 0.8596,
"step": 884
},
{
"epoch": 0.6569546255915375,
"grad_norm": 0.3696727752685547,
"learning_rate": 3.189634756244504e-05,
"loss": 0.8629,
"step": 885
},
{
"epoch": 0.6576969472023755,
"grad_norm": 0.3562766909599304,
"learning_rate": 3.187686772945597e-05,
"loss": 0.7651,
"step": 886
},
{
"epoch": 0.6584392688132134,
"grad_norm": 0.38660067319869995,
"learning_rate": 3.185737047592467e-05,
"loss": 0.9487,
"step": 887
},
{
"epoch": 0.6591815904240512,
"grad_norm": 0.421682208776474,
"learning_rate": 3.1837855830448976e-05,
"loss": 0.8885,
"step": 888
},
{
"epoch": 0.6599239120348891,
"grad_norm": 0.366974800825119,
"learning_rate": 3.181832382165223e-05,
"loss": 0.8434,
"step": 889
},
{
"epoch": 0.660666233645727,
"grad_norm": 0.4101247787475586,
"learning_rate": 3.179877447818326e-05,
"loss": 0.8329,
"step": 890
},
{
"epoch": 0.661408555256565,
"grad_norm": 0.3925361931324005,
"learning_rate": 3.17792078287163e-05,
"loss": 0.866,
"step": 891
},
{
"epoch": 0.6621508768674028,
"grad_norm": 0.39480844140052795,
"learning_rate": 3.175962390195098e-05,
"loss": 0.8342,
"step": 892
},
{
"epoch": 0.6628931984782407,
"grad_norm": 0.42709940671920776,
"learning_rate": 3.174002272661226e-05,
"loss": 0.8079,
"step": 893
},
{
"epoch": 0.6636355200890786,
"grad_norm": 0.3921566307544708,
"learning_rate": 3.172040433145041e-05,
"loss": 0.8701,
"step": 894
},
{
"epoch": 0.6643778416999165,
"grad_norm": 0.37820595502853394,
"learning_rate": 3.1700768745240945e-05,
"loss": 0.8168,
"step": 895
},
{
"epoch": 0.6651201633107544,
"grad_norm": 0.4000493884086609,
"learning_rate": 3.1681115996784617e-05,
"loss": 0.7657,
"step": 896
},
{
"epoch": 0.6658624849215923,
"grad_norm": 0.4290461838245392,
"learning_rate": 3.1661446114907325e-05,
"loss": 0.7515,
"step": 897
},
{
"epoch": 0.6666048065324301,
"grad_norm": 0.3985341191291809,
"learning_rate": 3.164175912846011e-05,
"loss": 0.7869,
"step": 898
},
{
"epoch": 0.667347128143268,
"grad_norm": 0.3736097812652588,
"learning_rate": 3.162205506631911e-05,
"loss": 0.8461,
"step": 899
},
{
"epoch": 0.668089449754106,
"grad_norm": 0.4430498480796814,
"learning_rate": 3.1602333957385495e-05,
"loss": 0.88,
"step": 900
},
{
"epoch": 0.6688317713649439,
"grad_norm": 0.4275624454021454,
"learning_rate": 3.158259583058545e-05,
"loss": 0.833,
"step": 901
},
{
"epoch": 0.6695740929757817,
"grad_norm": 0.42715683579444885,
"learning_rate": 3.156284071487012e-05,
"loss": 0.8321,
"step": 902
},
{
"epoch": 0.6703164145866196,
"grad_norm": 0.46634772419929504,
"learning_rate": 3.154306863921555e-05,
"loss": 0.8799,
"step": 903
},
{
"epoch": 0.6710587361974576,
"grad_norm": 0.38095808029174805,
"learning_rate": 3.1523279632622684e-05,
"loss": 0.805,
"step": 904
},
{
"epoch": 0.6718010578082955,
"grad_norm": 0.3953753709793091,
"learning_rate": 3.150347372411729e-05,
"loss": 0.9367,
"step": 905
},
{
"epoch": 0.6725433794191333,
"grad_norm": 0.40556415915489197,
"learning_rate": 3.148365094274994e-05,
"loss": 0.8616,
"step": 906
},
{
"epoch": 0.6732857010299712,
"grad_norm": 0.36998236179351807,
"learning_rate": 3.1463811317595924e-05,
"loss": 0.8632,
"step": 907
},
{
"epoch": 0.6740280226408091,
"grad_norm": 0.5273242592811584,
"learning_rate": 3.144395487775527e-05,
"loss": 0.7877,
"step": 908
},
{
"epoch": 0.6747703442516471,
"grad_norm": 0.43254518508911133,
"learning_rate": 3.142408165235266e-05,
"loss": 0.7899,
"step": 909
},
{
"epoch": 0.6755126658624849,
"grad_norm": 0.41343677043914795,
"learning_rate": 3.140419167053738e-05,
"loss": 0.8354,
"step": 910
},
{
"epoch": 0.6762549874733228,
"grad_norm": 0.3693787157535553,
"learning_rate": 3.13842849614833e-05,
"loss": 0.7484,
"step": 911
},
{
"epoch": 0.6769973090841607,
"grad_norm": 0.34831342101097107,
"learning_rate": 3.136436155438885e-05,
"loss": 0.8299,
"step": 912
},
{
"epoch": 0.6777396306949987,
"grad_norm": 0.3597358167171478,
"learning_rate": 3.1344421478476926e-05,
"loss": 0.8714,
"step": 913
},
{
"epoch": 0.6784819523058365,
"grad_norm": 0.36480751633644104,
"learning_rate": 3.132446476299488e-05,
"loss": 0.8386,
"step": 914
},
{
"epoch": 0.6792242739166744,
"grad_norm": 0.3980991542339325,
"learning_rate": 3.130449143721447e-05,
"loss": 0.8226,
"step": 915
},
{
"epoch": 0.6799665955275123,
"grad_norm": 0.39509183168411255,
"learning_rate": 3.128450153043181e-05,
"loss": 0.7421,
"step": 916
},
{
"epoch": 0.6807089171383502,
"grad_norm": 0.40358635783195496,
"learning_rate": 3.126449507196736e-05,
"loss": 0.8075,
"step": 917
},
{
"epoch": 0.6814512387491881,
"grad_norm": 0.42663317918777466,
"learning_rate": 3.124447209116583e-05,
"loss": 0.8733,
"step": 918
},
{
"epoch": 0.682193560360026,
"grad_norm": 0.4373573064804077,
"learning_rate": 3.122443261739616e-05,
"loss": 0.8067,
"step": 919
},
{
"epoch": 0.6829358819708639,
"grad_norm": 0.3898407518863678,
"learning_rate": 3.1204376680051525e-05,
"loss": 0.9386,
"step": 920
},
{
"epoch": 0.6836782035817017,
"grad_norm": 0.3621942698955536,
"learning_rate": 3.11843043085492e-05,
"loss": 0.8037,
"step": 921
},
{
"epoch": 0.6844205251925397,
"grad_norm": 0.3760909140110016,
"learning_rate": 3.1164215532330585e-05,
"loss": 0.8503,
"step": 922
},
{
"epoch": 0.6851628468033776,
"grad_norm": 0.37526753544807434,
"learning_rate": 3.1144110380861134e-05,
"loss": 0.8107,
"step": 923
},
{
"epoch": 0.6859051684142154,
"grad_norm": 0.40608811378479004,
"learning_rate": 3.1123988883630337e-05,
"loss": 0.8461,
"step": 924
},
{
"epoch": 0.6866474900250533,
"grad_norm": 0.32391631603240967,
"learning_rate": 3.1103851070151646e-05,
"loss": 0.8786,
"step": 925
},
{
"epoch": 0.6873898116358913,
"grad_norm": 0.42110610008239746,
"learning_rate": 3.108369696996245e-05,
"loss": 0.8611,
"step": 926
},
{
"epoch": 0.6881321332467292,
"grad_norm": 0.4104180932044983,
"learning_rate": 3.1063526612624015e-05,
"loss": 0.8632,
"step": 927
},
{
"epoch": 0.688874454857567,
"grad_norm": 0.401500403881073,
"learning_rate": 3.104334002772146e-05,
"loss": 0.8179,
"step": 928
},
{
"epoch": 0.6896167764684049,
"grad_norm": 0.3811194598674774,
"learning_rate": 3.1023137244863726e-05,
"loss": 0.7376,
"step": 929
},
{
"epoch": 0.6903590980792428,
"grad_norm": 0.34555327892303467,
"learning_rate": 3.100291829368348e-05,
"loss": 0.8595,
"step": 930
},
{
"epoch": 0.6911014196900808,
"grad_norm": 0.3416372835636139,
"learning_rate": 3.098268320383711e-05,
"loss": 0.9128,
"step": 931
},
{
"epoch": 0.6918437413009186,
"grad_norm": 0.36647018790245056,
"learning_rate": 3.09624320050047e-05,
"loss": 0.7548,
"step": 932
},
{
"epoch": 0.6925860629117565,
"grad_norm": 0.3531283140182495,
"learning_rate": 3.094216472688994e-05,
"loss": 0.8315,
"step": 933
},
{
"epoch": 0.6933283845225944,
"grad_norm": 0.42129501700401306,
"learning_rate": 3.092188139922011e-05,
"loss": 0.8368,
"step": 934
},
{
"epoch": 0.6940707061334324,
"grad_norm": 0.3378678858280182,
"learning_rate": 3.090158205174603e-05,
"loss": 0.9293,
"step": 935
},
{
"epoch": 0.6948130277442702,
"grad_norm": 0.38736778497695923,
"learning_rate": 3.0881266714242005e-05,
"loss": 0.8752,
"step": 936
},
{
"epoch": 0.6955553493551081,
"grad_norm": 0.34928473830223083,
"learning_rate": 3.086093541650583e-05,
"loss": 0.7956,
"step": 937
},
{
"epoch": 0.696297670965946,
"grad_norm": 0.3697172701358795,
"learning_rate": 3.0840588188358685e-05,
"loss": 0.8248,
"step": 938
},
{
"epoch": 0.6970399925767838,
"grad_norm": 0.4416426420211792,
"learning_rate": 3.08202250596451e-05,
"loss": 0.8146,
"step": 939
},
{
"epoch": 0.6977823141876218,
"grad_norm": 0.4334758520126343,
"learning_rate": 3.0799846060232955e-05,
"loss": 0.7391,
"step": 940
},
{
"epoch": 0.6985246357984597,
"grad_norm": 0.3890143632888794,
"learning_rate": 3.07794512200134e-05,
"loss": 0.7769,
"step": 941
},
{
"epoch": 0.6992669574092976,
"grad_norm": 0.411953330039978,
"learning_rate": 3.075904056890082e-05,
"loss": 0.9243,
"step": 942
},
{
"epoch": 0.7000092790201354,
"grad_norm": 0.45792150497436523,
"learning_rate": 3.073861413683278e-05,
"loss": 0.9176,
"step": 943
},
{
"epoch": 0.7007516006309734,
"grad_norm": 0.37603330612182617,
"learning_rate": 3.0718171953770016e-05,
"loss": 0.8958,
"step": 944
},
{
"epoch": 0.7014939222418113,
"grad_norm": 0.387465238571167,
"learning_rate": 3.069771404969633e-05,
"loss": 0.7828,
"step": 945
},
{
"epoch": 0.7022362438526492,
"grad_norm": 0.3826201558113098,
"learning_rate": 3.067724045461864e-05,
"loss": 0.8544,
"step": 946
},
{
"epoch": 0.702978565463487,
"grad_norm": 0.40352147817611694,
"learning_rate": 3.0656751198566803e-05,
"loss": 0.8082,
"step": 947
},
{
"epoch": 0.7037208870743249,
"grad_norm": 0.4045282304286957,
"learning_rate": 3.0636246311593717e-05,
"loss": 0.8526,
"step": 948
},
{
"epoch": 0.7044632086851629,
"grad_norm": 0.40600502490997314,
"learning_rate": 3.061572582377517e-05,
"loss": 0.9317,
"step": 949
},
{
"epoch": 0.7052055302960007,
"grad_norm": 0.3995624780654907,
"learning_rate": 3.0595189765209834e-05,
"loss": 0.8922,
"step": 950
},
{
"epoch": 0.7059478519068386,
"grad_norm": 0.4132879376411438,
"learning_rate": 3.057463816601924e-05,
"loss": 0.867,
"step": 951
},
{
"epoch": 0.7066901735176765,
"grad_norm": 0.39816924929618835,
"learning_rate": 3.0554071056347674e-05,
"loss": 0.8102,
"step": 952
},
{
"epoch": 0.7074324951285145,
"grad_norm": 0.44873765110969543,
"learning_rate": 3.0533488466362235e-05,
"loss": 0.8293,
"step": 953
},
{
"epoch": 0.7081748167393523,
"grad_norm": 0.3815595507621765,
"learning_rate": 3.051289042625266e-05,
"loss": 0.8287,
"step": 954
},
{
"epoch": 0.7089171383501902,
"grad_norm": 0.39245954155921936,
"learning_rate": 3.0492276966231388e-05,
"loss": 0.8116,
"step": 955
},
{
"epoch": 0.7096594599610281,
"grad_norm": 0.3837089538574219,
"learning_rate": 3.047164811653347e-05,
"loss": 0.8383,
"step": 956
},
{
"epoch": 0.7104017815718661,
"grad_norm": 0.3879731297492981,
"learning_rate": 3.0451003907416522e-05,
"loss": 0.8636,
"step": 957
},
{
"epoch": 0.7111441031827039,
"grad_norm": 0.44654250144958496,
"learning_rate": 3.0430344369160695e-05,
"loss": 0.862,
"step": 958
},
{
"epoch": 0.7118864247935418,
"grad_norm": 0.38784080743789673,
"learning_rate": 3.0409669532068618e-05,
"loss": 0.7992,
"step": 959
},
{
"epoch": 0.7126287464043797,
"grad_norm": 0.38243433833122253,
"learning_rate": 3.038897942646536e-05,
"loss": 0.9515,
"step": 960
},
{
"epoch": 0.7133710680152175,
"grad_norm": 0.36308029294013977,
"learning_rate": 3.0368274082698402e-05,
"loss": 0.7527,
"step": 961
},
{
"epoch": 0.7141133896260555,
"grad_norm": 0.3840027153491974,
"learning_rate": 3.034755353113755e-05,
"loss": 0.8217,
"step": 962
},
{
"epoch": 0.7148557112368934,
"grad_norm": 0.33255666494369507,
"learning_rate": 3.0326817802174928e-05,
"loss": 0.7772,
"step": 963
},
{
"epoch": 0.7155980328477313,
"grad_norm": 0.35044682025909424,
"learning_rate": 3.0306066926224925e-05,
"loss": 0.7885,
"step": 964
},
{
"epoch": 0.7163403544585691,
"grad_norm": 0.4197009801864624,
"learning_rate": 3.028530093372415e-05,
"loss": 0.9473,
"step": 965
},
{
"epoch": 0.7170826760694071,
"grad_norm": 0.36847788095474243,
"learning_rate": 3.026451985513138e-05,
"loss": 0.8673,
"step": 966
},
{
"epoch": 0.717824997680245,
"grad_norm": 0.3890594244003296,
"learning_rate": 3.02437237209275e-05,
"loss": 0.8133,
"step": 967
},
{
"epoch": 0.7185673192910829,
"grad_norm": 0.43013694882392883,
"learning_rate": 3.0222912561615517e-05,
"loss": 0.906,
"step": 968
},
{
"epoch": 0.7193096409019207,
"grad_norm": 0.4074924886226654,
"learning_rate": 3.020208640772045e-05,
"loss": 0.8783,
"step": 969
},
{
"epoch": 0.7200519625127586,
"grad_norm": 0.4189145863056183,
"learning_rate": 3.018124528978931e-05,
"loss": 0.895,
"step": 970
},
{
"epoch": 0.7207942841235966,
"grad_norm": 0.3682161271572113,
"learning_rate": 3.0160389238391067e-05,
"loss": 0.7772,
"step": 971
},
{
"epoch": 0.7215366057344345,
"grad_norm": 0.3887183666229248,
"learning_rate": 3.01395182841166e-05,
"loss": 0.9046,
"step": 972
},
{
"epoch": 0.7222789273452723,
"grad_norm": 0.35502782464027405,
"learning_rate": 3.0118632457578624e-05,
"loss": 0.778,
"step": 973
},
{
"epoch": 0.7230212489561102,
"grad_norm": 0.3781295120716095,
"learning_rate": 3.0097731789411688e-05,
"loss": 0.7921,
"step": 974
},
{
"epoch": 0.7237635705669482,
"grad_norm": 0.436229407787323,
"learning_rate": 3.007681631027212e-05,
"loss": 0.9466,
"step": 975
},
{
"epoch": 0.724505892177786,
"grad_norm": 0.37810221314430237,
"learning_rate": 3.0055886050837935e-05,
"loss": 0.7668,
"step": 976
},
{
"epoch": 0.7252482137886239,
"grad_norm": 0.43310630321502686,
"learning_rate": 3.0034941041808853e-05,
"loss": 0.7256,
"step": 977
},
{
"epoch": 0.7259905353994618,
"grad_norm": 0.4197729825973511,
"learning_rate": 3.0013981313906235e-05,
"loss": 0.8872,
"step": 978
},
{
"epoch": 0.7267328570102997,
"grad_norm": 0.4396391808986664,
"learning_rate": 2.999300689787302e-05,
"loss": 0.7703,
"step": 979
},
{
"epoch": 0.7274751786211376,
"grad_norm": 0.37335899472236633,
"learning_rate": 2.9972017824473682e-05,
"loss": 0.8266,
"step": 980
},
{
"epoch": 0.7282175002319755,
"grad_norm": 0.38292384147644043,
"learning_rate": 2.9951014124494206e-05,
"loss": 0.7715,
"step": 981
},
{
"epoch": 0.7289598218428134,
"grad_norm": 0.3634425401687622,
"learning_rate": 2.9929995828742032e-05,
"loss": 0.9531,
"step": 982
},
{
"epoch": 0.7297021434536513,
"grad_norm": 0.3410262167453766,
"learning_rate": 2.9908962968046e-05,
"loss": 0.7864,
"step": 983
},
{
"epoch": 0.7304444650644892,
"grad_norm": 0.4278801381587982,
"learning_rate": 2.988791557325632e-05,
"loss": 0.8387,
"step": 984
},
{
"epoch": 0.7311867866753271,
"grad_norm": 0.394827276468277,
"learning_rate": 2.9866853675244523e-05,
"loss": 0.8465,
"step": 985
},
{
"epoch": 0.731929108286165,
"grad_norm": 0.3513793349266052,
"learning_rate": 2.98457773049034e-05,
"loss": 0.792,
"step": 986
},
{
"epoch": 0.7326714298970028,
"grad_norm": 0.33131977915763855,
"learning_rate": 2.9824686493146977e-05,
"loss": 0.9048,
"step": 987
},
{
"epoch": 0.7334137515078407,
"grad_norm": 0.36972224712371826,
"learning_rate": 2.9803581270910466e-05,
"loss": 0.7708,
"step": 988
},
{
"epoch": 0.7341560731186787,
"grad_norm": 0.4358534812927246,
"learning_rate": 2.9782461669150212e-05,
"loss": 0.781,
"step": 989
},
{
"epoch": 0.7348983947295166,
"grad_norm": 0.37654241919517517,
"learning_rate": 2.9761327718843648e-05,
"loss": 0.8687,
"step": 990
},
{
"epoch": 0.7356407163403544,
"grad_norm": 0.409423291683197,
"learning_rate": 2.9740179450989265e-05,
"loss": 0.8757,
"step": 991
},
{
"epoch": 0.7363830379511923,
"grad_norm": 0.36852940917015076,
"learning_rate": 2.9719016896606535e-05,
"loss": 0.7978,
"step": 992
},
{
"epoch": 0.7371253595620303,
"grad_norm": 0.4487001299858093,
"learning_rate": 2.9697840086735898e-05,
"loss": 0.8977,
"step": 993
},
{
"epoch": 0.7378676811728682,
"grad_norm": 0.4240976870059967,
"learning_rate": 2.9676649052438706e-05,
"loss": 0.8973,
"step": 994
},
{
"epoch": 0.738610002783706,
"grad_norm": 0.4073665142059326,
"learning_rate": 2.965544382479717e-05,
"loss": 0.8245,
"step": 995
},
{
"epoch": 0.7393523243945439,
"grad_norm": 0.40183866024017334,
"learning_rate": 2.963422443491431e-05,
"loss": 0.8173,
"step": 996
},
{
"epoch": 0.7400946460053819,
"grad_norm": 0.4195357859134674,
"learning_rate": 2.9612990913913935e-05,
"loss": 0.8614,
"step": 997
},
{
"epoch": 0.7408369676162198,
"grad_norm": 0.3881988823413849,
"learning_rate": 2.959174329294058e-05,
"loss": 0.8538,
"step": 998
},
{
"epoch": 0.7415792892270576,
"grad_norm": 0.4063870906829834,
"learning_rate": 2.957048160315944e-05,
"loss": 0.7881,
"step": 999
},
{
"epoch": 0.7423216108378955,
"grad_norm": 0.3842681646347046,
"learning_rate": 2.954920587575637e-05,
"loss": 0.8986,
"step": 1000
},
{
"epoch": 0.7430639324487334,
"grad_norm": 0.379730224609375,
"learning_rate": 2.952791614193781e-05,
"loss": 0.7682,
"step": 1001
},
{
"epoch": 0.7438062540595713,
"grad_norm": 0.36264169216156006,
"learning_rate": 2.9506612432930722e-05,
"loss": 0.7856,
"step": 1002
},
{
"epoch": 0.7445485756704092,
"grad_norm": 0.3864235579967499,
"learning_rate": 2.948529477998261e-05,
"loss": 0.7882,
"step": 1003
},
{
"epoch": 0.7452908972812471,
"grad_norm": 0.42890578508377075,
"learning_rate": 2.946396321436138e-05,
"loss": 0.8788,
"step": 1004
},
{
"epoch": 0.746033218892085,
"grad_norm": 0.3864159882068634,
"learning_rate": 2.944261776735539e-05,
"loss": 0.8518,
"step": 1005
},
{
"epoch": 0.7467755405029229,
"grad_norm": 0.3886793255805969,
"learning_rate": 2.9421258470273317e-05,
"loss": 0.842,
"step": 1006
},
{
"epoch": 0.7475178621137608,
"grad_norm": 0.4735707938671112,
"learning_rate": 2.9399885354444184e-05,
"loss": 0.9169,
"step": 1007
},
{
"epoch": 0.7482601837245987,
"grad_norm": 0.3890320956707001,
"learning_rate": 2.9378498451217275e-05,
"loss": 0.8453,
"step": 1008
},
{
"epoch": 0.7490025053354366,
"grad_norm": 0.36628207564353943,
"learning_rate": 2.9357097791962093e-05,
"loss": 0.7281,
"step": 1009
},
{
"epoch": 0.7497448269462744,
"grad_norm": 0.35675758123397827,
"learning_rate": 2.933568340806831e-05,
"loss": 0.8212,
"step": 1010
},
{
"epoch": 0.7504871485571124,
"grad_norm": 0.3940046429634094,
"learning_rate": 2.931425533094575e-05,
"loss": 0.833,
"step": 1011
},
{
"epoch": 0.7512294701679503,
"grad_norm": 0.3780806362628937,
"learning_rate": 2.9292813592024304e-05,
"loss": 0.8061,
"step": 1012
},
{
"epoch": 0.7519717917787881,
"grad_norm": 0.4200505316257477,
"learning_rate": 2.92713582227539e-05,
"loss": 0.804,
"step": 1013
},
{
"epoch": 0.752714113389626,
"grad_norm": 0.4011412262916565,
"learning_rate": 2.924988925460448e-05,
"loss": 0.8795,
"step": 1014
},
{
"epoch": 0.753456435000464,
"grad_norm": 0.42554283142089844,
"learning_rate": 2.92284067190659e-05,
"loss": 0.872,
"step": 1015
},
{
"epoch": 0.7541987566113019,
"grad_norm": 0.44452059268951416,
"learning_rate": 2.920691064764795e-05,
"loss": 0.8406,
"step": 1016
},
{
"epoch": 0.7549410782221397,
"grad_norm": 0.36996594071388245,
"learning_rate": 2.9185401071880256e-05,
"loss": 0.801,
"step": 1017
},
{
"epoch": 0.7556833998329776,
"grad_norm": 0.3992551267147064,
"learning_rate": 2.9163878023312248e-05,
"loss": 0.8065,
"step": 1018
},
{
"epoch": 0.7564257214438155,
"grad_norm": 0.4441946744918823,
"learning_rate": 2.914234153351312e-05,
"loss": 0.8438,
"step": 1019
},
{
"epoch": 0.7571680430546535,
"grad_norm": 0.367318332195282,
"learning_rate": 2.912079163407179e-05,
"loss": 0.8188,
"step": 1020
},
{
"epoch": 0.7579103646654913,
"grad_norm": 0.47029224038124084,
"learning_rate": 2.909922835659684e-05,
"loss": 0.8777,
"step": 1021
},
{
"epoch": 0.7586526862763292,
"grad_norm": 0.351092129945755,
"learning_rate": 2.9077651732716466e-05,
"loss": 0.8203,
"step": 1022
},
{
"epoch": 0.7593950078871671,
"grad_norm": 0.40848031640052795,
"learning_rate": 2.9056061794078454e-05,
"loss": 0.9871,
"step": 1023
},
{
"epoch": 0.760137329498005,
"grad_norm": 0.3885229527950287,
"learning_rate": 2.9034458572350113e-05,
"loss": 0.9423,
"step": 1024
},
{
"epoch": 0.7608796511088429,
"grad_norm": 0.4087800979614258,
"learning_rate": 2.9012842099218227e-05,
"loss": 0.8842,
"step": 1025
},
{
"epoch": 0.7616219727196808,
"grad_norm": 0.42746204137802124,
"learning_rate": 2.8991212406389034e-05,
"loss": 0.8461,
"step": 1026
},
{
"epoch": 0.7623642943305187,
"grad_norm": 0.34393808245658875,
"learning_rate": 2.8969569525588145e-05,
"loss": 0.8116,
"step": 1027
},
{
"epoch": 0.7631066159413566,
"grad_norm": 0.36236944794654846,
"learning_rate": 2.8947913488560523e-05,
"loss": 0.9132,
"step": 1028
},
{
"epoch": 0.7638489375521945,
"grad_norm": 0.4048609435558319,
"learning_rate": 2.8926244327070444e-05,
"loss": 0.8502,
"step": 1029
},
{
"epoch": 0.7645912591630324,
"grad_norm": 0.39788907766342163,
"learning_rate": 2.8904562072901395e-05,
"loss": 0.8751,
"step": 1030
},
{
"epoch": 0.7653335807738703,
"grad_norm": 0.3588632345199585,
"learning_rate": 2.88828667578561e-05,
"loss": 0.7647,
"step": 1031
},
{
"epoch": 0.7660759023847081,
"grad_norm": 0.3718065917491913,
"learning_rate": 2.886115841375643e-05,
"loss": 0.88,
"step": 1032
},
{
"epoch": 0.7668182239955461,
"grad_norm": 0.3948284983634949,
"learning_rate": 2.8839437072443362e-05,
"loss": 0.88,
"step": 1033
},
{
"epoch": 0.767560545606384,
"grad_norm": 0.36646491289138794,
"learning_rate": 2.8817702765776947e-05,
"loss": 0.8244,
"step": 1034
},
{
"epoch": 0.7683028672172219,
"grad_norm": 0.41700711846351624,
"learning_rate": 2.8795955525636247e-05,
"loss": 0.8149,
"step": 1035
},
{
"epoch": 0.7690451888280597,
"grad_norm": 0.36176496744155884,
"learning_rate": 2.877419538391929e-05,
"loss": 0.7405,
"step": 1036
},
{
"epoch": 0.7697875104388977,
"grad_norm": 0.3812231123447418,
"learning_rate": 2.8752422372543037e-05,
"loss": 0.8124,
"step": 1037
},
{
"epoch": 0.7705298320497356,
"grad_norm": 0.3683227300643921,
"learning_rate": 2.8730636523443313e-05,
"loss": 0.8463,
"step": 1038
},
{
"epoch": 0.7712721536605734,
"grad_norm": 0.37682121992111206,
"learning_rate": 2.870883786857479e-05,
"loss": 0.8515,
"step": 1039
},
{
"epoch": 0.7720144752714113,
"grad_norm": 0.40752267837524414,
"learning_rate": 2.868702643991091e-05,
"loss": 0.9872,
"step": 1040
},
{
"epoch": 0.7727567968822492,
"grad_norm": 0.34598663449287415,
"learning_rate": 2.866520226944386e-05,
"loss": 0.8201,
"step": 1041
},
{
"epoch": 0.7734991184930872,
"grad_norm": 0.39901018142700195,
"learning_rate": 2.864336538918451e-05,
"loss": 0.7867,
"step": 1042
},
{
"epoch": 0.774241440103925,
"grad_norm": 0.40626272559165955,
"learning_rate": 2.8621515831162363e-05,
"loss": 0.8817,
"step": 1043
},
{
"epoch": 0.7749837617147629,
"grad_norm": 0.4141016900539398,
"learning_rate": 2.859965362742554e-05,
"loss": 0.8937,
"step": 1044
},
{
"epoch": 0.7757260833256008,
"grad_norm": 0.43257343769073486,
"learning_rate": 2.8577778810040687e-05,
"loss": 0.8147,
"step": 1045
},
{
"epoch": 0.7764684049364388,
"grad_norm": 0.40944743156433105,
"learning_rate": 2.855589141109297e-05,
"loss": 0.8747,
"step": 1046
},
{
"epoch": 0.7772107265472766,
"grad_norm": 0.4060775935649872,
"learning_rate": 2.8533991462686005e-05,
"loss": 0.9374,
"step": 1047
},
{
"epoch": 0.7779530481581145,
"grad_norm": 0.37721845507621765,
"learning_rate": 2.8512078996941805e-05,
"loss": 0.837,
"step": 1048
},
{
"epoch": 0.7786953697689524,
"grad_norm": 0.3883989155292511,
"learning_rate": 2.849015404600074e-05,
"loss": 0.8522,
"step": 1049
},
{
"epoch": 0.7794376913797902,
"grad_norm": 0.3714500367641449,
"learning_rate": 2.8468216642021524e-05,
"loss": 0.7972,
"step": 1050
},
{
"epoch": 0.7801800129906282,
"grad_norm": 0.37216463685035706,
"learning_rate": 2.84462668171811e-05,
"loss": 0.7885,
"step": 1051
},
{
"epoch": 0.7809223346014661,
"grad_norm": 0.343142032623291,
"learning_rate": 2.8424304603674645e-05,
"loss": 0.8198,
"step": 1052
},
{
"epoch": 0.781664656212304,
"grad_norm": 0.39523279666900635,
"learning_rate": 2.8402330033715512e-05,
"loss": 0.8359,
"step": 1053
},
{
"epoch": 0.7824069778231418,
"grad_norm": 0.425644189119339,
"learning_rate": 2.838034313953517e-05,
"loss": 0.873,
"step": 1054
},
{
"epoch": 0.7831492994339798,
"grad_norm": 0.37101006507873535,
"learning_rate": 2.835834395338317e-05,
"loss": 0.7968,
"step": 1055
},
{
"epoch": 0.7838916210448177,
"grad_norm": 0.5156154036521912,
"learning_rate": 2.833633250752708e-05,
"loss": 0.7642,
"step": 1056
},
{
"epoch": 0.7846339426556556,
"grad_norm": 0.42274123430252075,
"learning_rate": 2.8314308834252477e-05,
"loss": 0.9259,
"step": 1057
},
{
"epoch": 0.7853762642664934,
"grad_norm": 0.45442909002304077,
"learning_rate": 2.8292272965862834e-05,
"loss": 0.8184,
"step": 1058
},
{
"epoch": 0.7861185858773313,
"grad_norm": 0.38438424468040466,
"learning_rate": 2.8270224934679547e-05,
"loss": 0.8094,
"step": 1059
},
{
"epoch": 0.7868609074881693,
"grad_norm": 0.3720132112503052,
"learning_rate": 2.8248164773041838e-05,
"loss": 0.8544,
"step": 1060
},
{
"epoch": 0.7876032290990072,
"grad_norm": 0.3671931028366089,
"learning_rate": 2.822609251330671e-05,
"loss": 0.7667,
"step": 1061
},
{
"epoch": 0.788345550709845,
"grad_norm": 0.37383246421813965,
"learning_rate": 2.8204008187848938e-05,
"loss": 0.8313,
"step": 1062
},
{
"epoch": 0.7890878723206829,
"grad_norm": 0.37284618616104126,
"learning_rate": 2.8181911829060963e-05,
"loss": 0.7514,
"step": 1063
},
{
"epoch": 0.7898301939315209,
"grad_norm": 0.45464012026786804,
"learning_rate": 2.8159803469352902e-05,
"loss": 0.9212,
"step": 1064
},
{
"epoch": 0.7905725155423587,
"grad_norm": 0.3994920551776886,
"learning_rate": 2.813768314115246e-05,
"loss": 0.8621,
"step": 1065
},
{
"epoch": 0.7913148371531966,
"grad_norm": 0.4181472957134247,
"learning_rate": 2.8115550876904905e-05,
"loss": 0.8572,
"step": 1066
},
{
"epoch": 0.7920571587640345,
"grad_norm": 0.37849193811416626,
"learning_rate": 2.8093406709073002e-05,
"loss": 0.8802,
"step": 1067
},
{
"epoch": 0.7927994803748725,
"grad_norm": 0.3892970681190491,
"learning_rate": 2.8071250670136994e-05,
"loss": 0.9052,
"step": 1068
},
{
"epoch": 0.7935418019857103,
"grad_norm": 0.39315545558929443,
"learning_rate": 2.8049082792594515e-05,
"loss": 0.8753,
"step": 1069
},
{
"epoch": 0.7942841235965482,
"grad_norm": 0.35754016041755676,
"learning_rate": 2.8026903108960573e-05,
"loss": 0.8032,
"step": 1070
},
{
"epoch": 0.7950264452073861,
"grad_norm": 0.3976321518421173,
"learning_rate": 2.8004711651767492e-05,
"loss": 0.8564,
"step": 1071
},
{
"epoch": 0.795768766818224,
"grad_norm": 0.36166390776634216,
"learning_rate": 2.7982508453564866e-05,
"loss": 0.7671,
"step": 1072
},
{
"epoch": 0.7965110884290619,
"grad_norm": 0.36037370562553406,
"learning_rate": 2.796029354691952e-05,
"loss": 0.915,
"step": 1073
},
{
"epoch": 0.7972534100398998,
"grad_norm": 0.368314266204834,
"learning_rate": 2.793806696441543e-05,
"loss": 0.8892,
"step": 1074
},
{
"epoch": 0.7979957316507377,
"grad_norm": 0.4094344973564148,
"learning_rate": 2.791582873865371e-05,
"loss": 0.8191,
"step": 1075
},
{
"epoch": 0.7987380532615755,
"grad_norm": 0.38346514105796814,
"learning_rate": 2.7893578902252563e-05,
"loss": 0.8655,
"step": 1076
},
{
"epoch": 0.7994803748724135,
"grad_norm": 0.3845061659812927,
"learning_rate": 2.7871317487847195e-05,
"loss": 0.7341,
"step": 1077
},
{
"epoch": 0.8002226964832514,
"grad_norm": 0.37997809052467346,
"learning_rate": 2.784904452808982e-05,
"loss": 0.8391,
"step": 1078
},
{
"epoch": 0.8009650180940893,
"grad_norm": 0.38898172974586487,
"learning_rate": 2.782676005564958e-05,
"loss": 0.9057,
"step": 1079
},
{
"epoch": 0.8017073397049271,
"grad_norm": 0.3827812075614929,
"learning_rate": 2.7804464103212492e-05,
"loss": 0.8068,
"step": 1080
},
{
"epoch": 0.802449661315765,
"grad_norm": 0.36027270555496216,
"learning_rate": 2.7782156703481428e-05,
"loss": 0.8588,
"step": 1081
},
{
"epoch": 0.803191982926603,
"grad_norm": 0.4327481687068939,
"learning_rate": 2.775983788917603e-05,
"loss": 0.8632,
"step": 1082
},
{
"epoch": 0.8039343045374409,
"grad_norm": 0.3570108413696289,
"learning_rate": 2.7737507693032704e-05,
"loss": 0.8142,
"step": 1083
},
{
"epoch": 0.8046766261482787,
"grad_norm": 0.3824721872806549,
"learning_rate": 2.7715166147804534e-05,
"loss": 0.8073,
"step": 1084
},
{
"epoch": 0.8054189477591166,
"grad_norm": 0.3799905478954315,
"learning_rate": 2.769281328626126e-05,
"loss": 0.8286,
"step": 1085
},
{
"epoch": 0.8061612693699546,
"grad_norm": 0.39156484603881836,
"learning_rate": 2.7670449141189224e-05,
"loss": 0.8259,
"step": 1086
},
{
"epoch": 0.8069035909807925,
"grad_norm": 0.37677666544914246,
"learning_rate": 2.7648073745391294e-05,
"loss": 0.7925,
"step": 1087
},
{
"epoch": 0.8076459125916303,
"grad_norm": 0.3373050093650818,
"learning_rate": 2.7625687131686874e-05,
"loss": 0.855,
"step": 1088
},
{
"epoch": 0.8083882342024682,
"grad_norm": 0.397009938955307,
"learning_rate": 2.76032893329118e-05,
"loss": 0.8977,
"step": 1089
},
{
"epoch": 0.8091305558133061,
"grad_norm": 0.37769317626953125,
"learning_rate": 2.7580880381918317e-05,
"loss": 0.95,
"step": 1090
},
{
"epoch": 0.809872877424144,
"grad_norm": 0.4025100767612457,
"learning_rate": 2.755846031157504e-05,
"loss": 0.7946,
"step": 1091
},
{
"epoch": 0.8106151990349819,
"grad_norm": 0.4239715337753296,
"learning_rate": 2.753602915476687e-05,
"loss": 0.8794,
"step": 1092
},
{
"epoch": 0.8113575206458198,
"grad_norm": 0.40224555134773254,
"learning_rate": 2.7513586944394992e-05,
"loss": 0.9042,
"step": 1093
},
{
"epoch": 0.8120998422566577,
"grad_norm": 0.32581862807273865,
"learning_rate": 2.7491133713376795e-05,
"loss": 0.8939,
"step": 1094
},
{
"epoch": 0.8128421638674956,
"grad_norm": 0.4022167921066284,
"learning_rate": 2.7468669494645827e-05,
"loss": 0.7777,
"step": 1095
},
{
"epoch": 0.8135844854783335,
"grad_norm": 0.4196806252002716,
"learning_rate": 2.7446194321151768e-05,
"loss": 0.8122,
"step": 1096
},
{
"epoch": 0.8143268070891714,
"grad_norm": 0.3758104145526886,
"learning_rate": 2.7423708225860344e-05,
"loss": 0.7338,
"step": 1097
},
{
"epoch": 0.8150691287000092,
"grad_norm": 0.37303996086120605,
"learning_rate": 2.7401211241753315e-05,
"loss": 0.8801,
"step": 1098
},
{
"epoch": 0.8158114503108471,
"grad_norm": 0.392053484916687,
"learning_rate": 2.7378703401828416e-05,
"loss": 0.6883,
"step": 1099
},
{
"epoch": 0.8165537719216851,
"grad_norm": 0.37932631373405457,
"learning_rate": 2.7356184739099293e-05,
"loss": 0.7823,
"step": 1100
},
{
"epoch": 0.817296093532523,
"grad_norm": 0.3743017315864563,
"learning_rate": 2.7333655286595474e-05,
"loss": 0.8226,
"step": 1101
},
{
"epoch": 0.8180384151433608,
"grad_norm": 0.4144737720489502,
"learning_rate": 2.731111507736232e-05,
"loss": 0.8217,
"step": 1102
},
{
"epoch": 0.8187807367541987,
"grad_norm": 0.3302975296974182,
"learning_rate": 2.728856414446094e-05,
"loss": 0.8851,
"step": 1103
},
{
"epoch": 0.8195230583650367,
"grad_norm": 0.37614309787750244,
"learning_rate": 2.7266002520968212e-05,
"loss": 0.8134,
"step": 1104
},
{
"epoch": 0.8202653799758746,
"grad_norm": 0.42986634373664856,
"learning_rate": 2.7243430239976668e-05,
"loss": 0.8935,
"step": 1105
},
{
"epoch": 0.8210077015867124,
"grad_norm": 0.4134221076965332,
"learning_rate": 2.722084733459448e-05,
"loss": 0.8741,
"step": 1106
},
{
"epoch": 0.8217500231975503,
"grad_norm": 0.4643120765686035,
"learning_rate": 2.7198253837945414e-05,
"loss": 0.8715,
"step": 1107
},
{
"epoch": 0.8224923448083883,
"grad_norm": 0.4156094789505005,
"learning_rate": 2.7175649783168742e-05,
"loss": 0.9386,
"step": 1108
},
{
"epoch": 0.8232346664192262,
"grad_norm": 0.39130133390426636,
"learning_rate": 2.715303520341925e-05,
"loss": 0.9361,
"step": 1109
},
{
"epoch": 0.823976988030064,
"grad_norm": 0.38539817929267883,
"learning_rate": 2.7130410131867147e-05,
"loss": 0.8998,
"step": 1110
},
{
"epoch": 0.8247193096409019,
"grad_norm": 0.35698390007019043,
"learning_rate": 2.710777460169804e-05,
"loss": 0.761,
"step": 1111
},
{
"epoch": 0.8254616312517398,
"grad_norm": 0.33499324321746826,
"learning_rate": 2.708512864611287e-05,
"loss": 0.8085,
"step": 1112
},
{
"epoch": 0.8262039528625778,
"grad_norm": 0.44463443756103516,
"learning_rate": 2.706247229832787e-05,
"loss": 0.7751,
"step": 1113
},
{
"epoch": 0.8269462744734156,
"grad_norm": 0.4300941526889801,
"learning_rate": 2.703980559157452e-05,
"loss": 0.7756,
"step": 1114
},
{
"epoch": 0.8276885960842535,
"grad_norm": 0.3815639019012451,
"learning_rate": 2.70171285590995e-05,
"loss": 0.8001,
"step": 1115
},
{
"epoch": 0.8284309176950914,
"grad_norm": 0.38274961709976196,
"learning_rate": 2.6994441234164605e-05,
"loss": 0.7243,
"step": 1116
},
{
"epoch": 0.8291732393059293,
"grad_norm": 0.39203885197639465,
"learning_rate": 2.6971743650046774e-05,
"loss": 0.8777,
"step": 1117
},
{
"epoch": 0.8299155609167672,
"grad_norm": 0.38965150713920593,
"learning_rate": 2.694903584003795e-05,
"loss": 0.8189,
"step": 1118
},
{
"epoch": 0.8306578825276051,
"grad_norm": 0.39319199323654175,
"learning_rate": 2.69263178374451e-05,
"loss": 0.8219,
"step": 1119
},
{
"epoch": 0.831400204138443,
"grad_norm": 0.3590227961540222,
"learning_rate": 2.690358967559014e-05,
"loss": 0.7929,
"step": 1120
},
{
"epoch": 0.8321425257492808,
"grad_norm": 0.4018113911151886,
"learning_rate": 2.688085138780987e-05,
"loss": 0.7804,
"step": 1121
},
{
"epoch": 0.8328848473601188,
"grad_norm": 0.3763115406036377,
"learning_rate": 2.6858103007455968e-05,
"loss": 0.9118,
"step": 1122
},
{
"epoch": 0.8336271689709567,
"grad_norm": 0.4066718816757202,
"learning_rate": 2.6835344567894887e-05,
"loss": 0.7809,
"step": 1123
},
{
"epoch": 0.8343694905817945,
"grad_norm": 0.4564039707183838,
"learning_rate": 2.6812576102507852e-05,
"loss": 0.9001,
"step": 1124
},
{
"epoch": 0.8351118121926324,
"grad_norm": 0.38714709877967834,
"learning_rate": 2.6789797644690804e-05,
"loss": 0.7852,
"step": 1125
},
{
"epoch": 0.8358541338034704,
"grad_norm": 0.38495564460754395,
"learning_rate": 2.6767009227854307e-05,
"loss": 0.7465,
"step": 1126
},
{
"epoch": 0.8365964554143083,
"grad_norm": 0.35638296604156494,
"learning_rate": 2.6744210885423568e-05,
"loss": 0.876,
"step": 1127
},
{
"epoch": 0.8373387770251461,
"grad_norm": 0.40398475527763367,
"learning_rate": 2.672140265083833e-05,
"loss": 0.8088,
"step": 1128
},
{
"epoch": 0.838081098635984,
"grad_norm": 0.3809717297554016,
"learning_rate": 2.6698584557552853e-05,
"loss": 0.7492,
"step": 1129
},
{
"epoch": 0.8388234202468219,
"grad_norm": 0.34428274631500244,
"learning_rate": 2.6675756639035865e-05,
"loss": 0.7429,
"step": 1130
},
{
"epoch": 0.8395657418576599,
"grad_norm": 0.4311201572418213,
"learning_rate": 2.6652918928770484e-05,
"loss": 0.8018,
"step": 1131
},
{
"epoch": 0.8403080634684977,
"grad_norm": 0.36646634340286255,
"learning_rate": 2.663007146025421e-05,
"loss": 0.8448,
"step": 1132
},
{
"epoch": 0.8410503850793356,
"grad_norm": 0.3897891342639923,
"learning_rate": 2.6607214266998846e-05,
"loss": 0.7464,
"step": 1133
},
{
"epoch": 0.8417927066901735,
"grad_norm": 0.38725921511650085,
"learning_rate": 2.658434738253046e-05,
"loss": 0.7544,
"step": 1134
},
{
"epoch": 0.8425350283010115,
"grad_norm": 0.4934210479259491,
"learning_rate": 2.6561470840389354e-05,
"loss": 0.7775,
"step": 1135
},
{
"epoch": 0.8432773499118493,
"grad_norm": 0.40992632508277893,
"learning_rate": 2.6538584674129958e-05,
"loss": 0.8458,
"step": 1136
},
{
"epoch": 0.8440196715226872,
"grad_norm": 0.39897727966308594,
"learning_rate": 2.651568891732084e-05,
"loss": 0.8362,
"step": 1137
},
{
"epoch": 0.8447619931335251,
"grad_norm": 0.38299280405044556,
"learning_rate": 2.6492783603544648e-05,
"loss": 0.8096,
"step": 1138
},
{
"epoch": 0.845504314744363,
"grad_norm": 0.37341177463531494,
"learning_rate": 2.6469868766398024e-05,
"loss": 0.7951,
"step": 1139
},
{
"epoch": 0.8462466363552009,
"grad_norm": 0.36628684401512146,
"learning_rate": 2.644694443949159e-05,
"loss": 0.9074,
"step": 1140
},
{
"epoch": 0.8469889579660388,
"grad_norm": 0.33878034353256226,
"learning_rate": 2.6424010656449877e-05,
"loss": 0.7807,
"step": 1141
},
{
"epoch": 0.8477312795768767,
"grad_norm": 0.3553994297981262,
"learning_rate": 2.6401067450911305e-05,
"loss": 0.7631,
"step": 1142
},
{
"epoch": 0.8484736011877145,
"grad_norm": 0.35956519842147827,
"learning_rate": 2.6378114856528108e-05,
"loss": 0.7291,
"step": 1143
},
{
"epoch": 0.8492159227985525,
"grad_norm": 0.44231006503105164,
"learning_rate": 2.6355152906966268e-05,
"loss": 0.9642,
"step": 1144
},
{
"epoch": 0.8499582444093904,
"grad_norm": 0.3414170444011688,
"learning_rate": 2.6332181635905523e-05,
"loss": 0.8088,
"step": 1145
},
{
"epoch": 0.8507005660202283,
"grad_norm": 0.3992343246936798,
"learning_rate": 2.630920107703927e-05,
"loss": 0.8089,
"step": 1146
},
{
"epoch": 0.8514428876310661,
"grad_norm": 0.44197651743888855,
"learning_rate": 2.6286211264074518e-05,
"loss": 0.7817,
"step": 1147
},
{
"epoch": 0.8521852092419041,
"grad_norm": 0.3761456310749054,
"learning_rate": 2.626321223073186e-05,
"loss": 0.844,
"step": 1148
},
{
"epoch": 0.852927530852742,
"grad_norm": 0.3781460225582123,
"learning_rate": 2.6240204010745417e-05,
"loss": 0.8404,
"step": 1149
},
{
"epoch": 0.8536698524635798,
"grad_norm": 0.41302886605262756,
"learning_rate": 2.6217186637862767e-05,
"loss": 0.8341,
"step": 1150
},
{
"epoch": 0.8544121740744177,
"grad_norm": 0.3607892394065857,
"learning_rate": 2.6194160145844943e-05,
"loss": 0.781,
"step": 1151
},
{
"epoch": 0.8551544956852556,
"grad_norm": 0.3791287839412689,
"learning_rate": 2.6171124568466328e-05,
"loss": 0.7664,
"step": 1152
},
{
"epoch": 0.8558968172960936,
"grad_norm": 0.3444710969924927,
"learning_rate": 2.6148079939514634e-05,
"loss": 0.7775,
"step": 1153
},
{
"epoch": 0.8566391389069314,
"grad_norm": 0.35780924558639526,
"learning_rate": 2.612502629279086e-05,
"loss": 0.9237,
"step": 1154
},
{
"epoch": 0.8573814605177693,
"grad_norm": 0.4052909314632416,
"learning_rate": 2.6101963662109233e-05,
"loss": 0.9076,
"step": 1155
},
{
"epoch": 0.8581237821286072,
"grad_norm": 0.3919762372970581,
"learning_rate": 2.607889208129714e-05,
"loss": 0.7643,
"step": 1156
},
{
"epoch": 0.8588661037394452,
"grad_norm": 0.36909744143486023,
"learning_rate": 2.6055811584195116e-05,
"loss": 0.8057,
"step": 1157
},
{
"epoch": 0.859608425350283,
"grad_norm": 0.37541574239730835,
"learning_rate": 2.6032722204656752e-05,
"loss": 0.7,
"step": 1158
},
{
"epoch": 0.8603507469611209,
"grad_norm": 0.3770912289619446,
"learning_rate": 2.600962397654869e-05,
"loss": 0.7992,
"step": 1159
},
{
"epoch": 0.8610930685719588,
"grad_norm": 0.38514405488967896,
"learning_rate": 2.5986516933750547e-05,
"loss": 0.8241,
"step": 1160
},
{
"epoch": 0.8618353901827966,
"grad_norm": 0.46653178334236145,
"learning_rate": 2.596340111015485e-05,
"loss": 0.9005,
"step": 1161
},
{
"epoch": 0.8625777117936346,
"grad_norm": 0.39433014392852783,
"learning_rate": 2.5940276539667023e-05,
"loss": 0.7639,
"step": 1162
},
{
"epoch": 0.8633200334044725,
"grad_norm": 0.39670971035957336,
"learning_rate": 2.5917143256205315e-05,
"loss": 0.8006,
"step": 1163
},
{
"epoch": 0.8640623550153104,
"grad_norm": 0.3866695463657379,
"learning_rate": 2.5894001293700762e-05,
"loss": 0.9459,
"step": 1164
},
{
"epoch": 0.8648046766261482,
"grad_norm": 0.3981380760669708,
"learning_rate": 2.587085068609711e-05,
"loss": 0.7851,
"step": 1165
},
{
"epoch": 0.8655469982369862,
"grad_norm": 0.4058111608028412,
"learning_rate": 2.5847691467350803e-05,
"loss": 0.8604,
"step": 1166
},
{
"epoch": 0.8662893198478241,
"grad_norm": 0.42619138956069946,
"learning_rate": 2.582452367143091e-05,
"loss": 0.9086,
"step": 1167
},
{
"epoch": 0.867031641458662,
"grad_norm": 0.3281800448894501,
"learning_rate": 2.5801347332319094e-05,
"loss": 0.7461,
"step": 1168
},
{
"epoch": 0.8677739630694998,
"grad_norm": 0.380833238363266,
"learning_rate": 2.5778162484009523e-05,
"loss": 0.8507,
"step": 1169
},
{
"epoch": 0.8685162846803377,
"grad_norm": 0.404496431350708,
"learning_rate": 2.575496916050886e-05,
"loss": 0.7522,
"step": 1170
},
{
"epoch": 0.8692586062911757,
"grad_norm": 0.3586255609989166,
"learning_rate": 2.5731767395836195e-05,
"loss": 0.7842,
"step": 1171
},
{
"epoch": 0.8700009279020136,
"grad_norm": 0.40008819103240967,
"learning_rate": 2.5708557224023015e-05,
"loss": 0.7666,
"step": 1172
},
{
"epoch": 0.8707432495128514,
"grad_norm": 0.4241710901260376,
"learning_rate": 2.5685338679113118e-05,
"loss": 0.8025,
"step": 1173
},
{
"epoch": 0.8714855711236893,
"grad_norm": 0.377708375453949,
"learning_rate": 2.5662111795162597e-05,
"loss": 0.8052,
"step": 1174
},
{
"epoch": 0.8722278927345273,
"grad_norm": 0.5053303241729736,
"learning_rate": 2.5638876606239756e-05,
"loss": 0.8903,
"step": 1175
},
{
"epoch": 0.8729702143453651,
"grad_norm": 0.3756983280181885,
"learning_rate": 2.561563314642511e-05,
"loss": 0.8037,
"step": 1176
},
{
"epoch": 0.873712535956203,
"grad_norm": 0.3644946217536926,
"learning_rate": 2.5592381449811284e-05,
"loss": 0.7777,
"step": 1177
},
{
"epoch": 0.8744548575670409,
"grad_norm": 0.3979732394218445,
"learning_rate": 2.5569121550502992e-05,
"loss": 0.7259,
"step": 1178
},
{
"epoch": 0.8751971791778789,
"grad_norm": 0.365654319524765,
"learning_rate": 2.5545853482616975e-05,
"loss": 0.8369,
"step": 1179
},
{
"epoch": 0.8759395007887167,
"grad_norm": 0.37514373660087585,
"learning_rate": 2.5522577280281958e-05,
"loss": 0.7921,
"step": 1180
},
{
"epoch": 0.8766818223995546,
"grad_norm": 0.31953245401382446,
"learning_rate": 2.5499292977638607e-05,
"loss": 0.7783,
"step": 1181
},
{
"epoch": 0.8774241440103925,
"grad_norm": 0.35402050614356995,
"learning_rate": 2.5476000608839454e-05,
"loss": 0.7105,
"step": 1182
},
{
"epoch": 0.8781664656212304,
"grad_norm": 0.3901765048503876,
"learning_rate": 2.5452700208048864e-05,
"loss": 0.8499,
"step": 1183
},
{
"epoch": 0.8789087872320683,
"grad_norm": 0.39869165420532227,
"learning_rate": 2.5429391809442993e-05,
"loss": 0.8133,
"step": 1184
},
{
"epoch": 0.8796511088429062,
"grad_norm": 0.42267906665802,
"learning_rate": 2.5406075447209725e-05,
"loss": 0.8178,
"step": 1185
},
{
"epoch": 0.8803934304537441,
"grad_norm": 0.40030765533447266,
"learning_rate": 2.5382751155548615e-05,
"loss": 0.795,
"step": 1186
},
{
"epoch": 0.881135752064582,
"grad_norm": 0.34904223680496216,
"learning_rate": 2.535941896867086e-05,
"loss": 0.826,
"step": 1187
},
{
"epoch": 0.8818780736754199,
"grad_norm": 0.34118303656578064,
"learning_rate": 2.533607892079923e-05,
"loss": 0.7497,
"step": 1188
},
{
"epoch": 0.8826203952862578,
"grad_norm": 0.37169280648231506,
"learning_rate": 2.5312731046168026e-05,
"loss": 0.7827,
"step": 1189
},
{
"epoch": 0.8833627168970957,
"grad_norm": 0.4534485936164856,
"learning_rate": 2.5289375379023043e-05,
"loss": 0.842,
"step": 1190
},
{
"epoch": 0.8841050385079335,
"grad_norm": 0.38551488518714905,
"learning_rate": 2.526601195362147e-05,
"loss": 0.7868,
"step": 1191
},
{
"epoch": 0.8848473601187714,
"grad_norm": 0.3991512656211853,
"learning_rate": 2.5242640804231922e-05,
"loss": 0.7889,
"step": 1192
},
{
"epoch": 0.8855896817296094,
"grad_norm": 0.36497434973716736,
"learning_rate": 2.5219261965134315e-05,
"loss": 0.7602,
"step": 1193
},
{
"epoch": 0.8863320033404473,
"grad_norm": 0.38198092579841614,
"learning_rate": 2.519587547061985e-05,
"loss": 0.7992,
"step": 1194
},
{
"epoch": 0.8870743249512851,
"grad_norm": 0.3975292444229126,
"learning_rate": 2.517248135499095e-05,
"loss": 0.7652,
"step": 1195
},
{
"epoch": 0.887816646562123,
"grad_norm": 0.3680102229118347,
"learning_rate": 2.5149079652561225e-05,
"loss": 0.8046,
"step": 1196
},
{
"epoch": 0.888558968172961,
"grad_norm": 0.38051730394363403,
"learning_rate": 2.512567039765542e-05,
"loss": 0.8289,
"step": 1197
},
{
"epoch": 0.8893012897837989,
"grad_norm": 0.3820357620716095,
"learning_rate": 2.5102253624609332e-05,
"loss": 0.7923,
"step": 1198
},
{
"epoch": 0.8900436113946367,
"grad_norm": 0.40367165207862854,
"learning_rate": 2.507882936776981e-05,
"loss": 0.7872,
"step": 1199
},
{
"epoch": 0.8907859330054746,
"grad_norm": 0.3560531735420227,
"learning_rate": 2.505539766149468e-05,
"loss": 0.8427,
"step": 1200
},
{
"epoch": 0.8915282546163125,
"grad_norm": 0.34239915013313293,
"learning_rate": 2.503195854015267e-05,
"loss": 0.799,
"step": 1201
},
{
"epoch": 0.8922705762271504,
"grad_norm": 0.3868923485279083,
"learning_rate": 2.500851203812341e-05,
"loss": 0.8314,
"step": 1202
},
{
"epoch": 0.8930128978379883,
"grad_norm": 0.4198054373264313,
"learning_rate": 2.4985058189797347e-05,
"loss": 0.8763,
"step": 1203
},
{
"epoch": 0.8937552194488262,
"grad_norm": 0.4096454083919525,
"learning_rate": 2.4961597029575694e-05,
"loss": 0.7835,
"step": 1204
},
{
"epoch": 0.8944975410596641,
"grad_norm": 0.407387912273407,
"learning_rate": 2.493812859187041e-05,
"loss": 0.8137,
"step": 1205
},
{
"epoch": 0.895239862670502,
"grad_norm": 0.38549163937568665,
"learning_rate": 2.4914652911104104e-05,
"loss": 0.788,
"step": 1206
},
{
"epoch": 0.8959821842813399,
"grad_norm": 0.37403562664985657,
"learning_rate": 2.4891170021710025e-05,
"loss": 0.819,
"step": 1207
},
{
"epoch": 0.8967245058921778,
"grad_norm": 0.33642685413360596,
"learning_rate": 2.4867679958131996e-05,
"loss": 0.7957,
"step": 1208
},
{
"epoch": 0.8974668275030157,
"grad_norm": 0.3692936897277832,
"learning_rate": 2.4844182754824343e-05,
"loss": 0.8125,
"step": 1209
},
{
"epoch": 0.8982091491138535,
"grad_norm": 0.4189552664756775,
"learning_rate": 2.4820678446251893e-05,
"loss": 0.8238,
"step": 1210
},
{
"epoch": 0.8989514707246915,
"grad_norm": 0.411651074886322,
"learning_rate": 2.4797167066889873e-05,
"loss": 0.8231,
"step": 1211
},
{
"epoch": 0.8996937923355294,
"grad_norm": 0.4330805540084839,
"learning_rate": 2.477364865122389e-05,
"loss": 0.8004,
"step": 1212
},
{
"epoch": 0.9004361139463672,
"grad_norm": 0.4067060351371765,
"learning_rate": 2.4750123233749874e-05,
"loss": 0.8056,
"step": 1213
},
{
"epoch": 0.9011784355572051,
"grad_norm": 0.4101119339466095,
"learning_rate": 2.4726590848974015e-05,
"loss": 0.842,
"step": 1214
},
{
"epoch": 0.9019207571680431,
"grad_norm": 0.4217219352722168,
"learning_rate": 2.470305153141273e-05,
"loss": 0.869,
"step": 1215
},
{
"epoch": 0.902663078778881,
"grad_norm": 0.43955573439598083,
"learning_rate": 2.4679505315592608e-05,
"loss": 0.8974,
"step": 1216
},
{
"epoch": 0.9034054003897188,
"grad_norm": 0.3962491750717163,
"learning_rate": 2.4655952236050347e-05,
"loss": 0.8321,
"step": 1217
},
{
"epoch": 0.9041477220005567,
"grad_norm": 0.39119595289230347,
"learning_rate": 2.463239232733272e-05,
"loss": 0.816,
"step": 1218
},
{
"epoch": 0.9048900436113947,
"grad_norm": 0.37983494997024536,
"learning_rate": 2.4608825623996513e-05,
"loss": 0.9081,
"step": 1219
},
{
"epoch": 0.9056323652222326,
"grad_norm": 0.36598238348960876,
"learning_rate": 2.4585252160608474e-05,
"loss": 0.885,
"step": 1220
},
{
"epoch": 0.9063746868330704,
"grad_norm": 0.41146743297576904,
"learning_rate": 2.4561671971745284e-05,
"loss": 0.8678,
"step": 1221
},
{
"epoch": 0.9071170084439083,
"grad_norm": 0.40173497796058655,
"learning_rate": 2.453808509199346e-05,
"loss": 0.7682,
"step": 1222
},
{
"epoch": 0.9078593300547462,
"grad_norm": 0.3730056583881378,
"learning_rate": 2.4514491555949356e-05,
"loss": 0.7668,
"step": 1223
},
{
"epoch": 0.9086016516655842,
"grad_norm": 0.38427025079727173,
"learning_rate": 2.4490891398219084e-05,
"loss": 0.7851,
"step": 1224
},
{
"epoch": 0.909343973276422,
"grad_norm": 0.4028315842151642,
"learning_rate": 2.446728465341846e-05,
"loss": 0.8517,
"step": 1225
},
{
"epoch": 0.9100862948872599,
"grad_norm": 0.3713853657245636,
"learning_rate": 2.444367135617298e-05,
"loss": 0.754,
"step": 1226
},
{
"epoch": 0.9108286164980978,
"grad_norm": 0.3742387592792511,
"learning_rate": 2.4420051541117725e-05,
"loss": 0.916,
"step": 1227
},
{
"epoch": 0.9115709381089357,
"grad_norm": 0.38887640833854675,
"learning_rate": 2.4396425242897354e-05,
"loss": 0.8266,
"step": 1228
},
{
"epoch": 0.9123132597197736,
"grad_norm": 0.43293431401252747,
"learning_rate": 2.4372792496166032e-05,
"loss": 0.8804,
"step": 1229
},
{
"epoch": 0.9130555813306115,
"grad_norm": 0.42462730407714844,
"learning_rate": 2.4349153335587388e-05,
"loss": 0.7723,
"step": 1230
},
{
"epoch": 0.9137979029414494,
"grad_norm": 0.38612687587738037,
"learning_rate": 2.432550779583445e-05,
"loss": 0.8027,
"step": 1231
},
{
"epoch": 0.9145402245522872,
"grad_norm": 0.42818307876586914,
"learning_rate": 2.43018559115896e-05,
"loss": 0.9014,
"step": 1232
},
{
"epoch": 0.9152825461631252,
"grad_norm": 0.34975147247314453,
"learning_rate": 2.4278197717544533e-05,
"loss": 0.7912,
"step": 1233
},
{
"epoch": 0.9160248677739631,
"grad_norm": 0.44405031204223633,
"learning_rate": 2.4254533248400204e-05,
"loss": 0.8931,
"step": 1234
},
{
"epoch": 0.916767189384801,
"grad_norm": 0.4183354377746582,
"learning_rate": 2.423086253886675e-05,
"loss": 0.8008,
"step": 1235
},
{
"epoch": 0.9175095109956388,
"grad_norm": 0.40280815958976746,
"learning_rate": 2.420718562366349e-05,
"loss": 0.817,
"step": 1236
},
{
"epoch": 0.9182518326064768,
"grad_norm": 0.45507195591926575,
"learning_rate": 2.4183502537518826e-05,
"loss": 0.8518,
"step": 1237
},
{
"epoch": 0.9189941542173147,
"grad_norm": 0.4256339371204376,
"learning_rate": 2.4159813315170217e-05,
"loss": 0.8463,
"step": 1238
},
{
"epoch": 0.9197364758281525,
"grad_norm": 0.49776536226272583,
"learning_rate": 2.413611799136412e-05,
"loss": 0.8524,
"step": 1239
},
{
"epoch": 0.9204787974389904,
"grad_norm": 0.36638113856315613,
"learning_rate": 2.4112416600855944e-05,
"loss": 0.9285,
"step": 1240
},
{
"epoch": 0.9212211190498283,
"grad_norm": 0.4256168007850647,
"learning_rate": 2.4088709178409997e-05,
"loss": 0.9358,
"step": 1241
},
{
"epoch": 0.9219634406606663,
"grad_norm": 0.35663262009620667,
"learning_rate": 2.4064995758799437e-05,
"loss": 0.7417,
"step": 1242
},
{
"epoch": 0.9227057622715041,
"grad_norm": 0.4273064136505127,
"learning_rate": 2.404127637680621e-05,
"loss": 0.8254,
"step": 1243
},
{
"epoch": 0.923448083882342,
"grad_norm": 0.3672143220901489,
"learning_rate": 2.4017551067221014e-05,
"loss": 0.8111,
"step": 1244
},
{
"epoch": 0.9241904054931799,
"grad_norm": 0.38435935974121094,
"learning_rate": 2.3993819864843234e-05,
"loss": 0.88,
"step": 1245
},
{
"epoch": 0.9249327271040179,
"grad_norm": 0.37791430950164795,
"learning_rate": 2.3970082804480912e-05,
"loss": 0.8167,
"step": 1246
},
{
"epoch": 0.9256750487148557,
"grad_norm": 0.4131454825401306,
"learning_rate": 2.3946339920950675e-05,
"loss": 0.8046,
"step": 1247
},
{
"epoch": 0.9264173703256936,
"grad_norm": 0.42721277475357056,
"learning_rate": 2.392259124907768e-05,
"loss": 0.8612,
"step": 1248
},
{
"epoch": 0.9271596919365315,
"grad_norm": 0.39011621475219727,
"learning_rate": 2.3898836823695595e-05,
"loss": 0.8757,
"step": 1249
},
{
"epoch": 0.9279020135473695,
"grad_norm": 0.41248777508735657,
"learning_rate": 2.3875076679646503e-05,
"loss": 0.9099,
"step": 1250
},
{
"epoch": 0.9286443351582073,
"grad_norm": 0.4113259017467499,
"learning_rate": 2.38513108517809e-05,
"loss": 0.8793,
"step": 1251
},
{
"epoch": 0.9293866567690452,
"grad_norm": 0.4344083368778229,
"learning_rate": 2.3827539374957615e-05,
"loss": 0.7458,
"step": 1252
},
{
"epoch": 0.9301289783798831,
"grad_norm": 0.3516993522644043,
"learning_rate": 2.380376228404373e-05,
"loss": 0.796,
"step": 1253
},
{
"epoch": 0.9308712999907209,
"grad_norm": 0.3689666986465454,
"learning_rate": 2.3779979613914605e-05,
"loss": 0.8158,
"step": 1254
},
{
"epoch": 0.9316136216015589,
"grad_norm": 0.35044315457344055,
"learning_rate": 2.375619139945376e-05,
"loss": 0.8159,
"step": 1255
},
{
"epoch": 0.9323559432123968,
"grad_norm": 0.39020657539367676,
"learning_rate": 2.3732397675552847e-05,
"loss": 0.7833,
"step": 1256
},
{
"epoch": 0.9330982648232347,
"grad_norm": 0.3595007359981537,
"learning_rate": 2.3708598477111616e-05,
"loss": 0.8122,
"step": 1257
},
{
"epoch": 0.9338405864340725,
"grad_norm": 0.37129589915275574,
"learning_rate": 2.3684793839037825e-05,
"loss": 0.76,
"step": 1258
},
{
"epoch": 0.9345829080449105,
"grad_norm": 0.3745594918727875,
"learning_rate": 2.3660983796247226e-05,
"loss": 0.7385,
"step": 1259
},
{
"epoch": 0.9353252296557484,
"grad_norm": 0.38395509123802185,
"learning_rate": 2.3637168383663493e-05,
"loss": 0.7137,
"step": 1260
},
{
"epoch": 0.9360675512665863,
"grad_norm": 0.37863481044769287,
"learning_rate": 2.3613347636218164e-05,
"loss": 0.8338,
"step": 1261
},
{
"epoch": 0.9368098728774241,
"grad_norm": 0.41848132014274597,
"learning_rate": 2.358952158885063e-05,
"loss": 0.7827,
"step": 1262
},
{
"epoch": 0.937552194488262,
"grad_norm": 0.37910816073417664,
"learning_rate": 2.356569027650803e-05,
"loss": 0.7945,
"step": 1263
},
{
"epoch": 0.9382945160991,
"grad_norm": 0.37581756711006165,
"learning_rate": 2.354185373414524e-05,
"loss": 0.8229,
"step": 1264
},
{
"epoch": 0.9390368377099378,
"grad_norm": 0.3768438696861267,
"learning_rate": 2.3518011996724805e-05,
"loss": 0.8108,
"step": 1265
},
{
"epoch": 0.9397791593207757,
"grad_norm": 0.4513545334339142,
"learning_rate": 2.349416509921688e-05,
"loss": 0.8078,
"step": 1266
},
{
"epoch": 0.9405214809316136,
"grad_norm": 0.37917861342430115,
"learning_rate": 2.34703130765992e-05,
"loss": 0.7238,
"step": 1267
},
{
"epoch": 0.9412638025424516,
"grad_norm": 0.4270203709602356,
"learning_rate": 2.344645596385701e-05,
"loss": 0.9255,
"step": 1268
},
{
"epoch": 0.9420061241532894,
"grad_norm": 0.38025596737861633,
"learning_rate": 2.3422593795983026e-05,
"loss": 0.7485,
"step": 1269
},
{
"epoch": 0.9427484457641273,
"grad_norm": 0.35476383566856384,
"learning_rate": 2.3398726607977388e-05,
"loss": 0.7331,
"step": 1270
},
{
"epoch": 0.9434907673749652,
"grad_norm": 0.40570273995399475,
"learning_rate": 2.3374854434847572e-05,
"loss": 0.8989,
"step": 1271
},
{
"epoch": 0.944233088985803,
"grad_norm": 0.39702484011650085,
"learning_rate": 2.335097731160839e-05,
"loss": 0.7746,
"step": 1272
},
{
"epoch": 0.944975410596641,
"grad_norm": 0.4499233067035675,
"learning_rate": 2.3327095273281904e-05,
"loss": 0.9053,
"step": 1273
},
{
"epoch": 0.9457177322074789,
"grad_norm": 0.40540260076522827,
"learning_rate": 2.3303208354897392e-05,
"loss": 0.9249,
"step": 1274
},
{
"epoch": 0.9464600538183168,
"grad_norm": 0.37241825461387634,
"learning_rate": 2.327931659149129e-05,
"loss": 0.8205,
"step": 1275
},
{
"epoch": 0.9472023754291546,
"grad_norm": 0.42290613055229187,
"learning_rate": 2.325542001810712e-05,
"loss": 0.8871,
"step": 1276
},
{
"epoch": 0.9479446970399926,
"grad_norm": 0.40947234630584717,
"learning_rate": 2.32315186697955e-05,
"loss": 0.8654,
"step": 1277
},
{
"epoch": 0.9486870186508305,
"grad_norm": 0.4005737900733948,
"learning_rate": 2.3207612581614005e-05,
"loss": 0.9048,
"step": 1278
},
{
"epoch": 0.9494293402616684,
"grad_norm": 0.38072773814201355,
"learning_rate": 2.3183701788627192e-05,
"loss": 0.766,
"step": 1279
},
{
"epoch": 0.9501716618725062,
"grad_norm": 0.3864803612232208,
"learning_rate": 2.3159786325906524e-05,
"loss": 0.8,
"step": 1280
},
{
"epoch": 0.9509139834833441,
"grad_norm": 0.3687272071838379,
"learning_rate": 2.313586622853028e-05,
"loss": 0.7775,
"step": 1281
},
{
"epoch": 0.9516563050941821,
"grad_norm": 0.37793952226638794,
"learning_rate": 2.3111941531583573e-05,
"loss": 0.8045,
"step": 1282
},
{
"epoch": 0.95239862670502,
"grad_norm": 0.3550565540790558,
"learning_rate": 2.3088012270158244e-05,
"loss": 0.8476,
"step": 1283
},
{
"epoch": 0.9531409483158578,
"grad_norm": 0.3786463737487793,
"learning_rate": 2.3064078479352833e-05,
"loss": 0.7822,
"step": 1284
},
{
"epoch": 0.9538832699266957,
"grad_norm": 0.45172375440597534,
"learning_rate": 2.3040140194272515e-05,
"loss": 0.8627,
"step": 1285
},
{
"epoch": 0.9546255915375337,
"grad_norm": 0.44080403447151184,
"learning_rate": 2.3016197450029076e-05,
"loss": 0.9437,
"step": 1286
},
{
"epoch": 0.9553679131483716,
"grad_norm": 0.40898117423057556,
"learning_rate": 2.299225028174082e-05,
"loss": 0.8085,
"step": 1287
},
{
"epoch": 0.9561102347592094,
"grad_norm": 0.41421830654144287,
"learning_rate": 2.2968298724532562e-05,
"loss": 0.8255,
"step": 1288
},
{
"epoch": 0.9568525563700473,
"grad_norm": 0.33999302983283997,
"learning_rate": 2.294434281353554e-05,
"loss": 0.9014,
"step": 1289
},
{
"epoch": 0.9575948779808853,
"grad_norm": 0.47232887148857117,
"learning_rate": 2.292038258388738e-05,
"loss": 1.0212,
"step": 1290
},
{
"epoch": 0.9583371995917231,
"grad_norm": 0.37770771980285645,
"learning_rate": 2.2896418070732048e-05,
"loss": 0.7837,
"step": 1291
},
{
"epoch": 0.959079521202561,
"grad_norm": 0.36593371629714966,
"learning_rate": 2.2872449309219778e-05,
"loss": 0.7225,
"step": 1292
},
{
"epoch": 0.9598218428133989,
"grad_norm": 0.36546435952186584,
"learning_rate": 2.2848476334507068e-05,
"loss": 0.7716,
"step": 1293
},
{
"epoch": 0.9605641644242368,
"grad_norm": 0.3623177111148834,
"learning_rate": 2.2824499181756555e-05,
"loss": 0.7491,
"step": 1294
},
{
"epoch": 0.9613064860350747,
"grad_norm": 0.3944539427757263,
"learning_rate": 2.280051788613703e-05,
"loss": 0.7789,
"step": 1295
},
{
"epoch": 0.9620488076459126,
"grad_norm": 0.42587587237358093,
"learning_rate": 2.277653248282336e-05,
"loss": 0.7938,
"step": 1296
},
{
"epoch": 0.9627911292567505,
"grad_norm": 0.41713660955429077,
"learning_rate": 2.275254300699642e-05,
"loss": 0.8678,
"step": 1297
},
{
"epoch": 0.9635334508675883,
"grad_norm": 0.402723491191864,
"learning_rate": 2.272854949384308e-05,
"loss": 0.8656,
"step": 1298
},
{
"epoch": 0.9642757724784263,
"grad_norm": 0.3979452848434448,
"learning_rate": 2.2704551978556112e-05,
"loss": 0.8606,
"step": 1299
},
{
"epoch": 0.9650180940892642,
"grad_norm": 0.3622892200946808,
"learning_rate": 2.2680550496334176e-05,
"loss": 0.7591,
"step": 1300
},
{
"epoch": 0.9657604157001021,
"grad_norm": 0.4129877984523773,
"learning_rate": 2.265654508238174e-05,
"loss": 0.8572,
"step": 1301
},
{
"epoch": 0.9665027373109399,
"grad_norm": 0.38189956545829773,
"learning_rate": 2.2632535771909036e-05,
"loss": 0.8457,
"step": 1302
},
{
"epoch": 0.9672450589217778,
"grad_norm": 0.3791535794734955,
"learning_rate": 2.2608522600132017e-05,
"loss": 0.7564,
"step": 1303
},
{
"epoch": 0.9679873805326158,
"grad_norm": 0.3876500427722931,
"learning_rate": 2.2584505602272305e-05,
"loss": 0.8574,
"step": 1304
},
{
"epoch": 0.9687297021434537,
"grad_norm": 0.36182138323783875,
"learning_rate": 2.2560484813557122e-05,
"loss": 0.7202,
"step": 1305
},
{
"epoch": 0.9694720237542915,
"grad_norm": 0.3705620765686035,
"learning_rate": 2.253646026921926e-05,
"loss": 0.7622,
"step": 1306
},
{
"epoch": 0.9702143453651294,
"grad_norm": 0.3661751449108124,
"learning_rate": 2.2512432004497015e-05,
"loss": 0.7586,
"step": 1307
},
{
"epoch": 0.9709566669759674,
"grad_norm": 0.35583674907684326,
"learning_rate": 2.248840005463414e-05,
"loss": 0.8295,
"step": 1308
},
{
"epoch": 0.9716989885868053,
"grad_norm": 0.34773266315460205,
"learning_rate": 2.2464364454879792e-05,
"loss": 0.8581,
"step": 1309
},
{
"epoch": 0.9724413101976431,
"grad_norm": 0.35552486777305603,
"learning_rate": 2.2440325240488484e-05,
"loss": 0.8472,
"step": 1310
},
{
"epoch": 0.973183631808481,
"grad_norm": 0.332487016916275,
"learning_rate": 2.241628244672003e-05,
"loss": 0.791,
"step": 1311
},
{
"epoch": 0.9739259534193189,
"grad_norm": 0.41265782713890076,
"learning_rate": 2.2392236108839503e-05,
"loss": 0.8364,
"step": 1312
},
{
"epoch": 0.9746682750301568,
"grad_norm": 0.4064471125602722,
"learning_rate": 2.236818626211715e-05,
"loss": 0.8414,
"step": 1313
},
{
"epoch": 0.9754105966409947,
"grad_norm": 0.3656854033470154,
"learning_rate": 2.2344132941828395e-05,
"loss": 0.7787,
"step": 1314
},
{
"epoch": 0.9761529182518326,
"grad_norm": 0.40122607350349426,
"learning_rate": 2.2320076183253733e-05,
"loss": 0.8778,
"step": 1315
},
{
"epoch": 0.9768952398626705,
"grad_norm": 0.46013033390045166,
"learning_rate": 2.229601602167871e-05,
"loss": 0.8577,
"step": 1316
},
{
"epoch": 0.9776375614735084,
"grad_norm": 0.41439104080200195,
"learning_rate": 2.227195249239387e-05,
"loss": 0.7765,
"step": 1317
},
{
"epoch": 0.9783798830843463,
"grad_norm": 0.422951340675354,
"learning_rate": 2.224788563069469e-05,
"loss": 0.8019,
"step": 1318
},
{
"epoch": 0.9791222046951842,
"grad_norm": 0.3669542372226715,
"learning_rate": 2.2223815471881543e-05,
"loss": 0.7908,
"step": 1319
},
{
"epoch": 0.979864526306022,
"grad_norm": 0.39412543177604675,
"learning_rate": 2.2199742051259617e-05,
"loss": 0.8098,
"step": 1320
},
{
"epoch": 0.9806068479168599,
"grad_norm": 0.41053861379623413,
"learning_rate": 2.2175665404138907e-05,
"loss": 0.7639,
"step": 1321
},
{
"epoch": 0.9813491695276979,
"grad_norm": 0.44688349962234497,
"learning_rate": 2.215158556583414e-05,
"loss": 0.707,
"step": 1322
},
{
"epoch": 0.9820914911385358,
"grad_norm": 0.4231646955013275,
"learning_rate": 2.2127502571664698e-05,
"loss": 0.794,
"step": 1323
},
{
"epoch": 0.9828338127493736,
"grad_norm": 0.3481779992580414,
"learning_rate": 2.2103416456954623e-05,
"loss": 0.7119,
"step": 1324
},
{
"epoch": 0.9835761343602115,
"grad_norm": 0.4194432497024536,
"learning_rate": 2.2079327257032515e-05,
"loss": 0.8184,
"step": 1325
},
{
"epoch": 0.9843184559710495,
"grad_norm": 0.48259228467941284,
"learning_rate": 2.2055235007231507e-05,
"loss": 0.8588,
"step": 1326
},
{
"epoch": 0.9850607775818874,
"grad_norm": 0.33273354172706604,
"learning_rate": 2.203113974288921e-05,
"loss": 0.7598,
"step": 1327
},
{
"epoch": 0.9858030991927252,
"grad_norm": 0.3975766599178314,
"learning_rate": 2.2007041499347634e-05,
"loss": 0.7695,
"step": 1328
},
{
"epoch": 0.9865454208035631,
"grad_norm": 0.3873244524002075,
"learning_rate": 2.1982940311953187e-05,
"loss": 0.8279,
"step": 1329
},
{
"epoch": 0.9872877424144011,
"grad_norm": 0.3644872307777405,
"learning_rate": 2.195883621605658e-05,
"loss": 0.7318,
"step": 1330
},
{
"epoch": 0.988030064025239,
"grad_norm": 0.4143805205821991,
"learning_rate": 2.1934729247012793e-05,
"loss": 0.8224,
"step": 1331
},
{
"epoch": 0.9887723856360768,
"grad_norm": 0.372543066740036,
"learning_rate": 2.1910619440181025e-05,
"loss": 0.8265,
"step": 1332
},
{
"epoch": 0.9895147072469147,
"grad_norm": 0.34617361426353455,
"learning_rate": 2.1886506830924625e-05,
"loss": 0.7999,
"step": 1333
},
{
"epoch": 0.9902570288577526,
"grad_norm": 0.4121123254299164,
"learning_rate": 2.186239145461106e-05,
"loss": 0.7929,
"step": 1334
},
{
"epoch": 0.9909993504685906,
"grad_norm": 0.41998445987701416,
"learning_rate": 2.1838273346611866e-05,
"loss": 0.8936,
"step": 1335
},
{
"epoch": 0.9917416720794284,
"grad_norm": 0.3765590488910675,
"learning_rate": 2.181415254230256e-05,
"loss": 0.8088,
"step": 1336
},
{
"epoch": 0.9924839936902663,
"grad_norm": 0.426413893699646,
"learning_rate": 2.1790029077062643e-05,
"loss": 0.8305,
"step": 1337
},
{
"epoch": 0.9932263153011042,
"grad_norm": 0.38797062635421753,
"learning_rate": 2.17659029862755e-05,
"loss": 0.6533,
"step": 1338
},
{
"epoch": 0.9939686369119421,
"grad_norm": 0.4096301794052124,
"learning_rate": 2.1741774305328365e-05,
"loss": 0.8569,
"step": 1339
},
{
"epoch": 0.99471095852278,
"grad_norm": 0.402988076210022,
"learning_rate": 2.1717643069612296e-05,
"loss": 0.799,
"step": 1340
},
{
"epoch": 0.9954532801336179,
"grad_norm": 0.5020378232002258,
"learning_rate": 2.169350931452206e-05,
"loss": 0.9032,
"step": 1341
},
{
"epoch": 0.9961956017444558,
"grad_norm": 0.4305264353752136,
"learning_rate": 2.1669373075456154e-05,
"loss": 0.9215,
"step": 1342
},
{
"epoch": 0.9969379233552936,
"grad_norm": 0.43394702672958374,
"learning_rate": 2.1645234387816695e-05,
"loss": 0.8829,
"step": 1343
},
{
"epoch": 0.9976802449661316,
"grad_norm": 0.4227968752384186,
"learning_rate": 2.1621093287009406e-05,
"loss": 0.8431,
"step": 1344
},
{
"epoch": 0.9984225665769695,
"grad_norm": 0.4018241763114929,
"learning_rate": 2.1596949808443553e-05,
"loss": 0.7647,
"step": 1345
},
{
"epoch": 0.9991648881878074,
"grad_norm": 0.36082741618156433,
"learning_rate": 2.1572803987531864e-05,
"loss": 0.7796,
"step": 1346
},
{
"epoch": 0.9999072097986452,
"grad_norm": 0.4092870354652405,
"learning_rate": 2.154865585969053e-05,
"loss": 0.8429,
"step": 1347
}
],
"logging_steps": 1,
"max_steps": 2694,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 1.6697839224954552e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}