fietje-2 / checkpoint-900 /trainer_state.json
BramVanroy's picture
Training in progress, step 900, checkpoint
a9d7cd5 verified
raw
history blame
144 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.13177802723412563,
"eval_steps": 900,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.867297319395897,
"learning_rate": 8.99868209108215e-05,
"loss": 3.2458,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 31.122135116455087,
"learning_rate": 8.9973641821643e-05,
"loss": 6.8269,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 10.493605611332246,
"learning_rate": 8.99604627324645e-05,
"loss": 6.1526,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 20.993597101446063,
"learning_rate": 8.994728364328598e-05,
"loss": 6.2791,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 6.994941997336733,
"learning_rate": 8.993410455410749e-05,
"loss": 5.5069,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 9.044801050606559,
"learning_rate": 8.992092546492898e-05,
"loss": 5.2917,
"step": 6
},
{
"epoch": 0.0,
"grad_norm": 7.371025975927372,
"learning_rate": 8.990774637575048e-05,
"loss": 5.0841,
"step": 7
},
{
"epoch": 0.0,
"grad_norm": 3.095134034689849,
"learning_rate": 8.989456728657198e-05,
"loss": 4.6676,
"step": 8
},
{
"epoch": 0.0,
"grad_norm": 2.727749907194312,
"learning_rate": 8.988138819739348e-05,
"loss": 4.4157,
"step": 9
},
{
"epoch": 0.0,
"grad_norm": 3.1785847861912533,
"learning_rate": 8.986820910821497e-05,
"loss": 4.3083,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 1.5890441000530777,
"learning_rate": 8.985503001903648e-05,
"loss": 4.0652,
"step": 11
},
{
"epoch": 0.0,
"grad_norm": 3.7851888183485998,
"learning_rate": 8.984185092985796e-05,
"loss": 4.0157,
"step": 12
},
{
"epoch": 0.0,
"grad_norm": 2.177578439182146,
"learning_rate": 8.982867184067945e-05,
"loss": 3.8923,
"step": 13
},
{
"epoch": 0.0,
"grad_norm": 4.189312958133593,
"learning_rate": 8.981549275150096e-05,
"loss": 3.8165,
"step": 14
},
{
"epoch": 0.0,
"grad_norm": 2.135971084370728,
"learning_rate": 8.980231366232245e-05,
"loss": 3.7407,
"step": 15
},
{
"epoch": 0.0,
"grad_norm": 1.6742409253843133,
"learning_rate": 8.978913457314395e-05,
"loss": 3.6203,
"step": 16
},
{
"epoch": 0.0,
"grad_norm": 1.740647923231835,
"learning_rate": 8.977595548396545e-05,
"loss": 3.5358,
"step": 17
},
{
"epoch": 0.0,
"grad_norm": 1.3386504358864526,
"learning_rate": 8.976277639478695e-05,
"loss": 3.4649,
"step": 18
},
{
"epoch": 0.0,
"grad_norm": 1.4146027388570108,
"learning_rate": 8.974959730560844e-05,
"loss": 3.4096,
"step": 19
},
{
"epoch": 0.0,
"grad_norm": 1.3101076465055932,
"learning_rate": 8.973641821642993e-05,
"loss": 3.3495,
"step": 20
},
{
"epoch": 0.0,
"grad_norm": 1.3979232652764932,
"learning_rate": 8.972323912725143e-05,
"loss": 3.2864,
"step": 21
},
{
"epoch": 0.0,
"grad_norm": 1.112058959017875,
"learning_rate": 8.971006003807292e-05,
"loss": 3.2272,
"step": 22
},
{
"epoch": 0.0,
"grad_norm": 1.5723342224397572,
"learning_rate": 8.969688094889443e-05,
"loss": 3.202,
"step": 23
},
{
"epoch": 0.0,
"grad_norm": 1.6929219679520722,
"learning_rate": 8.968370185971592e-05,
"loss": 3.1556,
"step": 24
},
{
"epoch": 0.0,
"grad_norm": 1.0947475079787117,
"learning_rate": 8.967052277053742e-05,
"loss": 3.1127,
"step": 25
},
{
"epoch": 0.0,
"grad_norm": 1.0263062806535903,
"learning_rate": 8.965734368135892e-05,
"loss": 3.07,
"step": 26
},
{
"epoch": 0.0,
"grad_norm": 0.9314787514966034,
"learning_rate": 8.964416459218042e-05,
"loss": 3.0321,
"step": 27
},
{
"epoch": 0.0,
"grad_norm": 0.8258215193640202,
"learning_rate": 8.96309855030019e-05,
"loss": 2.9997,
"step": 28
},
{
"epoch": 0.0,
"grad_norm": 0.8854426518873304,
"learning_rate": 8.96178064138234e-05,
"loss": 2.9693,
"step": 29
},
{
"epoch": 0.0,
"grad_norm": 0.7136563954565109,
"learning_rate": 8.96046273246449e-05,
"loss": 2.9337,
"step": 30
},
{
"epoch": 0.0,
"grad_norm": 0.8520936236031108,
"learning_rate": 8.959144823546639e-05,
"loss": 2.8958,
"step": 31
},
{
"epoch": 0.0,
"grad_norm": 0.7069687542653055,
"learning_rate": 8.95782691462879e-05,
"loss": 2.8893,
"step": 32
},
{
"epoch": 0.0,
"grad_norm": 0.6112456126798674,
"learning_rate": 8.95650900571094e-05,
"loss": 2.8452,
"step": 33
},
{
"epoch": 0.0,
"grad_norm": 0.6182491443128697,
"learning_rate": 8.955191096793089e-05,
"loss": 2.8375,
"step": 34
},
{
"epoch": 0.01,
"grad_norm": 0.6354831752982139,
"learning_rate": 8.95387318787524e-05,
"loss": 2.8036,
"step": 35
},
{
"epoch": 0.01,
"grad_norm": 0.5551758024561351,
"learning_rate": 8.952555278957388e-05,
"loss": 2.7898,
"step": 36
},
{
"epoch": 0.01,
"grad_norm": 0.5385458807945395,
"learning_rate": 8.951237370039537e-05,
"loss": 2.7654,
"step": 37
},
{
"epoch": 0.01,
"grad_norm": 0.611312479127073,
"learning_rate": 8.949919461121688e-05,
"loss": 2.7404,
"step": 38
},
{
"epoch": 0.01,
"grad_norm": 0.41994683299580327,
"learning_rate": 8.948601552203837e-05,
"loss": 2.727,
"step": 39
},
{
"epoch": 0.01,
"grad_norm": 0.4844003990796705,
"learning_rate": 8.947283643285986e-05,
"loss": 2.7055,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 0.5096807026587409,
"learning_rate": 8.945965734368137e-05,
"loss": 2.6912,
"step": 41
},
{
"epoch": 0.01,
"grad_norm": 0.7522366060460143,
"learning_rate": 8.944647825450286e-05,
"loss": 2.681,
"step": 42
},
{
"epoch": 0.01,
"grad_norm": 0.9331119398395223,
"learning_rate": 8.943329916532436e-05,
"loss": 2.6557,
"step": 43
},
{
"epoch": 0.01,
"grad_norm": 1.012546232060092,
"learning_rate": 8.942012007614585e-05,
"loss": 2.6533,
"step": 44
},
{
"epoch": 0.01,
"grad_norm": 0.8636983952962917,
"learning_rate": 8.940694098696735e-05,
"loss": 2.6397,
"step": 45
},
{
"epoch": 0.01,
"grad_norm": 0.7226119626951081,
"learning_rate": 8.939376189778884e-05,
"loss": 2.6172,
"step": 46
},
{
"epoch": 0.01,
"grad_norm": 0.8159190836781818,
"learning_rate": 8.938058280861035e-05,
"loss": 2.601,
"step": 47
},
{
"epoch": 0.01,
"grad_norm": 0.8397723356140642,
"learning_rate": 8.936740371943184e-05,
"loss": 2.5913,
"step": 48
},
{
"epoch": 0.01,
"grad_norm": 0.9302397602027757,
"learning_rate": 8.935422463025333e-05,
"loss": 2.5779,
"step": 49
},
{
"epoch": 0.01,
"grad_norm": 0.9294815366036785,
"learning_rate": 8.934104554107484e-05,
"loss": 2.5666,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 0.9876017326778281,
"learning_rate": 8.932786645189634e-05,
"loss": 2.559,
"step": 51
},
{
"epoch": 0.01,
"grad_norm": 1.0531686104317621,
"learning_rate": 8.931468736271782e-05,
"loss": 2.5426,
"step": 52
},
{
"epoch": 0.01,
"grad_norm": 0.9432926643235091,
"learning_rate": 8.930150827353932e-05,
"loss": 2.5324,
"step": 53
},
{
"epoch": 0.01,
"grad_norm": 1.0147959053073372,
"learning_rate": 8.928832918436082e-05,
"loss": 2.5166,
"step": 54
},
{
"epoch": 0.01,
"grad_norm": 0.8370421203774538,
"learning_rate": 8.927515009518231e-05,
"loss": 2.5035,
"step": 55
},
{
"epoch": 0.01,
"grad_norm": 0.7658170492109742,
"learning_rate": 8.926197100600382e-05,
"loss": 2.5064,
"step": 56
},
{
"epoch": 0.01,
"grad_norm": 0.5836453359341242,
"learning_rate": 8.924879191682531e-05,
"loss": 2.4889,
"step": 57
},
{
"epoch": 0.01,
"grad_norm": 0.6774145346502838,
"learning_rate": 8.92356128276468e-05,
"loss": 2.4773,
"step": 58
},
{
"epoch": 0.01,
"grad_norm": 0.6303115552198764,
"learning_rate": 8.922243373846831e-05,
"loss": 2.4746,
"step": 59
},
{
"epoch": 0.01,
"grad_norm": 0.4733727246322016,
"learning_rate": 8.920925464928979e-05,
"loss": 2.4583,
"step": 60
},
{
"epoch": 0.01,
"grad_norm": 0.66470201831204,
"learning_rate": 8.919607556011129e-05,
"loss": 2.4509,
"step": 61
},
{
"epoch": 0.01,
"grad_norm": 0.6356308149108085,
"learning_rate": 8.91828964709328e-05,
"loss": 2.4375,
"step": 62
},
{
"epoch": 0.01,
"grad_norm": 0.46831466582575737,
"learning_rate": 8.916971738175429e-05,
"loss": 2.4314,
"step": 63
},
{
"epoch": 0.01,
"grad_norm": 0.5711699065497955,
"learning_rate": 8.915653829257578e-05,
"loss": 2.42,
"step": 64
},
{
"epoch": 0.01,
"grad_norm": 0.5389976605269295,
"learning_rate": 8.914335920339729e-05,
"loss": 2.4161,
"step": 65
},
{
"epoch": 0.01,
"grad_norm": 0.5007278366304382,
"learning_rate": 8.913018011421878e-05,
"loss": 2.3977,
"step": 66
},
{
"epoch": 0.01,
"grad_norm": 0.487467802029198,
"learning_rate": 8.911700102504028e-05,
"loss": 2.4007,
"step": 67
},
{
"epoch": 0.01,
"grad_norm": 0.4113420784339612,
"learning_rate": 8.910382193586177e-05,
"loss": 2.3966,
"step": 68
},
{
"epoch": 0.01,
"grad_norm": 0.4504091272252521,
"learning_rate": 8.909064284668326e-05,
"loss": 2.3862,
"step": 69
},
{
"epoch": 0.01,
"grad_norm": 0.34513031845084635,
"learning_rate": 8.907746375750476e-05,
"loss": 2.3712,
"step": 70
},
{
"epoch": 0.01,
"grad_norm": 0.5047955438477213,
"learning_rate": 8.906428466832626e-05,
"loss": 2.3695,
"step": 71
},
{
"epoch": 0.01,
"grad_norm": 0.5024769592440181,
"learning_rate": 8.905110557914776e-05,
"loss": 2.3644,
"step": 72
},
{
"epoch": 0.01,
"grad_norm": 0.4721080086046586,
"learning_rate": 8.903792648996925e-05,
"loss": 2.3555,
"step": 73
},
{
"epoch": 0.01,
"grad_norm": 0.4591297268523745,
"learning_rate": 8.902474740079076e-05,
"loss": 2.3376,
"step": 74
},
{
"epoch": 0.01,
"grad_norm": 0.5656344865220397,
"learning_rate": 8.901156831161225e-05,
"loss": 2.3398,
"step": 75
},
{
"epoch": 0.01,
"grad_norm": 0.6500485453460426,
"learning_rate": 8.899838922243373e-05,
"loss": 2.3325,
"step": 76
},
{
"epoch": 0.01,
"grad_norm": 0.6536690673884828,
"learning_rate": 8.898521013325524e-05,
"loss": 2.3345,
"step": 77
},
{
"epoch": 0.01,
"grad_norm": 0.5143484723108316,
"learning_rate": 8.897203104407673e-05,
"loss": 2.3139,
"step": 78
},
{
"epoch": 0.01,
"grad_norm": 0.37322968848559845,
"learning_rate": 8.895885195489823e-05,
"loss": 2.3079,
"step": 79
},
{
"epoch": 0.01,
"grad_norm": 0.34462618220763114,
"learning_rate": 8.894567286571974e-05,
"loss": 2.2973,
"step": 80
},
{
"epoch": 0.01,
"grad_norm": 0.38708375377202425,
"learning_rate": 8.893249377654123e-05,
"loss": 2.2951,
"step": 81
},
{
"epoch": 0.01,
"grad_norm": 0.421171485688958,
"learning_rate": 8.891931468736272e-05,
"loss": 2.2951,
"step": 82
},
{
"epoch": 0.01,
"grad_norm": 0.4465155129873313,
"learning_rate": 8.890613559818423e-05,
"loss": 2.2908,
"step": 83
},
{
"epoch": 0.01,
"grad_norm": 0.42100671498868236,
"learning_rate": 8.889295650900571e-05,
"loss": 2.2862,
"step": 84
},
{
"epoch": 0.01,
"grad_norm": 0.4308804247735972,
"learning_rate": 8.88797774198272e-05,
"loss": 2.2868,
"step": 85
},
{
"epoch": 0.01,
"grad_norm": 0.4490696078386771,
"learning_rate": 8.886659833064871e-05,
"loss": 2.2784,
"step": 86
},
{
"epoch": 0.01,
"grad_norm": 0.4720132579092968,
"learning_rate": 8.88534192414702e-05,
"loss": 2.2596,
"step": 87
},
{
"epoch": 0.01,
"grad_norm": 0.5051381693356659,
"learning_rate": 8.88402401522917e-05,
"loss": 2.2644,
"step": 88
},
{
"epoch": 0.01,
"grad_norm": 0.5161793827955174,
"learning_rate": 8.88270610631132e-05,
"loss": 2.2561,
"step": 89
},
{
"epoch": 0.01,
"grad_norm": 0.5947828202225921,
"learning_rate": 8.88138819739347e-05,
"loss": 2.2485,
"step": 90
},
{
"epoch": 0.01,
"grad_norm": 0.7363411963455597,
"learning_rate": 8.88007028847562e-05,
"loss": 2.256,
"step": 91
},
{
"epoch": 0.01,
"grad_norm": 0.7827586632222484,
"learning_rate": 8.878752379557769e-05,
"loss": 2.2498,
"step": 92
},
{
"epoch": 0.01,
"grad_norm": 0.5518918442765439,
"learning_rate": 8.877434470639918e-05,
"loss": 2.242,
"step": 93
},
{
"epoch": 0.01,
"grad_norm": 0.47976621209079673,
"learning_rate": 8.876116561722067e-05,
"loss": 2.2336,
"step": 94
},
{
"epoch": 0.01,
"grad_norm": 0.6515602255953327,
"learning_rate": 8.874798652804218e-05,
"loss": 2.2348,
"step": 95
},
{
"epoch": 0.01,
"grad_norm": 0.6536808073233461,
"learning_rate": 8.873480743886368e-05,
"loss": 2.2269,
"step": 96
},
{
"epoch": 0.01,
"grad_norm": 0.45429917400178504,
"learning_rate": 8.872162834968517e-05,
"loss": 2.2215,
"step": 97
},
{
"epoch": 0.01,
"grad_norm": 0.6059083408950834,
"learning_rate": 8.870844926050668e-05,
"loss": 2.2141,
"step": 98
},
{
"epoch": 0.01,
"grad_norm": 0.6153517456337565,
"learning_rate": 8.869527017132817e-05,
"loss": 2.2135,
"step": 99
},
{
"epoch": 0.01,
"grad_norm": 0.4350225773043892,
"learning_rate": 8.868209108214965e-05,
"loss": 2.2046,
"step": 100
},
{
"epoch": 0.01,
"grad_norm": 0.7538851308577357,
"learning_rate": 8.866891199297116e-05,
"loss": 2.1987,
"step": 101
},
{
"epoch": 0.01,
"grad_norm": 0.7783668567986096,
"learning_rate": 8.865573290379265e-05,
"loss": 2.1944,
"step": 102
},
{
"epoch": 0.02,
"grad_norm": 0.6152937329119922,
"learning_rate": 8.864255381461415e-05,
"loss": 2.1976,
"step": 103
},
{
"epoch": 0.02,
"grad_norm": 0.8129985392759269,
"learning_rate": 8.862937472543565e-05,
"loss": 2.1969,
"step": 104
},
{
"epoch": 0.02,
"grad_norm": 0.658961787535577,
"learning_rate": 8.861619563625715e-05,
"loss": 2.1916,
"step": 105
},
{
"epoch": 0.02,
"grad_norm": 0.5694890400136322,
"learning_rate": 8.860301654707864e-05,
"loss": 2.1853,
"step": 106
},
{
"epoch": 0.02,
"grad_norm": 0.8449981372046623,
"learning_rate": 8.858983745790015e-05,
"loss": 2.1918,
"step": 107
},
{
"epoch": 0.02,
"grad_norm": 0.6504754487654488,
"learning_rate": 8.857665836872163e-05,
"loss": 2.18,
"step": 108
},
{
"epoch": 0.02,
"grad_norm": 0.5770328719385773,
"learning_rate": 8.856347927954312e-05,
"loss": 2.1726,
"step": 109
},
{
"epoch": 0.02,
"grad_norm": 0.876682475915139,
"learning_rate": 8.855030019036463e-05,
"loss": 2.1697,
"step": 110
},
{
"epoch": 0.02,
"grad_norm": 0.6193197639347029,
"learning_rate": 8.853712110118612e-05,
"loss": 2.1762,
"step": 111
},
{
"epoch": 0.02,
"grad_norm": 0.5205430357287273,
"learning_rate": 8.852394201200762e-05,
"loss": 2.1643,
"step": 112
},
{
"epoch": 0.02,
"grad_norm": 0.6357317276650712,
"learning_rate": 8.851076292282912e-05,
"loss": 2.1698,
"step": 113
},
{
"epoch": 0.02,
"grad_norm": 0.5337930185336511,
"learning_rate": 8.849758383365062e-05,
"loss": 2.1533,
"step": 114
},
{
"epoch": 0.02,
"grad_norm": 0.5062081503044676,
"learning_rate": 8.848440474447211e-05,
"loss": 2.1554,
"step": 115
},
{
"epoch": 0.02,
"grad_norm": 0.4764632463998196,
"learning_rate": 8.84712256552936e-05,
"loss": 2.1505,
"step": 116
},
{
"epoch": 0.02,
"grad_norm": 0.41457257334024383,
"learning_rate": 8.84580465661151e-05,
"loss": 2.1467,
"step": 117
},
{
"epoch": 0.02,
"grad_norm": 0.5065884475761684,
"learning_rate": 8.844486747693659e-05,
"loss": 2.1422,
"step": 118
},
{
"epoch": 0.02,
"grad_norm": 0.4564614277589642,
"learning_rate": 8.84316883877581e-05,
"loss": 2.1412,
"step": 119
},
{
"epoch": 0.02,
"grad_norm": 0.3554317111825436,
"learning_rate": 8.84185092985796e-05,
"loss": 2.1455,
"step": 120
},
{
"epoch": 0.02,
"grad_norm": 0.42806997535938857,
"learning_rate": 8.840533020940109e-05,
"loss": 2.1275,
"step": 121
},
{
"epoch": 0.02,
"grad_norm": 0.42000313047228083,
"learning_rate": 8.83921511202226e-05,
"loss": 2.1342,
"step": 122
},
{
"epoch": 0.02,
"grad_norm": 0.3885632469021741,
"learning_rate": 8.837897203104409e-05,
"loss": 2.1307,
"step": 123
},
{
"epoch": 0.02,
"grad_norm": 0.47343669511945285,
"learning_rate": 8.836579294186557e-05,
"loss": 2.1317,
"step": 124
},
{
"epoch": 0.02,
"grad_norm": 0.48475235283422596,
"learning_rate": 8.835261385268708e-05,
"loss": 2.1161,
"step": 125
},
{
"epoch": 0.02,
"grad_norm": 0.566942141589765,
"learning_rate": 8.833943476350857e-05,
"loss": 2.1182,
"step": 126
},
{
"epoch": 0.02,
"grad_norm": 0.6962692105838626,
"learning_rate": 8.832625567433006e-05,
"loss": 2.1201,
"step": 127
},
{
"epoch": 0.02,
"grad_norm": 0.8872705283662579,
"learning_rate": 8.831307658515157e-05,
"loss": 2.1125,
"step": 128
},
{
"epoch": 0.02,
"grad_norm": 1.0040048160020307,
"learning_rate": 8.829989749597306e-05,
"loss": 2.1243,
"step": 129
},
{
"epoch": 0.02,
"grad_norm": 0.8763157343936164,
"learning_rate": 8.828671840679456e-05,
"loss": 2.135,
"step": 130
},
{
"epoch": 0.02,
"grad_norm": 0.4928878082025761,
"learning_rate": 8.827353931761607e-05,
"loss": 2.1159,
"step": 131
},
{
"epoch": 0.02,
"grad_norm": 0.5070687239013271,
"learning_rate": 8.826036022843755e-05,
"loss": 2.1048,
"step": 132
},
{
"epoch": 0.02,
"grad_norm": 0.7840409646534107,
"learning_rate": 8.824718113925904e-05,
"loss": 2.1037,
"step": 133
},
{
"epoch": 0.02,
"grad_norm": 0.5221585439101886,
"learning_rate": 8.823400205008055e-05,
"loss": 2.0959,
"step": 134
},
{
"epoch": 0.02,
"grad_norm": 0.4568661842831729,
"learning_rate": 8.822082296090204e-05,
"loss": 2.0918,
"step": 135
},
{
"epoch": 0.02,
"grad_norm": 0.5841088902261057,
"learning_rate": 8.820764387172353e-05,
"loss": 2.0916,
"step": 136
},
{
"epoch": 0.02,
"grad_norm": 0.5321596782302892,
"learning_rate": 8.819446478254504e-05,
"loss": 2.0897,
"step": 137
},
{
"epoch": 0.02,
"grad_norm": 0.4811464623013186,
"learning_rate": 8.818128569336654e-05,
"loss": 2.0797,
"step": 138
},
{
"epoch": 0.02,
"grad_norm": 0.45409201842843017,
"learning_rate": 8.816810660418803e-05,
"loss": 2.0809,
"step": 139
},
{
"epoch": 0.02,
"grad_norm": 0.5436744181229896,
"learning_rate": 8.815492751500952e-05,
"loss": 2.0768,
"step": 140
},
{
"epoch": 0.02,
"grad_norm": 0.465625799362755,
"learning_rate": 8.814174842583102e-05,
"loss": 2.0801,
"step": 141
},
{
"epoch": 0.02,
"grad_norm": 0.4631920693131495,
"learning_rate": 8.812856933665251e-05,
"loss": 2.0729,
"step": 142
},
{
"epoch": 0.02,
"grad_norm": 0.4973179737348441,
"learning_rate": 8.811539024747402e-05,
"loss": 2.0765,
"step": 143
},
{
"epoch": 0.02,
"grad_norm": 0.3900332148198551,
"learning_rate": 8.810221115829551e-05,
"loss": 2.0688,
"step": 144
},
{
"epoch": 0.02,
"grad_norm": 0.45058451016387613,
"learning_rate": 8.8089032069117e-05,
"loss": 2.0684,
"step": 145
},
{
"epoch": 0.02,
"grad_norm": 0.35118754238330746,
"learning_rate": 8.807585297993851e-05,
"loss": 2.0681,
"step": 146
},
{
"epoch": 0.02,
"grad_norm": 0.33942242720035193,
"learning_rate": 8.806267389076e-05,
"loss": 2.0626,
"step": 147
},
{
"epoch": 0.02,
"grad_norm": 0.34835606297437544,
"learning_rate": 8.804949480158149e-05,
"loss": 2.0611,
"step": 148
},
{
"epoch": 0.02,
"grad_norm": 0.33374548138193794,
"learning_rate": 8.8036315712403e-05,
"loss": 2.0589,
"step": 149
},
{
"epoch": 0.02,
"grad_norm": 0.3554684609547751,
"learning_rate": 8.802313662322449e-05,
"loss": 2.0531,
"step": 150
},
{
"epoch": 0.02,
"grad_norm": 0.30356995848645646,
"learning_rate": 8.800995753404598e-05,
"loss": 2.0527,
"step": 151
},
{
"epoch": 0.02,
"grad_norm": 0.3749048670516549,
"learning_rate": 8.799677844486749e-05,
"loss": 2.0409,
"step": 152
},
{
"epoch": 0.02,
"grad_norm": 0.44906357358681387,
"learning_rate": 8.798359935568898e-05,
"loss": 2.0455,
"step": 153
},
{
"epoch": 0.02,
"grad_norm": 0.4937506907032551,
"learning_rate": 8.797042026651048e-05,
"loss": 2.0593,
"step": 154
},
{
"epoch": 0.02,
"grad_norm": 0.6302087869169094,
"learning_rate": 8.795724117733198e-05,
"loss": 2.0444,
"step": 155
},
{
"epoch": 0.02,
"grad_norm": 0.7666465190058978,
"learning_rate": 8.794406208815346e-05,
"loss": 2.0482,
"step": 156
},
{
"epoch": 0.02,
"grad_norm": 0.8415113455563511,
"learning_rate": 8.793088299897496e-05,
"loss": 2.0553,
"step": 157
},
{
"epoch": 0.02,
"grad_norm": 0.6599046675210126,
"learning_rate": 8.791770390979646e-05,
"loss": 2.0346,
"step": 158
},
{
"epoch": 0.02,
"grad_norm": 0.4286698284091557,
"learning_rate": 8.790452482061796e-05,
"loss": 2.0364,
"step": 159
},
{
"epoch": 0.02,
"grad_norm": 0.4636961335954888,
"learning_rate": 8.789134573143945e-05,
"loss": 2.027,
"step": 160
},
{
"epoch": 0.02,
"grad_norm": 0.4729809233526084,
"learning_rate": 8.787816664226096e-05,
"loss": 2.0344,
"step": 161
},
{
"epoch": 0.02,
"grad_norm": 0.5102897103629896,
"learning_rate": 8.786498755308245e-05,
"loss": 2.0313,
"step": 162
},
{
"epoch": 0.02,
"grad_norm": 0.3821986499935569,
"learning_rate": 8.785180846390395e-05,
"loss": 2.0281,
"step": 163
},
{
"epoch": 0.02,
"grad_norm": 0.31922482573170025,
"learning_rate": 8.783862937472544e-05,
"loss": 2.0246,
"step": 164
},
{
"epoch": 0.02,
"grad_norm": 0.4125805076448191,
"learning_rate": 8.782545028554693e-05,
"loss": 2.0209,
"step": 165
},
{
"epoch": 0.02,
"grad_norm": 0.3584157670577966,
"learning_rate": 8.781227119636843e-05,
"loss": 2.0201,
"step": 166
},
{
"epoch": 0.02,
"grad_norm": 0.3732188737105076,
"learning_rate": 8.779909210718993e-05,
"loss": 2.0181,
"step": 167
},
{
"epoch": 0.02,
"grad_norm": 0.34438987877017796,
"learning_rate": 8.778591301801143e-05,
"loss": 2.0155,
"step": 168
},
{
"epoch": 0.02,
"grad_norm": 0.322936068818923,
"learning_rate": 8.777273392883292e-05,
"loss": 2.0206,
"step": 169
},
{
"epoch": 0.02,
"grad_norm": 0.3671965249502594,
"learning_rate": 8.775955483965443e-05,
"loss": 2.0215,
"step": 170
},
{
"epoch": 0.03,
"grad_norm": 0.3303955763338707,
"learning_rate": 8.774637575047592e-05,
"loss": 2.0134,
"step": 171
},
{
"epoch": 0.03,
"grad_norm": 0.26010553136423376,
"learning_rate": 8.77331966612974e-05,
"loss": 1.9998,
"step": 172
},
{
"epoch": 0.03,
"grad_norm": 0.28273036383624667,
"learning_rate": 8.772001757211891e-05,
"loss": 2.008,
"step": 173
},
{
"epoch": 0.03,
"grad_norm": 0.3493466030437232,
"learning_rate": 8.77068384829404e-05,
"loss": 2.0077,
"step": 174
},
{
"epoch": 0.03,
"grad_norm": 0.3240073311732844,
"learning_rate": 8.76936593937619e-05,
"loss": 2.0005,
"step": 175
},
{
"epoch": 0.03,
"grad_norm": 0.37871862295109215,
"learning_rate": 8.76804803045834e-05,
"loss": 1.9947,
"step": 176
},
{
"epoch": 0.03,
"grad_norm": 0.36425872526816194,
"learning_rate": 8.76673012154049e-05,
"loss": 2.0012,
"step": 177
},
{
"epoch": 0.03,
"grad_norm": 0.3146144912051344,
"learning_rate": 8.765412212622639e-05,
"loss": 1.9888,
"step": 178
},
{
"epoch": 0.03,
"grad_norm": 0.32672427059297815,
"learning_rate": 8.764094303704789e-05,
"loss": 1.9914,
"step": 179
},
{
"epoch": 0.03,
"grad_norm": 0.2874426352457461,
"learning_rate": 8.762776394786938e-05,
"loss": 1.991,
"step": 180
},
{
"epoch": 0.03,
"grad_norm": 0.304367486735733,
"learning_rate": 8.761458485869087e-05,
"loss": 1.9806,
"step": 181
},
{
"epoch": 0.03,
"grad_norm": 0.37875831866804927,
"learning_rate": 8.760140576951238e-05,
"loss": 1.9924,
"step": 182
},
{
"epoch": 0.03,
"grad_norm": 0.4835755191486126,
"learning_rate": 8.758822668033388e-05,
"loss": 1.9921,
"step": 183
},
{
"epoch": 0.03,
"grad_norm": 0.5565270140077677,
"learning_rate": 8.757504759115537e-05,
"loss": 1.9822,
"step": 184
},
{
"epoch": 0.03,
"grad_norm": 0.5464855527172483,
"learning_rate": 8.756186850197688e-05,
"loss": 1.9839,
"step": 185
},
{
"epoch": 0.03,
"grad_norm": 0.4989777454659821,
"learning_rate": 8.754868941279837e-05,
"loss": 1.9867,
"step": 186
},
{
"epoch": 0.03,
"grad_norm": 0.4574274365674215,
"learning_rate": 8.753551032361986e-05,
"loss": 1.9837,
"step": 187
},
{
"epoch": 0.03,
"grad_norm": 0.47786281828321875,
"learning_rate": 8.752233123444136e-05,
"loss": 1.989,
"step": 188
},
{
"epoch": 0.03,
"grad_norm": 0.5193952650470612,
"learning_rate": 8.750915214526285e-05,
"loss": 1.9721,
"step": 189
},
{
"epoch": 0.03,
"grad_norm": 0.46706899407183416,
"learning_rate": 8.749597305608435e-05,
"loss": 1.9721,
"step": 190
},
{
"epoch": 0.03,
"grad_norm": 0.41004377608522635,
"learning_rate": 8.748279396690585e-05,
"loss": 1.9736,
"step": 191
},
{
"epoch": 0.03,
"grad_norm": 0.4094480903013732,
"learning_rate": 8.746961487772735e-05,
"loss": 1.9714,
"step": 192
},
{
"epoch": 0.03,
"grad_norm": 0.5224817485503537,
"learning_rate": 8.745643578854884e-05,
"loss": 1.9775,
"step": 193
},
{
"epoch": 0.03,
"grad_norm": 0.5763427830733111,
"learning_rate": 8.744325669937035e-05,
"loss": 1.9691,
"step": 194
},
{
"epoch": 0.03,
"grad_norm": 0.45154944484866116,
"learning_rate": 8.743007761019184e-05,
"loss": 1.9685,
"step": 195
},
{
"epoch": 0.03,
"grad_norm": 0.29409144127731623,
"learning_rate": 8.741689852101332e-05,
"loss": 1.9665,
"step": 196
},
{
"epoch": 0.03,
"grad_norm": 0.3391706172784746,
"learning_rate": 8.740371943183483e-05,
"loss": 1.9657,
"step": 197
},
{
"epoch": 0.03,
"grad_norm": 0.3584680315892078,
"learning_rate": 8.739054034265632e-05,
"loss": 1.9662,
"step": 198
},
{
"epoch": 0.03,
"grad_norm": 0.32480662203931654,
"learning_rate": 8.737736125347782e-05,
"loss": 1.967,
"step": 199
},
{
"epoch": 0.03,
"grad_norm": 0.2678893712664796,
"learning_rate": 8.736418216429932e-05,
"loss": 1.9667,
"step": 200
},
{
"epoch": 0.03,
"grad_norm": 0.3034157270439372,
"learning_rate": 8.735100307512082e-05,
"loss": 1.9639,
"step": 201
},
{
"epoch": 0.03,
"grad_norm": 0.38638946729313517,
"learning_rate": 8.733782398594231e-05,
"loss": 1.9654,
"step": 202
},
{
"epoch": 0.03,
"grad_norm": 0.44116092753081665,
"learning_rate": 8.73246448967638e-05,
"loss": 1.9662,
"step": 203
},
{
"epoch": 0.03,
"grad_norm": 0.5599610453487487,
"learning_rate": 8.73114658075853e-05,
"loss": 1.9589,
"step": 204
},
{
"epoch": 0.03,
"grad_norm": 0.6813838919621874,
"learning_rate": 8.729828671840679e-05,
"loss": 1.96,
"step": 205
},
{
"epoch": 0.03,
"grad_norm": 0.6749810768948219,
"learning_rate": 8.72851076292283e-05,
"loss": 1.9669,
"step": 206
},
{
"epoch": 0.03,
"grad_norm": 0.5474380551907909,
"learning_rate": 8.727192854004979e-05,
"loss": 1.9479,
"step": 207
},
{
"epoch": 0.03,
"grad_norm": 0.39912501114471677,
"learning_rate": 8.725874945087129e-05,
"loss": 1.9591,
"step": 208
},
{
"epoch": 0.03,
"grad_norm": 0.3704777650253716,
"learning_rate": 8.72455703616928e-05,
"loss": 1.9535,
"step": 209
},
{
"epoch": 0.03,
"grad_norm": 0.45420800985949406,
"learning_rate": 8.723239127251429e-05,
"loss": 1.9476,
"step": 210
},
{
"epoch": 0.03,
"grad_norm": 0.5106582102448923,
"learning_rate": 8.721921218333578e-05,
"loss": 1.954,
"step": 211
},
{
"epoch": 0.03,
"grad_norm": 1.648206578856157,
"learning_rate": 8.720603309415728e-05,
"loss": 1.963,
"step": 212
},
{
"epoch": 0.03,
"grad_norm": 0.35693125434587636,
"learning_rate": 8.719285400497877e-05,
"loss": 1.9419,
"step": 213
},
{
"epoch": 0.03,
"grad_norm": 0.6519985094603863,
"learning_rate": 8.717967491580026e-05,
"loss": 1.9445,
"step": 214
},
{
"epoch": 0.03,
"grad_norm": 0.6567675718025047,
"learning_rate": 8.716649582662177e-05,
"loss": 1.9469,
"step": 215
},
{
"epoch": 0.03,
"grad_norm": 0.4872034695801568,
"learning_rate": 8.715331673744326e-05,
"loss": 1.9495,
"step": 216
},
{
"epoch": 0.03,
"grad_norm": 0.4145209119308714,
"learning_rate": 8.714013764826476e-05,
"loss": 1.9486,
"step": 217
},
{
"epoch": 0.03,
"grad_norm": 0.4775886307734156,
"learning_rate": 8.712695855908626e-05,
"loss": 1.9496,
"step": 218
},
{
"epoch": 0.03,
"grad_norm": 0.40804530533441385,
"learning_rate": 8.711377946990776e-05,
"loss": 1.9419,
"step": 219
},
{
"epoch": 0.03,
"grad_norm": 0.3313950875913946,
"learning_rate": 8.710060038072924e-05,
"loss": 1.9372,
"step": 220
},
{
"epoch": 0.03,
"grad_norm": 0.3887501325873841,
"learning_rate": 8.708742129155075e-05,
"loss": 1.9421,
"step": 221
},
{
"epoch": 0.03,
"grad_norm": 0.4319364449745349,
"learning_rate": 8.707424220237224e-05,
"loss": 1.9385,
"step": 222
},
{
"epoch": 0.03,
"grad_norm": 0.39127832437474,
"learning_rate": 8.706106311319373e-05,
"loss": 1.935,
"step": 223
},
{
"epoch": 0.03,
"grad_norm": 0.36405269202971846,
"learning_rate": 8.704788402401524e-05,
"loss": 1.9395,
"step": 224
},
{
"epoch": 0.03,
"grad_norm": 0.3815614792943797,
"learning_rate": 8.703470493483673e-05,
"loss": 1.9351,
"step": 225
},
{
"epoch": 0.03,
"grad_norm": 0.43828706493068725,
"learning_rate": 8.702152584565823e-05,
"loss": 1.9217,
"step": 226
},
{
"epoch": 0.03,
"grad_norm": 0.39922403213927826,
"learning_rate": 8.700834675647972e-05,
"loss": 1.9289,
"step": 227
},
{
"epoch": 0.03,
"grad_norm": 0.3844142284075385,
"learning_rate": 8.699516766730122e-05,
"loss": 1.9248,
"step": 228
},
{
"epoch": 0.03,
"grad_norm": 0.38881007783639865,
"learning_rate": 8.698198857812271e-05,
"loss": 1.9201,
"step": 229
},
{
"epoch": 0.03,
"grad_norm": 0.2975403535899065,
"learning_rate": 8.696880948894422e-05,
"loss": 1.928,
"step": 230
},
{
"epoch": 0.03,
"grad_norm": 0.2832203830943493,
"learning_rate": 8.695563039976571e-05,
"loss": 1.9162,
"step": 231
},
{
"epoch": 0.03,
"grad_norm": 0.2979487231657903,
"learning_rate": 8.69424513105872e-05,
"loss": 1.9271,
"step": 232
},
{
"epoch": 0.03,
"grad_norm": 0.3621148892850887,
"learning_rate": 8.692927222140871e-05,
"loss": 1.9156,
"step": 233
},
{
"epoch": 0.03,
"grad_norm": 0.3252924764609856,
"learning_rate": 8.69160931322302e-05,
"loss": 1.9208,
"step": 234
},
{
"epoch": 0.03,
"grad_norm": 0.38778060232379175,
"learning_rate": 8.69029140430517e-05,
"loss": 1.9137,
"step": 235
},
{
"epoch": 0.03,
"grad_norm": 0.5259448310725102,
"learning_rate": 8.688973495387319e-05,
"loss": 1.915,
"step": 236
},
{
"epoch": 0.03,
"grad_norm": 0.6748649230965568,
"learning_rate": 8.687655586469469e-05,
"loss": 1.9335,
"step": 237
},
{
"epoch": 0.03,
"grad_norm": 0.7329646137433606,
"learning_rate": 8.686337677551618e-05,
"loss": 1.921,
"step": 238
},
{
"epoch": 0.03,
"grad_norm": 0.6477306709245009,
"learning_rate": 8.685019768633769e-05,
"loss": 1.9107,
"step": 239
},
{
"epoch": 0.04,
"grad_norm": 0.4249940595660931,
"learning_rate": 8.683701859715918e-05,
"loss": 1.9074,
"step": 240
},
{
"epoch": 0.04,
"grad_norm": 0.40780487506439317,
"learning_rate": 8.682383950798068e-05,
"loss": 1.9121,
"step": 241
},
{
"epoch": 0.04,
"grad_norm": 0.5063150515112605,
"learning_rate": 8.681066041880218e-05,
"loss": 1.9122,
"step": 242
},
{
"epoch": 0.04,
"grad_norm": 0.43654436775464556,
"learning_rate": 8.679748132962368e-05,
"loss": 1.9101,
"step": 243
},
{
"epoch": 0.04,
"grad_norm": 0.28693671377727636,
"learning_rate": 8.678430224044516e-05,
"loss": 1.9071,
"step": 244
},
{
"epoch": 0.04,
"grad_norm": 0.35062666021736943,
"learning_rate": 8.677112315126666e-05,
"loss": 1.9052,
"step": 245
},
{
"epoch": 0.04,
"grad_norm": 0.4571466841247422,
"learning_rate": 8.675794406208816e-05,
"loss": 1.9037,
"step": 246
},
{
"epoch": 0.04,
"grad_norm": 0.3856445615558911,
"learning_rate": 8.674476497290965e-05,
"loss": 1.9033,
"step": 247
},
{
"epoch": 0.04,
"grad_norm": 0.3233514425302591,
"learning_rate": 8.673158588373116e-05,
"loss": 1.904,
"step": 248
},
{
"epoch": 0.04,
"grad_norm": 0.4212339592559155,
"learning_rate": 8.671840679455265e-05,
"loss": 1.9029,
"step": 249
},
{
"epoch": 0.04,
"grad_norm": 0.4275588677912957,
"learning_rate": 8.670522770537415e-05,
"loss": 1.8994,
"step": 250
},
{
"epoch": 0.04,
"grad_norm": 0.29035057566401584,
"learning_rate": 8.669204861619564e-05,
"loss": 1.8968,
"step": 251
},
{
"epoch": 0.04,
"grad_norm": 0.3292457727990474,
"learning_rate": 8.667886952701713e-05,
"loss": 1.8971,
"step": 252
},
{
"epoch": 0.04,
"grad_norm": 0.39002632439582646,
"learning_rate": 8.666569043783863e-05,
"loss": 1.9019,
"step": 253
},
{
"epoch": 0.04,
"grad_norm": 0.3058384482996071,
"learning_rate": 8.665251134866013e-05,
"loss": 1.8991,
"step": 254
},
{
"epoch": 0.04,
"grad_norm": 0.33747054066822696,
"learning_rate": 8.663933225948163e-05,
"loss": 1.8891,
"step": 255
},
{
"epoch": 0.04,
"grad_norm": 0.35931140472197654,
"learning_rate": 8.662615317030312e-05,
"loss": 1.9025,
"step": 256
},
{
"epoch": 0.04,
"grad_norm": 0.312755193621967,
"learning_rate": 8.661297408112463e-05,
"loss": 1.8904,
"step": 257
},
{
"epoch": 0.04,
"grad_norm": 0.24759800889452924,
"learning_rate": 8.659979499194612e-05,
"loss": 1.8957,
"step": 258
},
{
"epoch": 0.04,
"grad_norm": 0.23365570528449237,
"learning_rate": 8.658661590276762e-05,
"loss": 1.8895,
"step": 259
},
{
"epoch": 0.04,
"grad_norm": 0.26012031059809776,
"learning_rate": 8.657343681358911e-05,
"loss": 1.874,
"step": 260
},
{
"epoch": 0.04,
"grad_norm": 0.2369419439744028,
"learning_rate": 8.65602577244106e-05,
"loss": 1.8888,
"step": 261
},
{
"epoch": 0.04,
"grad_norm": 0.23737605818719734,
"learning_rate": 8.65470786352321e-05,
"loss": 1.892,
"step": 262
},
{
"epoch": 0.04,
"grad_norm": 0.20560803394813162,
"learning_rate": 8.65338995460536e-05,
"loss": 1.8902,
"step": 263
},
{
"epoch": 0.04,
"grad_norm": 0.23756980446551546,
"learning_rate": 8.65207204568751e-05,
"loss": 1.8914,
"step": 264
},
{
"epoch": 0.04,
"grad_norm": 0.2955355993564412,
"learning_rate": 8.650754136769659e-05,
"loss": 1.8741,
"step": 265
},
{
"epoch": 0.04,
"grad_norm": 0.36625312284796796,
"learning_rate": 8.649436227851809e-05,
"loss": 1.8849,
"step": 266
},
{
"epoch": 0.04,
"grad_norm": 0.4377187791569674,
"learning_rate": 8.64811831893396e-05,
"loss": 1.875,
"step": 267
},
{
"epoch": 0.04,
"grad_norm": 0.4751457670664482,
"learning_rate": 8.646800410016107e-05,
"loss": 1.887,
"step": 268
},
{
"epoch": 0.04,
"grad_norm": 0.5124162062393218,
"learning_rate": 8.645482501098258e-05,
"loss": 1.8759,
"step": 269
},
{
"epoch": 0.04,
"grad_norm": 0.5128407598622339,
"learning_rate": 8.644164592180407e-05,
"loss": 1.884,
"step": 270
},
{
"epoch": 0.04,
"grad_norm": 0.4645016436292238,
"learning_rate": 8.642846683262557e-05,
"loss": 1.8748,
"step": 271
},
{
"epoch": 0.04,
"grad_norm": 0.42963082194049396,
"learning_rate": 8.641528774344708e-05,
"loss": 1.8725,
"step": 272
},
{
"epoch": 0.04,
"grad_norm": 0.41218244280637906,
"learning_rate": 8.640210865426857e-05,
"loss": 1.8791,
"step": 273
},
{
"epoch": 0.04,
"grad_norm": 0.34726276152467556,
"learning_rate": 8.638892956509006e-05,
"loss": 1.8742,
"step": 274
},
{
"epoch": 0.04,
"grad_norm": 0.328823655826036,
"learning_rate": 8.637575047591156e-05,
"loss": 1.8721,
"step": 275
},
{
"epoch": 0.04,
"grad_norm": 0.3686461464930126,
"learning_rate": 8.636257138673305e-05,
"loss": 1.8687,
"step": 276
},
{
"epoch": 0.04,
"grad_norm": 0.3656673874029743,
"learning_rate": 8.634939229755454e-05,
"loss": 1.8654,
"step": 277
},
{
"epoch": 0.04,
"grad_norm": 0.30525439673036175,
"learning_rate": 8.633621320837605e-05,
"loss": 1.8746,
"step": 278
},
{
"epoch": 0.04,
"grad_norm": 0.3007226581860427,
"learning_rate": 8.632303411919755e-05,
"loss": 1.8664,
"step": 279
},
{
"epoch": 0.04,
"grad_norm": 0.3849279004150829,
"learning_rate": 8.630985503001904e-05,
"loss": 1.8717,
"step": 280
},
{
"epoch": 0.04,
"grad_norm": 0.43777228984264327,
"learning_rate": 8.629667594084055e-05,
"loss": 1.8776,
"step": 281
},
{
"epoch": 0.04,
"grad_norm": 0.4220356221423105,
"learning_rate": 8.628349685166204e-05,
"loss": 1.8705,
"step": 282
},
{
"epoch": 0.04,
"grad_norm": 0.4049297705769528,
"learning_rate": 8.627031776248353e-05,
"loss": 1.8702,
"step": 283
},
{
"epoch": 0.04,
"grad_norm": 0.3919834421059084,
"learning_rate": 8.625713867330503e-05,
"loss": 1.8672,
"step": 284
},
{
"epoch": 0.04,
"grad_norm": 0.3713089483320666,
"learning_rate": 8.624395958412652e-05,
"loss": 1.8626,
"step": 285
},
{
"epoch": 0.04,
"grad_norm": 0.3604341363789889,
"learning_rate": 8.623078049494802e-05,
"loss": 1.8724,
"step": 286
},
{
"epoch": 0.04,
"grad_norm": 0.39717353573166714,
"learning_rate": 8.621760140576952e-05,
"loss": 1.8646,
"step": 287
},
{
"epoch": 0.04,
"grad_norm": 0.44719078220357295,
"learning_rate": 8.620442231659102e-05,
"loss": 1.8582,
"step": 288
},
{
"epoch": 0.04,
"grad_norm": 0.4513680274672226,
"learning_rate": 8.619124322741251e-05,
"loss": 1.8696,
"step": 289
},
{
"epoch": 0.04,
"grad_norm": 0.43023542467375564,
"learning_rate": 8.6178064138234e-05,
"loss": 1.8699,
"step": 290
},
{
"epoch": 0.04,
"grad_norm": 0.43858266418234654,
"learning_rate": 8.616488504905551e-05,
"loss": 1.86,
"step": 291
},
{
"epoch": 0.04,
"grad_norm": 0.5064425360850751,
"learning_rate": 8.615170595987699e-05,
"loss": 1.858,
"step": 292
},
{
"epoch": 0.04,
"grad_norm": 0.48407682623823045,
"learning_rate": 8.61385268706985e-05,
"loss": 1.853,
"step": 293
},
{
"epoch": 0.04,
"grad_norm": 0.46114150689754957,
"learning_rate": 8.612534778151999e-05,
"loss": 1.8598,
"step": 294
},
{
"epoch": 0.04,
"grad_norm": 0.44714939482574123,
"learning_rate": 8.611216869234149e-05,
"loss": 1.8571,
"step": 295
},
{
"epoch": 0.04,
"grad_norm": 0.29499856062245083,
"learning_rate": 8.6098989603163e-05,
"loss": 1.85,
"step": 296
},
{
"epoch": 0.04,
"grad_norm": 0.27237302520628326,
"learning_rate": 8.608581051398449e-05,
"loss": 1.8633,
"step": 297
},
{
"epoch": 0.04,
"grad_norm": 0.3852560547170149,
"learning_rate": 8.607263142480598e-05,
"loss": 1.84,
"step": 298
},
{
"epoch": 0.04,
"grad_norm": 0.4338967169702793,
"learning_rate": 8.605945233562747e-05,
"loss": 1.86,
"step": 299
},
{
"epoch": 0.04,
"grad_norm": 0.3952031517559691,
"learning_rate": 8.604627324644897e-05,
"loss": 1.8473,
"step": 300
},
{
"epoch": 0.04,
"grad_norm": 0.25514061111362424,
"learning_rate": 8.603309415727046e-05,
"loss": 1.8464,
"step": 301
},
{
"epoch": 0.04,
"grad_norm": 0.3078510816972887,
"learning_rate": 8.601991506809197e-05,
"loss": 1.8518,
"step": 302
},
{
"epoch": 0.04,
"grad_norm": 0.38406564873200627,
"learning_rate": 8.600673597891346e-05,
"loss": 1.8453,
"step": 303
},
{
"epoch": 0.04,
"grad_norm": 0.29032703723109443,
"learning_rate": 8.599355688973496e-05,
"loss": 1.8426,
"step": 304
},
{
"epoch": 0.04,
"grad_norm": 0.21551221995312625,
"learning_rate": 8.598037780055646e-05,
"loss": 1.8407,
"step": 305
},
{
"epoch": 0.04,
"grad_norm": 0.25104013568498373,
"learning_rate": 8.596719871137796e-05,
"loss": 1.8412,
"step": 306
},
{
"epoch": 0.04,
"grad_norm": 0.2753448788479235,
"learning_rate": 8.595401962219945e-05,
"loss": 1.85,
"step": 307
},
{
"epoch": 0.05,
"grad_norm": 0.28287628219754707,
"learning_rate": 8.594084053302095e-05,
"loss": 1.8393,
"step": 308
},
{
"epoch": 0.05,
"grad_norm": 0.29153292982905893,
"learning_rate": 8.592766144384244e-05,
"loss": 1.8436,
"step": 309
},
{
"epoch": 0.05,
"grad_norm": 0.3356468622059691,
"learning_rate": 8.591448235466393e-05,
"loss": 1.8501,
"step": 310
},
{
"epoch": 0.05,
"grad_norm": 0.3668022644305236,
"learning_rate": 8.590130326548544e-05,
"loss": 1.8442,
"step": 311
},
{
"epoch": 0.05,
"grad_norm": 0.35131341619593265,
"learning_rate": 8.588812417630693e-05,
"loss": 1.8385,
"step": 312
},
{
"epoch": 0.05,
"grad_norm": 0.32173508052581773,
"learning_rate": 8.587494508712843e-05,
"loss": 1.8436,
"step": 313
},
{
"epoch": 0.05,
"grad_norm": 0.300778040243218,
"learning_rate": 8.586176599794992e-05,
"loss": 1.8382,
"step": 314
},
{
"epoch": 0.05,
"grad_norm": 0.30861066996157027,
"learning_rate": 8.584858690877143e-05,
"loss": 1.8351,
"step": 315
},
{
"epoch": 0.05,
"grad_norm": 0.34553615609293836,
"learning_rate": 8.583540781959291e-05,
"loss": 1.8326,
"step": 316
},
{
"epoch": 0.05,
"grad_norm": 0.3945637083333217,
"learning_rate": 8.582222873041442e-05,
"loss": 1.8439,
"step": 317
},
{
"epoch": 0.05,
"grad_norm": 0.4185609027500278,
"learning_rate": 8.580904964123591e-05,
"loss": 1.836,
"step": 318
},
{
"epoch": 0.05,
"grad_norm": 0.4737109913412694,
"learning_rate": 8.57958705520574e-05,
"loss": 1.8349,
"step": 319
},
{
"epoch": 0.05,
"grad_norm": 0.542744291036962,
"learning_rate": 8.578269146287891e-05,
"loss": 1.8369,
"step": 320
},
{
"epoch": 0.05,
"grad_norm": 0.5698184063084959,
"learning_rate": 8.57695123737004e-05,
"loss": 1.8364,
"step": 321
},
{
"epoch": 0.05,
"grad_norm": 0.5216244389865377,
"learning_rate": 8.57563332845219e-05,
"loss": 1.8391,
"step": 322
},
{
"epoch": 0.05,
"grad_norm": 0.4349460582065712,
"learning_rate": 8.574315419534339e-05,
"loss": 1.8316,
"step": 323
},
{
"epoch": 0.05,
"grad_norm": 0.4060439023599905,
"learning_rate": 8.572997510616489e-05,
"loss": 1.829,
"step": 324
},
{
"epoch": 0.05,
"grad_norm": 0.4006651254790203,
"learning_rate": 8.571679601698638e-05,
"loss": 1.8289,
"step": 325
},
{
"epoch": 0.05,
"grad_norm": 0.35982668295004727,
"learning_rate": 8.570361692780789e-05,
"loss": 1.8345,
"step": 326
},
{
"epoch": 0.05,
"grad_norm": 0.34270943020191247,
"learning_rate": 8.569043783862938e-05,
"loss": 1.8244,
"step": 327
},
{
"epoch": 0.05,
"grad_norm": 0.35208015052905844,
"learning_rate": 8.567725874945087e-05,
"loss": 1.8395,
"step": 328
},
{
"epoch": 0.05,
"grad_norm": 0.3560007421719364,
"learning_rate": 8.566407966027238e-05,
"loss": 1.8244,
"step": 329
},
{
"epoch": 0.05,
"grad_norm": 0.36632340017442266,
"learning_rate": 8.565090057109388e-05,
"loss": 1.8225,
"step": 330
},
{
"epoch": 0.05,
"grad_norm": 0.33466423040979987,
"learning_rate": 8.563772148191537e-05,
"loss": 1.829,
"step": 331
},
{
"epoch": 0.05,
"grad_norm": 0.2716882763297445,
"learning_rate": 8.562454239273686e-05,
"loss": 1.8341,
"step": 332
},
{
"epoch": 0.05,
"grad_norm": 0.26453069945784297,
"learning_rate": 8.561136330355836e-05,
"loss": 1.834,
"step": 333
},
{
"epoch": 0.05,
"grad_norm": 0.2678715963585482,
"learning_rate": 8.559818421437985e-05,
"loss": 1.8246,
"step": 334
},
{
"epoch": 0.05,
"grad_norm": 0.28963443783737886,
"learning_rate": 8.558500512520136e-05,
"loss": 1.8217,
"step": 335
},
{
"epoch": 0.05,
"grad_norm": 0.2799148037129182,
"learning_rate": 8.557182603602285e-05,
"loss": 1.8232,
"step": 336
},
{
"epoch": 0.05,
"grad_norm": 0.25449424333816384,
"learning_rate": 8.555864694684435e-05,
"loss": 1.8211,
"step": 337
},
{
"epoch": 0.05,
"grad_norm": 0.29873070853220085,
"learning_rate": 8.554546785766584e-05,
"loss": 1.8218,
"step": 338
},
{
"epoch": 0.05,
"grad_norm": 0.31256528253176236,
"learning_rate": 8.553228876848735e-05,
"loss": 1.8184,
"step": 339
},
{
"epoch": 0.05,
"grad_norm": 0.2579289256497001,
"learning_rate": 8.551910967930883e-05,
"loss": 1.8285,
"step": 340
},
{
"epoch": 0.05,
"grad_norm": 0.2692223001749751,
"learning_rate": 8.550593059013033e-05,
"loss": 1.8199,
"step": 341
},
{
"epoch": 0.05,
"grad_norm": 0.2828285551419333,
"learning_rate": 8.549275150095183e-05,
"loss": 1.812,
"step": 342
},
{
"epoch": 0.05,
"grad_norm": 0.25760648063355074,
"learning_rate": 8.547957241177332e-05,
"loss": 1.8196,
"step": 343
},
{
"epoch": 0.05,
"grad_norm": 0.3229138587412795,
"learning_rate": 8.546639332259483e-05,
"loss": 1.8168,
"step": 344
},
{
"epoch": 0.05,
"grad_norm": 0.32584515922858565,
"learning_rate": 8.545321423341632e-05,
"loss": 1.821,
"step": 345
},
{
"epoch": 0.05,
"grad_norm": 0.2785648276363959,
"learning_rate": 8.544003514423782e-05,
"loss": 1.8226,
"step": 346
},
{
"epoch": 0.05,
"grad_norm": 0.32332023531594767,
"learning_rate": 8.542685605505931e-05,
"loss": 1.8067,
"step": 347
},
{
"epoch": 0.05,
"grad_norm": 0.30635180846369325,
"learning_rate": 8.54136769658808e-05,
"loss": 1.8132,
"step": 348
},
{
"epoch": 0.05,
"grad_norm": 0.29630729064298555,
"learning_rate": 8.54004978767023e-05,
"loss": 1.8101,
"step": 349
},
{
"epoch": 0.05,
"grad_norm": 0.34621816808581285,
"learning_rate": 8.53873187875238e-05,
"loss": 1.8176,
"step": 350
},
{
"epoch": 0.05,
"grad_norm": 0.37813665939757496,
"learning_rate": 8.53741396983453e-05,
"loss": 1.8118,
"step": 351
},
{
"epoch": 0.05,
"grad_norm": 0.4024072071879195,
"learning_rate": 8.536096060916679e-05,
"loss": 1.8151,
"step": 352
},
{
"epoch": 0.05,
"grad_norm": 0.4804782375915566,
"learning_rate": 8.534778151998829e-05,
"loss": 1.8163,
"step": 353
},
{
"epoch": 0.05,
"grad_norm": 0.5127084558984112,
"learning_rate": 8.533460243080979e-05,
"loss": 1.811,
"step": 354
},
{
"epoch": 0.05,
"grad_norm": 0.4388610896297329,
"learning_rate": 8.532142334163129e-05,
"loss": 1.8137,
"step": 355
},
{
"epoch": 0.05,
"grad_norm": 0.3221135161434642,
"learning_rate": 8.530824425245278e-05,
"loss": 1.7968,
"step": 356
},
{
"epoch": 0.05,
"grad_norm": 0.340563157991881,
"learning_rate": 8.529506516327427e-05,
"loss": 1.8103,
"step": 357
},
{
"epoch": 0.05,
"grad_norm": 0.39991989603879213,
"learning_rate": 8.528188607409577e-05,
"loss": 1.8087,
"step": 358
},
{
"epoch": 0.05,
"grad_norm": 0.4120407943765869,
"learning_rate": 8.526870698491728e-05,
"loss": 1.805,
"step": 359
},
{
"epoch": 0.05,
"grad_norm": 0.34596624387275093,
"learning_rate": 8.525552789573877e-05,
"loss": 1.8035,
"step": 360
},
{
"epoch": 0.05,
"grad_norm": 0.25789497090466024,
"learning_rate": 8.524234880656026e-05,
"loss": 1.8058,
"step": 361
},
{
"epoch": 0.05,
"grad_norm": 0.30263266008859196,
"learning_rate": 8.522916971738176e-05,
"loss": 1.8084,
"step": 362
},
{
"epoch": 0.05,
"grad_norm": 0.3562148440043474,
"learning_rate": 8.521599062820326e-05,
"loss": 1.799,
"step": 363
},
{
"epoch": 0.05,
"grad_norm": 0.30587484606473564,
"learning_rate": 8.520281153902474e-05,
"loss": 1.8003,
"step": 364
},
{
"epoch": 0.05,
"grad_norm": 0.27973522082893804,
"learning_rate": 8.518963244984625e-05,
"loss": 1.8153,
"step": 365
},
{
"epoch": 0.05,
"grad_norm": 0.28180525427229874,
"learning_rate": 8.517645336066775e-05,
"loss": 1.7987,
"step": 366
},
{
"epoch": 0.05,
"grad_norm": 0.24578221711398116,
"learning_rate": 8.516327427148924e-05,
"loss": 1.8116,
"step": 367
},
{
"epoch": 0.05,
"grad_norm": 0.265330003846191,
"learning_rate": 8.515009518231075e-05,
"loss": 1.8034,
"step": 368
},
{
"epoch": 0.05,
"grad_norm": 0.2777372207789329,
"learning_rate": 8.513691609313224e-05,
"loss": 1.793,
"step": 369
},
{
"epoch": 0.05,
"grad_norm": 0.22568033781908964,
"learning_rate": 8.512373700395373e-05,
"loss": 1.8029,
"step": 370
},
{
"epoch": 0.05,
"grad_norm": 0.25601857440834863,
"learning_rate": 8.511055791477523e-05,
"loss": 1.7958,
"step": 371
},
{
"epoch": 0.05,
"grad_norm": 0.31701347224825027,
"learning_rate": 8.509737882559672e-05,
"loss": 1.8006,
"step": 372
},
{
"epoch": 0.05,
"grad_norm": 0.2911487871418614,
"learning_rate": 8.508419973641821e-05,
"loss": 1.7955,
"step": 373
},
{
"epoch": 0.05,
"grad_norm": 0.2082178563597066,
"learning_rate": 8.507102064723972e-05,
"loss": 1.797,
"step": 374
},
{
"epoch": 0.05,
"grad_norm": 0.25022339272015853,
"learning_rate": 8.505784155806122e-05,
"loss": 1.7918,
"step": 375
},
{
"epoch": 0.06,
"grad_norm": 0.2857289749098349,
"learning_rate": 8.504466246888271e-05,
"loss": 1.7919,
"step": 376
},
{
"epoch": 0.06,
"grad_norm": 0.2982765315572785,
"learning_rate": 8.50314833797042e-05,
"loss": 1.7945,
"step": 377
},
{
"epoch": 0.06,
"grad_norm": 0.3632832111118879,
"learning_rate": 8.501830429052571e-05,
"loss": 1.7891,
"step": 378
},
{
"epoch": 0.06,
"grad_norm": 0.4582525250106023,
"learning_rate": 8.50051252013472e-05,
"loss": 1.7909,
"step": 379
},
{
"epoch": 0.06,
"grad_norm": 0.49699840172624643,
"learning_rate": 8.49919461121687e-05,
"loss": 1.7971,
"step": 380
},
{
"epoch": 0.06,
"grad_norm": 0.4504572316819271,
"learning_rate": 8.497876702299019e-05,
"loss": 1.8089,
"step": 381
},
{
"epoch": 0.06,
"grad_norm": 0.3849459017582572,
"learning_rate": 8.496558793381169e-05,
"loss": 1.8009,
"step": 382
},
{
"epoch": 0.06,
"grad_norm": 0.3899357827016154,
"learning_rate": 8.495240884463319e-05,
"loss": 1.7915,
"step": 383
},
{
"epoch": 0.06,
"grad_norm": 0.3914658024096451,
"learning_rate": 8.493922975545469e-05,
"loss": 1.7883,
"step": 384
},
{
"epoch": 0.06,
"grad_norm": 0.3709299103461753,
"learning_rate": 8.492605066627618e-05,
"loss": 1.7951,
"step": 385
},
{
"epoch": 0.06,
"grad_norm": 0.3506925098949618,
"learning_rate": 8.491287157709767e-05,
"loss": 1.7889,
"step": 386
},
{
"epoch": 0.06,
"grad_norm": 0.29624563115275554,
"learning_rate": 8.489969248791918e-05,
"loss": 1.7831,
"step": 387
},
{
"epoch": 0.06,
"grad_norm": 0.25682264450924897,
"learning_rate": 8.488651339874066e-05,
"loss": 1.7874,
"step": 388
},
{
"epoch": 0.06,
"grad_norm": 0.2460261750961129,
"learning_rate": 8.487333430956217e-05,
"loss": 1.7828,
"step": 389
},
{
"epoch": 0.06,
"grad_norm": 0.24758082633885842,
"learning_rate": 8.486015522038366e-05,
"loss": 1.7794,
"step": 390
},
{
"epoch": 0.06,
"grad_norm": 0.33054734116446843,
"learning_rate": 8.484697613120516e-05,
"loss": 1.7818,
"step": 391
},
{
"epoch": 0.06,
"grad_norm": 0.4561811736514614,
"learning_rate": 8.483379704202666e-05,
"loss": 1.7922,
"step": 392
},
{
"epoch": 0.06,
"grad_norm": 0.49921174238421934,
"learning_rate": 8.482061795284816e-05,
"loss": 1.7798,
"step": 393
},
{
"epoch": 0.06,
"grad_norm": 0.4554136076143871,
"learning_rate": 8.480743886366965e-05,
"loss": 1.7822,
"step": 394
},
{
"epoch": 0.06,
"grad_norm": 0.41157410101274916,
"learning_rate": 8.479425977449115e-05,
"loss": 1.7798,
"step": 395
},
{
"epoch": 0.06,
"grad_norm": 0.40589712484793644,
"learning_rate": 8.478108068531264e-05,
"loss": 1.7865,
"step": 396
},
{
"epoch": 0.06,
"grad_norm": 0.41969602312949894,
"learning_rate": 8.476790159613413e-05,
"loss": 1.7846,
"step": 397
},
{
"epoch": 0.06,
"grad_norm": 0.3716209700804159,
"learning_rate": 8.475472250695564e-05,
"loss": 1.7831,
"step": 398
},
{
"epoch": 0.06,
"grad_norm": 0.2807622431725275,
"learning_rate": 8.474154341777713e-05,
"loss": 1.7785,
"step": 399
},
{
"epoch": 0.06,
"grad_norm": 0.2882135171870514,
"learning_rate": 8.472836432859863e-05,
"loss": 1.7955,
"step": 400
},
{
"epoch": 0.06,
"grad_norm": 0.33929519699606947,
"learning_rate": 8.471518523942012e-05,
"loss": 1.7821,
"step": 401
},
{
"epoch": 0.06,
"grad_norm": 0.36786610502124734,
"learning_rate": 8.470200615024163e-05,
"loss": 1.776,
"step": 402
},
{
"epoch": 0.06,
"grad_norm": 0.3364252349308489,
"learning_rate": 8.468882706106312e-05,
"loss": 1.7886,
"step": 403
},
{
"epoch": 0.06,
"grad_norm": 0.2402932781772499,
"learning_rate": 8.467564797188462e-05,
"loss": 1.7841,
"step": 404
},
{
"epoch": 0.06,
"grad_norm": 0.25361231156390895,
"learning_rate": 8.466246888270611e-05,
"loss": 1.7807,
"step": 405
},
{
"epoch": 0.06,
"grad_norm": 0.2615629884646646,
"learning_rate": 8.46492897935276e-05,
"loss": 1.7791,
"step": 406
},
{
"epoch": 0.06,
"grad_norm": 0.28499785308119957,
"learning_rate": 8.463611070434911e-05,
"loss": 1.7755,
"step": 407
},
{
"epoch": 0.06,
"grad_norm": 0.3356974383461246,
"learning_rate": 8.46229316151706e-05,
"loss": 1.7711,
"step": 408
},
{
"epoch": 0.06,
"grad_norm": 0.30749102123781535,
"learning_rate": 8.46097525259921e-05,
"loss": 1.7825,
"step": 409
},
{
"epoch": 0.06,
"grad_norm": 0.2672389622630441,
"learning_rate": 8.459657343681359e-05,
"loss": 1.7726,
"step": 410
},
{
"epoch": 0.06,
"grad_norm": 0.2658653893968794,
"learning_rate": 8.45833943476351e-05,
"loss": 1.7919,
"step": 411
},
{
"epoch": 0.06,
"grad_norm": 0.28816762727415146,
"learning_rate": 8.457021525845658e-05,
"loss": 1.774,
"step": 412
},
{
"epoch": 0.06,
"grad_norm": 0.3546252508516257,
"learning_rate": 8.455703616927809e-05,
"loss": 1.7764,
"step": 413
},
{
"epoch": 0.06,
"grad_norm": 0.3982229064694483,
"learning_rate": 8.454385708009958e-05,
"loss": 1.7797,
"step": 414
},
{
"epoch": 0.06,
"grad_norm": 0.39743403855878273,
"learning_rate": 8.453067799092107e-05,
"loss": 1.7818,
"step": 415
},
{
"epoch": 0.06,
"grad_norm": 0.3565095947734143,
"learning_rate": 8.451749890174258e-05,
"loss": 1.7689,
"step": 416
},
{
"epoch": 0.06,
"grad_norm": 0.33445086643446986,
"learning_rate": 8.450431981256408e-05,
"loss": 1.7703,
"step": 417
},
{
"epoch": 0.06,
"grad_norm": 0.3315820482846953,
"learning_rate": 8.449114072338557e-05,
"loss": 1.7695,
"step": 418
},
{
"epoch": 0.06,
"grad_norm": 0.28800691098886155,
"learning_rate": 8.447796163420706e-05,
"loss": 1.7735,
"step": 419
},
{
"epoch": 0.06,
"grad_norm": 0.2801123597101435,
"learning_rate": 8.446478254502856e-05,
"loss": 1.7727,
"step": 420
},
{
"epoch": 0.06,
"grad_norm": 0.3002418032943439,
"learning_rate": 8.445160345585005e-05,
"loss": 1.7694,
"step": 421
},
{
"epoch": 0.06,
"grad_norm": 0.3105457712936389,
"learning_rate": 8.443842436667156e-05,
"loss": 1.7743,
"step": 422
},
{
"epoch": 0.06,
"grad_norm": 0.31681589803189375,
"learning_rate": 8.442524527749305e-05,
"loss": 1.7754,
"step": 423
},
{
"epoch": 0.06,
"grad_norm": 0.29540052374128717,
"learning_rate": 8.441206618831454e-05,
"loss": 1.7676,
"step": 424
},
{
"epoch": 0.06,
"grad_norm": 0.2786507071219561,
"learning_rate": 8.439888709913604e-05,
"loss": 1.7768,
"step": 425
},
{
"epoch": 0.06,
"grad_norm": 0.2882558823750487,
"learning_rate": 8.438570800995755e-05,
"loss": 1.7559,
"step": 426
},
{
"epoch": 0.06,
"grad_norm": 0.3231073755842465,
"learning_rate": 8.437252892077904e-05,
"loss": 1.7718,
"step": 427
},
{
"epoch": 0.06,
"grad_norm": 0.3009918727669168,
"learning_rate": 8.435934983160053e-05,
"loss": 1.7558,
"step": 428
},
{
"epoch": 0.06,
"grad_norm": 0.2628726596154197,
"learning_rate": 8.434617074242203e-05,
"loss": 1.7592,
"step": 429
},
{
"epoch": 0.06,
"grad_norm": 0.28399571790394734,
"learning_rate": 8.433299165324352e-05,
"loss": 1.7671,
"step": 430
},
{
"epoch": 0.06,
"grad_norm": 0.25544475849265763,
"learning_rate": 8.431981256406503e-05,
"loss": 1.7663,
"step": 431
},
{
"epoch": 0.06,
"grad_norm": 0.23553288100790656,
"learning_rate": 8.430663347488652e-05,
"loss": 1.7739,
"step": 432
},
{
"epoch": 0.06,
"grad_norm": 0.2775059913303605,
"learning_rate": 8.429345438570802e-05,
"loss": 1.7679,
"step": 433
},
{
"epoch": 0.06,
"grad_norm": 0.3075973810425004,
"learning_rate": 8.428027529652951e-05,
"loss": 1.7615,
"step": 434
},
{
"epoch": 0.06,
"grad_norm": 0.2930374223294978,
"learning_rate": 8.426709620735102e-05,
"loss": 1.7666,
"step": 435
},
{
"epoch": 0.06,
"grad_norm": 0.2579433797280727,
"learning_rate": 8.42539171181725e-05,
"loss": 1.767,
"step": 436
},
{
"epoch": 0.06,
"grad_norm": 0.24664576276610475,
"learning_rate": 8.4240738028994e-05,
"loss": 1.7673,
"step": 437
},
{
"epoch": 0.06,
"grad_norm": 0.2369430249499012,
"learning_rate": 8.42275589398155e-05,
"loss": 1.7651,
"step": 438
},
{
"epoch": 0.06,
"grad_norm": 0.22260512529991971,
"learning_rate": 8.421437985063699e-05,
"loss": 1.7656,
"step": 439
},
{
"epoch": 0.06,
"grad_norm": 0.2500300140196859,
"learning_rate": 8.420120076145849e-05,
"loss": 1.7563,
"step": 440
},
{
"epoch": 0.06,
"grad_norm": 0.2973553354592089,
"learning_rate": 8.418802167227999e-05,
"loss": 1.7686,
"step": 441
},
{
"epoch": 0.06,
"grad_norm": 0.3473837490426601,
"learning_rate": 8.417484258310149e-05,
"loss": 1.766,
"step": 442
},
{
"epoch": 0.06,
"grad_norm": 0.37681005189169203,
"learning_rate": 8.416166349392298e-05,
"loss": 1.7644,
"step": 443
},
{
"epoch": 0.07,
"grad_norm": 0.43730723219972917,
"learning_rate": 8.414848440474447e-05,
"loss": 1.755,
"step": 444
},
{
"epoch": 0.07,
"grad_norm": 0.48972509251420343,
"learning_rate": 8.413530531556597e-05,
"loss": 1.7574,
"step": 445
},
{
"epoch": 0.07,
"grad_norm": 0.4405993959888919,
"learning_rate": 8.412212622638747e-05,
"loss": 1.7486,
"step": 446
},
{
"epoch": 0.07,
"grad_norm": 0.3363658300532759,
"learning_rate": 8.410894713720897e-05,
"loss": 1.7578,
"step": 447
},
{
"epoch": 0.07,
"grad_norm": 0.2582139601078949,
"learning_rate": 8.409576804803046e-05,
"loss": 1.7612,
"step": 448
},
{
"epoch": 0.07,
"grad_norm": 0.2811740539218468,
"learning_rate": 8.408258895885196e-05,
"loss": 1.7597,
"step": 449
},
{
"epoch": 0.07,
"grad_norm": 0.2754375286092553,
"learning_rate": 8.406940986967346e-05,
"loss": 1.7618,
"step": 450
},
{
"epoch": 0.07,
"grad_norm": 0.2708471785317565,
"learning_rate": 8.405623078049496e-05,
"loss": 1.7566,
"step": 451
},
{
"epoch": 0.07,
"grad_norm": 0.27721812968333487,
"learning_rate": 8.404305169131645e-05,
"loss": 1.7546,
"step": 452
},
{
"epoch": 0.07,
"grad_norm": 0.23002196109659534,
"learning_rate": 8.402987260213794e-05,
"loss": 1.7563,
"step": 453
},
{
"epoch": 0.07,
"grad_norm": 0.26623499924803207,
"learning_rate": 8.401669351295944e-05,
"loss": 1.7632,
"step": 454
},
{
"epoch": 0.07,
"grad_norm": 0.31058712725330645,
"learning_rate": 8.400351442378095e-05,
"loss": 1.7554,
"step": 455
},
{
"epoch": 0.07,
"grad_norm": 0.2565651528940601,
"learning_rate": 8.399033533460244e-05,
"loss": 1.7495,
"step": 456
},
{
"epoch": 0.07,
"grad_norm": 0.22030881745923991,
"learning_rate": 8.397715624542393e-05,
"loss": 1.7516,
"step": 457
},
{
"epoch": 0.07,
"grad_norm": 0.22786327950251026,
"learning_rate": 8.396397715624543e-05,
"loss": 1.7545,
"step": 458
},
{
"epoch": 0.07,
"grad_norm": 0.22992085421490108,
"learning_rate": 8.395079806706693e-05,
"loss": 1.7524,
"step": 459
},
{
"epoch": 0.07,
"grad_norm": 0.2745377165784387,
"learning_rate": 8.393761897788841e-05,
"loss": 1.757,
"step": 460
},
{
"epoch": 0.07,
"grad_norm": 0.27034688441148863,
"learning_rate": 8.392443988870992e-05,
"loss": 1.7514,
"step": 461
},
{
"epoch": 0.07,
"grad_norm": 0.21431031046245713,
"learning_rate": 8.391126079953142e-05,
"loss": 1.7516,
"step": 462
},
{
"epoch": 0.07,
"grad_norm": 0.24404100887840613,
"learning_rate": 8.389808171035291e-05,
"loss": 1.7446,
"step": 463
},
{
"epoch": 0.07,
"grad_norm": 0.25123054377636167,
"learning_rate": 8.38849026211744e-05,
"loss": 1.7375,
"step": 464
},
{
"epoch": 0.07,
"grad_norm": 0.22147497464193822,
"learning_rate": 8.387172353199591e-05,
"loss": 1.7531,
"step": 465
},
{
"epoch": 0.07,
"grad_norm": 0.2719165207073828,
"learning_rate": 8.38585444428174e-05,
"loss": 1.7427,
"step": 466
},
{
"epoch": 0.07,
"grad_norm": 0.3072701977020835,
"learning_rate": 8.38453653536389e-05,
"loss": 1.7554,
"step": 467
},
{
"epoch": 0.07,
"grad_norm": 0.3465803271567775,
"learning_rate": 8.383218626446039e-05,
"loss": 1.7362,
"step": 468
},
{
"epoch": 0.07,
"grad_norm": 0.401606701101308,
"learning_rate": 8.381900717528189e-05,
"loss": 1.7487,
"step": 469
},
{
"epoch": 0.07,
"grad_norm": 0.3606552145593931,
"learning_rate": 8.380582808610339e-05,
"loss": 1.7401,
"step": 470
},
{
"epoch": 0.07,
"grad_norm": 0.31305392679932725,
"learning_rate": 8.379264899692489e-05,
"loss": 1.7426,
"step": 471
},
{
"epoch": 0.07,
"grad_norm": 0.3283966913328143,
"learning_rate": 8.377946990774638e-05,
"loss": 1.745,
"step": 472
},
{
"epoch": 0.07,
"grad_norm": 0.29719539854668975,
"learning_rate": 8.376629081856787e-05,
"loss": 1.7479,
"step": 473
},
{
"epoch": 0.07,
"grad_norm": 0.2858720938289175,
"learning_rate": 8.375311172938938e-05,
"loss": 1.7471,
"step": 474
},
{
"epoch": 0.07,
"grad_norm": 0.30869366396700376,
"learning_rate": 8.373993264021087e-05,
"loss": 1.7426,
"step": 475
},
{
"epoch": 0.07,
"grad_norm": 0.2970917996723054,
"learning_rate": 8.372675355103237e-05,
"loss": 1.748,
"step": 476
},
{
"epoch": 0.07,
"grad_norm": 0.31796639499576057,
"learning_rate": 8.371357446185386e-05,
"loss": 1.7482,
"step": 477
},
{
"epoch": 0.07,
"grad_norm": 0.3755211025212181,
"learning_rate": 8.370039537267536e-05,
"loss": 1.7441,
"step": 478
},
{
"epoch": 0.07,
"grad_norm": 0.4281842470615676,
"learning_rate": 8.368721628349686e-05,
"loss": 1.7413,
"step": 479
},
{
"epoch": 0.07,
"grad_norm": 0.4094549002505819,
"learning_rate": 8.367403719431836e-05,
"loss": 1.7509,
"step": 480
},
{
"epoch": 0.07,
"grad_norm": 0.3581404361289346,
"learning_rate": 8.366085810513985e-05,
"loss": 1.7416,
"step": 481
},
{
"epoch": 0.07,
"grad_norm": 0.3304524068236931,
"learning_rate": 8.364767901596134e-05,
"loss": 1.7444,
"step": 482
},
{
"epoch": 0.07,
"grad_norm": 0.35489265098811323,
"learning_rate": 8.363449992678285e-05,
"loss": 1.7506,
"step": 483
},
{
"epoch": 0.07,
"grad_norm": 0.3714282179236144,
"learning_rate": 8.362132083760433e-05,
"loss": 1.745,
"step": 484
},
{
"epoch": 0.07,
"grad_norm": 0.3586675337061257,
"learning_rate": 8.360814174842584e-05,
"loss": 1.7425,
"step": 485
},
{
"epoch": 0.07,
"grad_norm": 0.2345396435881697,
"learning_rate": 8.359496265924733e-05,
"loss": 1.7367,
"step": 486
},
{
"epoch": 0.07,
"grad_norm": 0.2053682387601687,
"learning_rate": 8.358178357006883e-05,
"loss": 1.7386,
"step": 487
},
{
"epoch": 0.07,
"grad_norm": 0.2711812966223324,
"learning_rate": 8.356860448089032e-05,
"loss": 1.7337,
"step": 488
},
{
"epoch": 0.07,
"grad_norm": 0.258724050629809,
"learning_rate": 8.355542539171183e-05,
"loss": 1.7356,
"step": 489
},
{
"epoch": 0.07,
"grad_norm": 0.19885527922839485,
"learning_rate": 8.354224630253332e-05,
"loss": 1.7313,
"step": 490
},
{
"epoch": 0.07,
"grad_norm": 0.23059280089577702,
"learning_rate": 8.352906721335482e-05,
"loss": 1.737,
"step": 491
},
{
"epoch": 0.07,
"grad_norm": 0.2870358462685661,
"learning_rate": 8.351588812417631e-05,
"loss": 1.7349,
"step": 492
},
{
"epoch": 0.07,
"grad_norm": 0.2633075013669381,
"learning_rate": 8.35027090349978e-05,
"loss": 1.7353,
"step": 493
},
{
"epoch": 0.07,
"grad_norm": 0.3049199627731043,
"learning_rate": 8.348952994581931e-05,
"loss": 1.7384,
"step": 494
},
{
"epoch": 0.07,
"grad_norm": 0.34926345039498624,
"learning_rate": 8.34763508566408e-05,
"loss": 1.7287,
"step": 495
},
{
"epoch": 0.07,
"grad_norm": 0.31033645995550346,
"learning_rate": 8.34631717674623e-05,
"loss": 1.7386,
"step": 496
},
{
"epoch": 0.07,
"grad_norm": 0.23708657622061127,
"learning_rate": 8.344999267828379e-05,
"loss": 1.7376,
"step": 497
},
{
"epoch": 0.07,
"grad_norm": 0.2503644410759697,
"learning_rate": 8.34368135891053e-05,
"loss": 1.7286,
"step": 498
},
{
"epoch": 0.07,
"grad_norm": 0.30108885609906977,
"learning_rate": 8.342363449992679e-05,
"loss": 1.7424,
"step": 499
},
{
"epoch": 0.07,
"grad_norm": 0.3640305222721408,
"learning_rate": 8.341045541074829e-05,
"loss": 1.7365,
"step": 500
},
{
"epoch": 0.07,
"grad_norm": 0.3862444666501204,
"learning_rate": 8.339727632156978e-05,
"loss": 1.7388,
"step": 501
},
{
"epoch": 0.07,
"grad_norm": 0.33565159533458905,
"learning_rate": 8.338409723239127e-05,
"loss": 1.725,
"step": 502
},
{
"epoch": 0.07,
"grad_norm": 0.24120380275839087,
"learning_rate": 8.337091814321278e-05,
"loss": 1.7386,
"step": 503
},
{
"epoch": 0.07,
"grad_norm": 0.2019440056977102,
"learning_rate": 8.335773905403427e-05,
"loss": 1.7364,
"step": 504
},
{
"epoch": 0.07,
"grad_norm": 0.19665207193807027,
"learning_rate": 8.334455996485577e-05,
"loss": 1.7384,
"step": 505
},
{
"epoch": 0.07,
"grad_norm": 0.20041439367425065,
"learning_rate": 8.333138087567726e-05,
"loss": 1.7298,
"step": 506
},
{
"epoch": 0.07,
"grad_norm": 0.22759855658411884,
"learning_rate": 8.331820178649877e-05,
"loss": 1.7374,
"step": 507
},
{
"epoch": 0.07,
"grad_norm": 0.2237449007183391,
"learning_rate": 8.330502269732025e-05,
"loss": 1.736,
"step": 508
},
{
"epoch": 0.07,
"grad_norm": 0.19892064401379644,
"learning_rate": 8.329184360814176e-05,
"loss": 1.7309,
"step": 509
},
{
"epoch": 0.07,
"grad_norm": 0.21780921689097468,
"learning_rate": 8.327866451896325e-05,
"loss": 1.7247,
"step": 510
},
{
"epoch": 0.07,
"grad_norm": 0.22223023552216575,
"learning_rate": 8.326548542978474e-05,
"loss": 1.7352,
"step": 511
},
{
"epoch": 0.07,
"grad_norm": 0.23435078161758566,
"learning_rate": 8.325230634060624e-05,
"loss": 1.727,
"step": 512
},
{
"epoch": 0.08,
"grad_norm": 0.25436357491660655,
"learning_rate": 8.323912725142775e-05,
"loss": 1.7182,
"step": 513
},
{
"epoch": 0.08,
"grad_norm": 0.2542313172961627,
"learning_rate": 8.322594816224924e-05,
"loss": 1.7259,
"step": 514
},
{
"epoch": 0.08,
"grad_norm": 0.25353563544590524,
"learning_rate": 8.321276907307073e-05,
"loss": 1.7131,
"step": 515
},
{
"epoch": 0.08,
"grad_norm": 0.25178826296170725,
"learning_rate": 8.319958998389223e-05,
"loss": 1.7303,
"step": 516
},
{
"epoch": 0.08,
"grad_norm": 0.24967442898411737,
"learning_rate": 8.318641089471372e-05,
"loss": 1.7156,
"step": 517
},
{
"epoch": 0.08,
"grad_norm": 0.29093688012404956,
"learning_rate": 8.317323180553523e-05,
"loss": 1.7273,
"step": 518
},
{
"epoch": 0.08,
"grad_norm": 0.3408463856901383,
"learning_rate": 8.316005271635672e-05,
"loss": 1.731,
"step": 519
},
{
"epoch": 0.08,
"grad_norm": 0.3654511204209073,
"learning_rate": 8.314687362717822e-05,
"loss": 1.732,
"step": 520
},
{
"epoch": 0.08,
"grad_norm": 0.3271746533985564,
"learning_rate": 8.313369453799971e-05,
"loss": 1.7295,
"step": 521
},
{
"epoch": 0.08,
"grad_norm": 0.2717225234340834,
"learning_rate": 8.312051544882122e-05,
"loss": 1.7325,
"step": 522
},
{
"epoch": 0.08,
"grad_norm": 0.3081217924018755,
"learning_rate": 8.310733635964271e-05,
"loss": 1.7246,
"step": 523
},
{
"epoch": 0.08,
"grad_norm": 0.3527075080404129,
"learning_rate": 8.30941572704642e-05,
"loss": 1.7216,
"step": 524
},
{
"epoch": 0.08,
"grad_norm": 0.4154842597521178,
"learning_rate": 8.30809781812857e-05,
"loss": 1.7255,
"step": 525
},
{
"epoch": 0.08,
"grad_norm": 0.443067725739395,
"learning_rate": 8.306779909210719e-05,
"loss": 1.7281,
"step": 526
},
{
"epoch": 0.08,
"grad_norm": 0.4106745425675618,
"learning_rate": 8.305462000292868e-05,
"loss": 1.7302,
"step": 527
},
{
"epoch": 0.08,
"grad_norm": 0.33999747867070007,
"learning_rate": 8.304144091375019e-05,
"loss": 1.7267,
"step": 528
},
{
"epoch": 0.08,
"grad_norm": 0.28901846740062775,
"learning_rate": 8.302826182457169e-05,
"loss": 1.7262,
"step": 529
},
{
"epoch": 0.08,
"grad_norm": 0.29966915847542663,
"learning_rate": 8.301508273539318e-05,
"loss": 1.7313,
"step": 530
},
{
"epoch": 0.08,
"grad_norm": 0.31328357913945787,
"learning_rate": 8.300190364621469e-05,
"loss": 1.7218,
"step": 531
},
{
"epoch": 0.08,
"grad_norm": 0.2814137310527097,
"learning_rate": 8.298872455703617e-05,
"loss": 1.7215,
"step": 532
},
{
"epoch": 0.08,
"grad_norm": 0.24123503809014038,
"learning_rate": 8.297554546785767e-05,
"loss": 1.7188,
"step": 533
},
{
"epoch": 0.08,
"grad_norm": 0.28016429302641516,
"learning_rate": 8.296236637867917e-05,
"loss": 1.7198,
"step": 534
},
{
"epoch": 0.08,
"grad_norm": 0.3117115191862107,
"learning_rate": 8.294918728950066e-05,
"loss": 1.7262,
"step": 535
},
{
"epoch": 0.08,
"grad_norm": 0.278045134177961,
"learning_rate": 8.293600820032216e-05,
"loss": 1.723,
"step": 536
},
{
"epoch": 0.08,
"grad_norm": 0.23769710910310898,
"learning_rate": 8.292282911114366e-05,
"loss": 1.723,
"step": 537
},
{
"epoch": 0.08,
"grad_norm": 0.24511433211495529,
"learning_rate": 8.290965002196516e-05,
"loss": 1.7253,
"step": 538
},
{
"epoch": 0.08,
"grad_norm": 0.2384929581877857,
"learning_rate": 8.289647093278665e-05,
"loss": 1.7122,
"step": 539
},
{
"epoch": 0.08,
"grad_norm": 0.22551550594015082,
"learning_rate": 8.288329184360814e-05,
"loss": 1.7219,
"step": 540
},
{
"epoch": 0.08,
"grad_norm": 0.23506955783092845,
"learning_rate": 8.287011275442964e-05,
"loss": 1.7233,
"step": 541
},
{
"epoch": 0.08,
"grad_norm": 0.21860310689837917,
"learning_rate": 8.285693366525115e-05,
"loss": 1.7221,
"step": 542
},
{
"epoch": 0.08,
"grad_norm": 0.20398647046618187,
"learning_rate": 8.284375457607264e-05,
"loss": 1.7213,
"step": 543
},
{
"epoch": 0.08,
"grad_norm": 0.24633014333201053,
"learning_rate": 8.283057548689413e-05,
"loss": 1.7132,
"step": 544
},
{
"epoch": 0.08,
"grad_norm": 0.2667137895735778,
"learning_rate": 8.281739639771563e-05,
"loss": 1.726,
"step": 545
},
{
"epoch": 0.08,
"grad_norm": 0.21858001204347083,
"learning_rate": 8.280421730853713e-05,
"loss": 1.7112,
"step": 546
},
{
"epoch": 0.08,
"grad_norm": 0.19721106291794568,
"learning_rate": 8.279103821935863e-05,
"loss": 1.722,
"step": 547
},
{
"epoch": 0.08,
"grad_norm": 0.2051408273401384,
"learning_rate": 8.277785913018012e-05,
"loss": 1.7206,
"step": 548
},
{
"epoch": 0.08,
"grad_norm": 0.21682627662474413,
"learning_rate": 8.276468004100161e-05,
"loss": 1.7151,
"step": 549
},
{
"epoch": 0.08,
"grad_norm": 0.2438456543254191,
"learning_rate": 8.275150095182311e-05,
"loss": 1.711,
"step": 550
},
{
"epoch": 0.08,
"grad_norm": 0.2629415416448845,
"learning_rate": 8.27383218626446e-05,
"loss": 1.7155,
"step": 551
},
{
"epoch": 0.08,
"grad_norm": 0.29381087938708167,
"learning_rate": 8.272514277346611e-05,
"loss": 1.7184,
"step": 552
},
{
"epoch": 0.08,
"grad_norm": 0.3068644717057271,
"learning_rate": 8.27119636842876e-05,
"loss": 1.7071,
"step": 553
},
{
"epoch": 0.08,
"grad_norm": 0.3266717891225327,
"learning_rate": 8.26987845951091e-05,
"loss": 1.7144,
"step": 554
},
{
"epoch": 0.08,
"grad_norm": 0.3342256318259162,
"learning_rate": 8.26856055059306e-05,
"loss": 1.7132,
"step": 555
},
{
"epoch": 0.08,
"grad_norm": 0.3507430866157419,
"learning_rate": 8.267242641675208e-05,
"loss": 1.7222,
"step": 556
},
{
"epoch": 0.08,
"grad_norm": 0.35401841712709037,
"learning_rate": 8.265924732757359e-05,
"loss": 1.7163,
"step": 557
},
{
"epoch": 0.08,
"grad_norm": 0.29152885792134353,
"learning_rate": 8.264606823839509e-05,
"loss": 1.7021,
"step": 558
},
{
"epoch": 0.08,
"grad_norm": 0.2321802057678123,
"learning_rate": 8.263288914921658e-05,
"loss": 1.715,
"step": 559
},
{
"epoch": 0.08,
"grad_norm": 0.20521573981201763,
"learning_rate": 8.261971006003807e-05,
"loss": 1.7136,
"step": 560
},
{
"epoch": 0.08,
"grad_norm": 0.22389843673660376,
"learning_rate": 8.260653097085958e-05,
"loss": 1.7145,
"step": 561
},
{
"epoch": 0.08,
"grad_norm": 0.26769790258636306,
"learning_rate": 8.259335188168107e-05,
"loss": 1.7135,
"step": 562
},
{
"epoch": 0.08,
"grad_norm": 0.263472778250681,
"learning_rate": 8.258017279250257e-05,
"loss": 1.7153,
"step": 563
},
{
"epoch": 0.08,
"grad_norm": 0.20438761322969645,
"learning_rate": 8.256699370332406e-05,
"loss": 1.7053,
"step": 564
},
{
"epoch": 0.08,
"grad_norm": 0.2711791794052338,
"learning_rate": 8.255381461414556e-05,
"loss": 1.7076,
"step": 565
},
{
"epoch": 0.08,
"grad_norm": 0.29324151826012396,
"learning_rate": 8.254063552496706e-05,
"loss": 1.7162,
"step": 566
},
{
"epoch": 0.08,
"grad_norm": 0.23680109722780296,
"learning_rate": 8.252745643578856e-05,
"loss": 1.7173,
"step": 567
},
{
"epoch": 0.08,
"grad_norm": 0.3897963836334016,
"learning_rate": 8.251427734661005e-05,
"loss": 1.709,
"step": 568
},
{
"epoch": 0.08,
"grad_norm": 0.3107639335861359,
"learning_rate": 8.250109825743154e-05,
"loss": 1.7108,
"step": 569
},
{
"epoch": 0.08,
"grad_norm": 0.31507837210091977,
"learning_rate": 8.248791916825305e-05,
"loss": 1.7177,
"step": 570
},
{
"epoch": 0.08,
"grad_norm": 0.407833593775017,
"learning_rate": 8.247474007907455e-05,
"loss": 1.7158,
"step": 571
},
{
"epoch": 0.08,
"grad_norm": 0.33475655675893035,
"learning_rate": 8.246156098989604e-05,
"loss": 1.7159,
"step": 572
},
{
"epoch": 0.08,
"grad_norm": 0.30470045477617375,
"learning_rate": 8.244838190071753e-05,
"loss": 1.7133,
"step": 573
},
{
"epoch": 0.08,
"grad_norm": 0.2361385908228469,
"learning_rate": 8.243520281153903e-05,
"loss": 1.7138,
"step": 574
},
{
"epoch": 0.08,
"grad_norm": 0.3028517511605204,
"learning_rate": 8.242202372236052e-05,
"loss": 1.7061,
"step": 575
},
{
"epoch": 0.08,
"grad_norm": 0.26913911701533544,
"learning_rate": 8.240884463318203e-05,
"loss": 1.7081,
"step": 576
},
{
"epoch": 0.08,
"grad_norm": 0.24779044758095875,
"learning_rate": 8.239566554400352e-05,
"loss": 1.708,
"step": 577
},
{
"epoch": 0.08,
"grad_norm": 0.2233287796665562,
"learning_rate": 8.238248645482501e-05,
"loss": 1.7112,
"step": 578
},
{
"epoch": 0.08,
"grad_norm": 0.2712079799980342,
"learning_rate": 8.236930736564652e-05,
"loss": 1.7124,
"step": 579
},
{
"epoch": 0.08,
"grad_norm": 0.2094942014575434,
"learning_rate": 8.2356128276468e-05,
"loss": 1.7054,
"step": 580
},
{
"epoch": 0.09,
"grad_norm": 0.21486524279150165,
"learning_rate": 8.234294918728951e-05,
"loss": 1.7119,
"step": 581
},
{
"epoch": 0.09,
"grad_norm": 0.2090205889232179,
"learning_rate": 8.2329770098111e-05,
"loss": 1.7059,
"step": 582
},
{
"epoch": 0.09,
"grad_norm": 0.20460593187920675,
"learning_rate": 8.23165910089325e-05,
"loss": 1.7066,
"step": 583
},
{
"epoch": 0.09,
"grad_norm": 0.1767845416143033,
"learning_rate": 8.230341191975399e-05,
"loss": 1.7024,
"step": 584
},
{
"epoch": 0.09,
"grad_norm": 0.17887391851705126,
"learning_rate": 8.22902328305755e-05,
"loss": 1.7067,
"step": 585
},
{
"epoch": 0.09,
"grad_norm": 0.1695109544582885,
"learning_rate": 8.227705374139699e-05,
"loss": 1.6895,
"step": 586
},
{
"epoch": 0.09,
"grad_norm": 0.2083349170582134,
"learning_rate": 8.226387465221849e-05,
"loss": 1.6915,
"step": 587
},
{
"epoch": 0.09,
"grad_norm": 0.217567810379028,
"learning_rate": 8.225069556303998e-05,
"loss": 1.7014,
"step": 588
},
{
"epoch": 0.09,
"grad_norm": 0.26578168421783505,
"learning_rate": 8.223751647386147e-05,
"loss": 1.6959,
"step": 589
},
{
"epoch": 0.09,
"grad_norm": 0.26759525206680784,
"learning_rate": 8.222433738468297e-05,
"loss": 1.7081,
"step": 590
},
{
"epoch": 0.09,
"grad_norm": 0.29241320421717,
"learning_rate": 8.221115829550447e-05,
"loss": 1.7,
"step": 591
},
{
"epoch": 0.09,
"grad_norm": 0.2665660324658676,
"learning_rate": 8.219797920632597e-05,
"loss": 1.7116,
"step": 592
},
{
"epoch": 0.09,
"grad_norm": 0.2543632897483254,
"learning_rate": 8.218480011714746e-05,
"loss": 1.6998,
"step": 593
},
{
"epoch": 0.09,
"grad_norm": 0.24422324009786234,
"learning_rate": 8.217162102796897e-05,
"loss": 1.7032,
"step": 594
},
{
"epoch": 0.09,
"grad_norm": 0.2412157212895076,
"learning_rate": 8.215844193879046e-05,
"loss": 1.7002,
"step": 595
},
{
"epoch": 0.09,
"grad_norm": 0.22950526573401658,
"learning_rate": 8.214526284961196e-05,
"loss": 1.6968,
"step": 596
},
{
"epoch": 0.09,
"grad_norm": 0.2405536739297689,
"learning_rate": 8.213208376043345e-05,
"loss": 1.7067,
"step": 597
},
{
"epoch": 0.09,
"grad_norm": 0.22384819760583757,
"learning_rate": 8.211890467125494e-05,
"loss": 1.7002,
"step": 598
},
{
"epoch": 0.09,
"grad_norm": 0.22165394505798217,
"learning_rate": 8.210572558207644e-05,
"loss": 1.7053,
"step": 599
},
{
"epoch": 0.09,
"grad_norm": 0.22325001055054616,
"learning_rate": 8.209254649289794e-05,
"loss": 1.7076,
"step": 600
},
{
"epoch": 0.09,
"grad_norm": 0.24437917178401694,
"learning_rate": 8.207936740371944e-05,
"loss": 1.6863,
"step": 601
},
{
"epoch": 0.09,
"grad_norm": 0.24473623901397126,
"learning_rate": 8.206618831454093e-05,
"loss": 1.7029,
"step": 602
},
{
"epoch": 0.09,
"grad_norm": 0.23793249982285095,
"learning_rate": 8.205300922536244e-05,
"loss": 1.7003,
"step": 603
},
{
"epoch": 0.09,
"grad_norm": 0.27010899475555333,
"learning_rate": 8.203983013618392e-05,
"loss": 1.7035,
"step": 604
},
{
"epoch": 0.09,
"grad_norm": 0.2974971413903659,
"learning_rate": 8.202665104700543e-05,
"loss": 1.6963,
"step": 605
},
{
"epoch": 0.09,
"grad_norm": 0.3391211533028402,
"learning_rate": 8.201347195782692e-05,
"loss": 1.699,
"step": 606
},
{
"epoch": 0.09,
"grad_norm": 0.401887788096081,
"learning_rate": 8.200029286864841e-05,
"loss": 1.7001,
"step": 607
},
{
"epoch": 0.09,
"grad_norm": 0.4014157913865194,
"learning_rate": 8.198711377946991e-05,
"loss": 1.7035,
"step": 608
},
{
"epoch": 0.09,
"grad_norm": 0.34227601583187883,
"learning_rate": 8.197393469029142e-05,
"loss": 1.7031,
"step": 609
},
{
"epoch": 0.09,
"grad_norm": 0.2572596615978446,
"learning_rate": 8.196075560111291e-05,
"loss": 1.691,
"step": 610
},
{
"epoch": 0.09,
"grad_norm": 0.25436898174940326,
"learning_rate": 8.19475765119344e-05,
"loss": 1.6983,
"step": 611
},
{
"epoch": 0.09,
"grad_norm": 0.2966388678401186,
"learning_rate": 8.19343974227559e-05,
"loss": 1.6939,
"step": 612
},
{
"epoch": 0.09,
"grad_norm": 0.2609270973789931,
"learning_rate": 8.192121833357739e-05,
"loss": 1.6882,
"step": 613
},
{
"epoch": 0.09,
"grad_norm": 0.22900064576600393,
"learning_rate": 8.190803924439888e-05,
"loss": 1.6962,
"step": 614
},
{
"epoch": 0.09,
"grad_norm": 0.2157661551046652,
"learning_rate": 8.189486015522039e-05,
"loss": 1.6889,
"step": 615
},
{
"epoch": 0.09,
"grad_norm": 0.23909710323754044,
"learning_rate": 8.188168106604189e-05,
"loss": 1.6969,
"step": 616
},
{
"epoch": 0.09,
"grad_norm": 0.25769486284072124,
"learning_rate": 8.186850197686338e-05,
"loss": 1.699,
"step": 617
},
{
"epoch": 0.09,
"grad_norm": 0.23459236703850359,
"learning_rate": 8.185532288768489e-05,
"loss": 1.6904,
"step": 618
},
{
"epoch": 0.09,
"grad_norm": 0.23197840287816318,
"learning_rate": 8.184214379850638e-05,
"loss": 1.6927,
"step": 619
},
{
"epoch": 0.09,
"grad_norm": 0.22922928506230492,
"learning_rate": 8.182896470932787e-05,
"loss": 1.691,
"step": 620
},
{
"epoch": 0.09,
"grad_norm": 0.20472350901775296,
"learning_rate": 8.181578562014937e-05,
"loss": 1.6988,
"step": 621
},
{
"epoch": 0.09,
"grad_norm": 0.21293532425438386,
"learning_rate": 8.180260653097086e-05,
"loss": 1.6983,
"step": 622
},
{
"epoch": 0.09,
"grad_norm": 0.25813129719442796,
"learning_rate": 8.178942744179236e-05,
"loss": 1.6977,
"step": 623
},
{
"epoch": 0.09,
"grad_norm": 0.28731586003331094,
"learning_rate": 8.177624835261386e-05,
"loss": 1.6917,
"step": 624
},
{
"epoch": 0.09,
"grad_norm": 0.27778976830351854,
"learning_rate": 8.176306926343536e-05,
"loss": 1.6904,
"step": 625
},
{
"epoch": 0.09,
"grad_norm": 0.2667191282626614,
"learning_rate": 8.174989017425685e-05,
"loss": 1.6906,
"step": 626
},
{
"epoch": 0.09,
"grad_norm": 0.27135549254051117,
"learning_rate": 8.173671108507834e-05,
"loss": 1.7077,
"step": 627
},
{
"epoch": 0.09,
"grad_norm": 0.2737735487643415,
"learning_rate": 8.172353199589984e-05,
"loss": 1.6919,
"step": 628
},
{
"epoch": 0.09,
"grad_norm": 0.23702108817197906,
"learning_rate": 8.171035290672134e-05,
"loss": 1.6833,
"step": 629
},
{
"epoch": 0.09,
"grad_norm": 0.21824459389931547,
"learning_rate": 8.169717381754284e-05,
"loss": 1.6874,
"step": 630
},
{
"epoch": 0.09,
"grad_norm": 0.2551514671888147,
"learning_rate": 8.168399472836433e-05,
"loss": 1.6964,
"step": 631
},
{
"epoch": 0.09,
"grad_norm": 0.2888406849522713,
"learning_rate": 8.167081563918583e-05,
"loss": 1.686,
"step": 632
},
{
"epoch": 0.09,
"grad_norm": 0.3043373169879388,
"learning_rate": 8.165763655000733e-05,
"loss": 1.6909,
"step": 633
},
{
"epoch": 0.09,
"grad_norm": 0.31851826485165624,
"learning_rate": 8.164445746082883e-05,
"loss": 1.6955,
"step": 634
},
{
"epoch": 0.09,
"grad_norm": 0.31677231407393813,
"learning_rate": 8.163127837165032e-05,
"loss": 1.6931,
"step": 635
},
{
"epoch": 0.09,
"grad_norm": 0.3235809995826595,
"learning_rate": 8.161809928247181e-05,
"loss": 1.6919,
"step": 636
},
{
"epoch": 0.09,
"grad_norm": 0.31715316802417376,
"learning_rate": 8.160492019329331e-05,
"loss": 1.6972,
"step": 637
},
{
"epoch": 0.09,
"grad_norm": 0.258710234201555,
"learning_rate": 8.15917411041148e-05,
"loss": 1.6908,
"step": 638
},
{
"epoch": 0.09,
"grad_norm": 0.19364210951795302,
"learning_rate": 8.157856201493631e-05,
"loss": 1.6893,
"step": 639
},
{
"epoch": 0.09,
"grad_norm": 0.1934820062184886,
"learning_rate": 8.15653829257578e-05,
"loss": 1.6937,
"step": 640
},
{
"epoch": 0.09,
"grad_norm": 0.22124564568502275,
"learning_rate": 8.15522038365793e-05,
"loss": 1.6857,
"step": 641
},
{
"epoch": 0.09,
"grad_norm": 0.24527757345647355,
"learning_rate": 8.15390247474008e-05,
"loss": 1.6964,
"step": 642
},
{
"epoch": 0.09,
"grad_norm": 0.31637585588010625,
"learning_rate": 8.152584565822228e-05,
"loss": 1.6911,
"step": 643
},
{
"epoch": 0.09,
"grad_norm": 0.33231300289557464,
"learning_rate": 8.151266656904379e-05,
"loss": 1.6857,
"step": 644
},
{
"epoch": 0.09,
"grad_norm": 0.2979233182409515,
"learning_rate": 8.149948747986529e-05,
"loss": 1.6907,
"step": 645
},
{
"epoch": 0.09,
"grad_norm": 0.2637032106008522,
"learning_rate": 8.148630839068678e-05,
"loss": 1.6922,
"step": 646
},
{
"epoch": 0.09,
"grad_norm": 0.2729619019235736,
"learning_rate": 8.147312930150827e-05,
"loss": 1.6919,
"step": 647
},
{
"epoch": 0.09,
"grad_norm": 0.3056180656791934,
"learning_rate": 8.145995021232978e-05,
"loss": 1.6866,
"step": 648
},
{
"epoch": 0.1,
"grad_norm": 0.3025381384222673,
"learning_rate": 8.144677112315127e-05,
"loss": 1.6895,
"step": 649
},
{
"epoch": 0.1,
"grad_norm": 0.26847771781612867,
"learning_rate": 8.143359203397277e-05,
"loss": 1.6893,
"step": 650
},
{
"epoch": 0.1,
"grad_norm": 0.2438844041255715,
"learning_rate": 8.142041294479426e-05,
"loss": 1.6777,
"step": 651
},
{
"epoch": 0.1,
"grad_norm": 0.29212647337890685,
"learning_rate": 8.140723385561575e-05,
"loss": 1.6962,
"step": 652
},
{
"epoch": 0.1,
"grad_norm": 0.3279764517256754,
"learning_rate": 8.139405476643726e-05,
"loss": 1.686,
"step": 653
},
{
"epoch": 0.1,
"grad_norm": 0.3266201901472985,
"learning_rate": 8.138087567725876e-05,
"loss": 1.6861,
"step": 654
},
{
"epoch": 0.1,
"grad_norm": 0.29731852257716146,
"learning_rate": 8.136769658808025e-05,
"loss": 1.684,
"step": 655
},
{
"epoch": 0.1,
"grad_norm": 0.31099064594669873,
"learning_rate": 8.135451749890174e-05,
"loss": 1.6842,
"step": 656
},
{
"epoch": 0.1,
"grad_norm": 0.3060576588797521,
"learning_rate": 8.134133840972325e-05,
"loss": 1.6891,
"step": 657
},
{
"epoch": 0.1,
"grad_norm": 0.26004140101304163,
"learning_rate": 8.132815932054474e-05,
"loss": 1.6854,
"step": 658
},
{
"epoch": 0.1,
"grad_norm": 0.268187676963691,
"learning_rate": 8.131498023136624e-05,
"loss": 1.6838,
"step": 659
},
{
"epoch": 0.1,
"grad_norm": 0.2726005198375338,
"learning_rate": 8.130180114218773e-05,
"loss": 1.692,
"step": 660
},
{
"epoch": 0.1,
"grad_norm": 0.23864901844548525,
"learning_rate": 8.128862205300923e-05,
"loss": 1.6849,
"step": 661
},
{
"epoch": 0.1,
"grad_norm": 0.2244877830433347,
"learning_rate": 8.127544296383072e-05,
"loss": 1.6806,
"step": 662
},
{
"epoch": 0.1,
"grad_norm": 0.21510789414916953,
"learning_rate": 8.126226387465223e-05,
"loss": 1.6835,
"step": 663
},
{
"epoch": 0.1,
"grad_norm": 0.207451826116181,
"learning_rate": 8.124908478547372e-05,
"loss": 1.6731,
"step": 664
},
{
"epoch": 0.1,
"grad_norm": 0.2410531205632606,
"learning_rate": 8.123590569629521e-05,
"loss": 1.679,
"step": 665
},
{
"epoch": 0.1,
"grad_norm": 0.2658490081241801,
"learning_rate": 8.122272660711672e-05,
"loss": 1.683,
"step": 666
},
{
"epoch": 0.1,
"grad_norm": 0.24153141494391162,
"learning_rate": 8.12095475179382e-05,
"loss": 1.6738,
"step": 667
},
{
"epoch": 0.1,
"grad_norm": 0.23164870510130753,
"learning_rate": 8.119636842875971e-05,
"loss": 1.6867,
"step": 668
},
{
"epoch": 0.1,
"grad_norm": 0.24233480776179323,
"learning_rate": 8.11831893395812e-05,
"loss": 1.6773,
"step": 669
},
{
"epoch": 0.1,
"grad_norm": 0.2574837382310888,
"learning_rate": 8.11700102504027e-05,
"loss": 1.6844,
"step": 670
},
{
"epoch": 0.1,
"grad_norm": 0.22126722703874208,
"learning_rate": 8.115683116122419e-05,
"loss": 1.6869,
"step": 671
},
{
"epoch": 0.1,
"grad_norm": 0.17423158084927914,
"learning_rate": 8.11436520720457e-05,
"loss": 1.6826,
"step": 672
},
{
"epoch": 0.1,
"grad_norm": 0.2011587498434974,
"learning_rate": 8.113047298286719e-05,
"loss": 1.6932,
"step": 673
},
{
"epoch": 0.1,
"grad_norm": 0.23419572476137138,
"learning_rate": 8.111729389368869e-05,
"loss": 1.681,
"step": 674
},
{
"epoch": 0.1,
"grad_norm": 0.21861982894310164,
"learning_rate": 8.110411480451018e-05,
"loss": 1.6845,
"step": 675
},
{
"epoch": 0.1,
"grad_norm": 0.2020605297807059,
"learning_rate": 8.109093571533167e-05,
"loss": 1.6758,
"step": 676
},
{
"epoch": 0.1,
"grad_norm": 0.200339221622581,
"learning_rate": 8.107775662615317e-05,
"loss": 1.6852,
"step": 677
},
{
"epoch": 0.1,
"grad_norm": 0.2272004211020317,
"learning_rate": 8.106457753697467e-05,
"loss": 1.6754,
"step": 678
},
{
"epoch": 0.1,
"grad_norm": 0.2639656585184034,
"learning_rate": 8.105139844779617e-05,
"loss": 1.6651,
"step": 679
},
{
"epoch": 0.1,
"grad_norm": 0.2821270304501907,
"learning_rate": 8.103821935861766e-05,
"loss": 1.6727,
"step": 680
},
{
"epoch": 0.1,
"grad_norm": 0.2385914002979489,
"learning_rate": 8.102504026943917e-05,
"loss": 1.6757,
"step": 681
},
{
"epoch": 0.1,
"grad_norm": 0.19689463498619292,
"learning_rate": 8.101186118026066e-05,
"loss": 1.675,
"step": 682
},
{
"epoch": 0.1,
"grad_norm": 0.19388551672385698,
"learning_rate": 8.099868209108216e-05,
"loss": 1.6865,
"step": 683
},
{
"epoch": 0.1,
"grad_norm": 0.21717430076929156,
"learning_rate": 8.098550300190365e-05,
"loss": 1.6786,
"step": 684
},
{
"epoch": 0.1,
"grad_norm": 0.21911631079722832,
"learning_rate": 8.097232391272514e-05,
"loss": 1.677,
"step": 685
},
{
"epoch": 0.1,
"grad_norm": 0.21899413808647583,
"learning_rate": 8.095914482354664e-05,
"loss": 1.6695,
"step": 686
},
{
"epoch": 0.1,
"grad_norm": 0.256376958131802,
"learning_rate": 8.094596573436814e-05,
"loss": 1.681,
"step": 687
},
{
"epoch": 0.1,
"grad_norm": 0.30968776120326585,
"learning_rate": 8.093278664518964e-05,
"loss": 1.6719,
"step": 688
},
{
"epoch": 0.1,
"grad_norm": 0.3405179169627675,
"learning_rate": 8.091960755601113e-05,
"loss": 1.6729,
"step": 689
},
{
"epoch": 0.1,
"grad_norm": 0.3758025068653412,
"learning_rate": 8.090642846683264e-05,
"loss": 1.6767,
"step": 690
},
{
"epoch": 0.1,
"grad_norm": 0.37947974517090455,
"learning_rate": 8.089324937765412e-05,
"loss": 1.6722,
"step": 691
},
{
"epoch": 0.1,
"grad_norm": 0.3209320177396622,
"learning_rate": 8.088007028847563e-05,
"loss": 1.6743,
"step": 692
},
{
"epoch": 0.1,
"grad_norm": 0.2686562575879156,
"learning_rate": 8.086689119929712e-05,
"loss": 1.6722,
"step": 693
},
{
"epoch": 0.1,
"grad_norm": 0.24152315432813073,
"learning_rate": 8.085371211011861e-05,
"loss": 1.6817,
"step": 694
},
{
"epoch": 0.1,
"grad_norm": 0.25903562203260866,
"learning_rate": 8.084053302094011e-05,
"loss": 1.6808,
"step": 695
},
{
"epoch": 0.1,
"grad_norm": 0.27215275116010934,
"learning_rate": 8.082735393176162e-05,
"loss": 1.6735,
"step": 696
},
{
"epoch": 0.1,
"grad_norm": 0.2665073871904065,
"learning_rate": 8.081417484258311e-05,
"loss": 1.6813,
"step": 697
},
{
"epoch": 0.1,
"grad_norm": 0.2595685011638328,
"learning_rate": 8.08009957534046e-05,
"loss": 1.6813,
"step": 698
},
{
"epoch": 0.1,
"grad_norm": 0.2584829914241195,
"learning_rate": 8.07878166642261e-05,
"loss": 1.6699,
"step": 699
},
{
"epoch": 0.1,
"grad_norm": 0.23308896936839194,
"learning_rate": 8.077463757504759e-05,
"loss": 1.6686,
"step": 700
},
{
"epoch": 0.1,
"grad_norm": 0.18262768380478986,
"learning_rate": 8.076145848586908e-05,
"loss": 1.6727,
"step": 701
},
{
"epoch": 0.1,
"grad_norm": 0.17402802326250397,
"learning_rate": 8.074827939669059e-05,
"loss": 1.671,
"step": 702
},
{
"epoch": 0.1,
"grad_norm": 0.2079124720731814,
"learning_rate": 8.073510030751208e-05,
"loss": 1.6718,
"step": 703
},
{
"epoch": 0.1,
"grad_norm": 0.21030626632342236,
"learning_rate": 8.072192121833358e-05,
"loss": 1.675,
"step": 704
},
{
"epoch": 0.1,
"grad_norm": 0.19979944547574685,
"learning_rate": 8.070874212915509e-05,
"loss": 1.6719,
"step": 705
},
{
"epoch": 0.1,
"grad_norm": 0.1960619390208134,
"learning_rate": 8.069556303997658e-05,
"loss": 1.6729,
"step": 706
},
{
"epoch": 0.1,
"grad_norm": 0.1800588887952547,
"learning_rate": 8.068238395079807e-05,
"loss": 1.6742,
"step": 707
},
{
"epoch": 0.1,
"grad_norm": 0.171338131397515,
"learning_rate": 8.066920486161957e-05,
"loss": 1.6681,
"step": 708
},
{
"epoch": 0.1,
"grad_norm": 0.1952527594313872,
"learning_rate": 8.065602577244106e-05,
"loss": 1.672,
"step": 709
},
{
"epoch": 0.1,
"grad_norm": 0.2029111128206527,
"learning_rate": 8.064284668326255e-05,
"loss": 1.6729,
"step": 710
},
{
"epoch": 0.1,
"grad_norm": 0.2093511465528856,
"learning_rate": 8.062966759408406e-05,
"loss": 1.6662,
"step": 711
},
{
"epoch": 0.1,
"grad_norm": 0.2355948665280437,
"learning_rate": 8.061648850490556e-05,
"loss": 1.6691,
"step": 712
},
{
"epoch": 0.1,
"grad_norm": 0.25612769652057044,
"learning_rate": 8.060330941572705e-05,
"loss": 1.6691,
"step": 713
},
{
"epoch": 0.1,
"grad_norm": 0.3178891893081287,
"learning_rate": 8.059013032654856e-05,
"loss": 1.6702,
"step": 714
},
{
"epoch": 0.1,
"grad_norm": 0.32928109757980245,
"learning_rate": 8.057695123737004e-05,
"loss": 1.6804,
"step": 715
},
{
"epoch": 0.1,
"grad_norm": 0.2732800407347181,
"learning_rate": 8.056377214819154e-05,
"loss": 1.6799,
"step": 716
},
{
"epoch": 0.1,
"grad_norm": 0.24062058611392861,
"learning_rate": 8.055059305901304e-05,
"loss": 1.6678,
"step": 717
},
{
"epoch": 0.11,
"grad_norm": 0.3002907368750325,
"learning_rate": 8.053741396983453e-05,
"loss": 1.6672,
"step": 718
},
{
"epoch": 0.11,
"grad_norm": 0.31553599792720294,
"learning_rate": 8.052423488065603e-05,
"loss": 1.671,
"step": 719
},
{
"epoch": 0.11,
"grad_norm": 0.25035709769683784,
"learning_rate": 8.051105579147753e-05,
"loss": 1.6746,
"step": 720
},
{
"epoch": 0.11,
"grad_norm": 0.16571551955558791,
"learning_rate": 8.049787670229903e-05,
"loss": 1.6619,
"step": 721
},
{
"epoch": 0.11,
"grad_norm": 0.21454683239201158,
"learning_rate": 8.048469761312052e-05,
"loss": 1.6686,
"step": 722
},
{
"epoch": 0.11,
"grad_norm": 0.2310455452470971,
"learning_rate": 8.047151852394201e-05,
"loss": 1.6771,
"step": 723
},
{
"epoch": 0.11,
"grad_norm": 0.1994236882273046,
"learning_rate": 8.045833943476351e-05,
"loss": 1.671,
"step": 724
},
{
"epoch": 0.11,
"grad_norm": 0.18974389362814953,
"learning_rate": 8.0445160345585e-05,
"loss": 1.6675,
"step": 725
},
{
"epoch": 0.11,
"grad_norm": 0.1860500322260672,
"learning_rate": 8.043198125640651e-05,
"loss": 1.658,
"step": 726
},
{
"epoch": 0.11,
"grad_norm": 0.18863809672549492,
"learning_rate": 8.0418802167228e-05,
"loss": 1.6692,
"step": 727
},
{
"epoch": 0.11,
"grad_norm": 0.19261227728615138,
"learning_rate": 8.04056230780495e-05,
"loss": 1.6722,
"step": 728
},
{
"epoch": 0.11,
"grad_norm": 0.19104427920831676,
"learning_rate": 8.0392443988871e-05,
"loss": 1.6682,
"step": 729
},
{
"epoch": 0.11,
"grad_norm": 0.19351874126385604,
"learning_rate": 8.03792648996925e-05,
"loss": 1.6698,
"step": 730
},
{
"epoch": 0.11,
"grad_norm": 0.18561279955951115,
"learning_rate": 8.036608581051399e-05,
"loss": 1.6587,
"step": 731
},
{
"epoch": 0.11,
"grad_norm": 0.20733342786735096,
"learning_rate": 8.035290672133548e-05,
"loss": 1.6686,
"step": 732
},
{
"epoch": 0.11,
"grad_norm": 0.24389803201789031,
"learning_rate": 8.033972763215698e-05,
"loss": 1.6699,
"step": 733
},
{
"epoch": 0.11,
"grad_norm": 0.3303044019383874,
"learning_rate": 8.032654854297847e-05,
"loss": 1.6685,
"step": 734
},
{
"epoch": 0.11,
"grad_norm": 0.4326972184756624,
"learning_rate": 8.031336945379998e-05,
"loss": 1.671,
"step": 735
},
{
"epoch": 0.11,
"grad_norm": 0.4749815372235629,
"learning_rate": 8.030019036462147e-05,
"loss": 1.6732,
"step": 736
},
{
"epoch": 0.11,
"grad_norm": 0.4074470713549075,
"learning_rate": 8.028701127544297e-05,
"loss": 1.6627,
"step": 737
},
{
"epoch": 0.11,
"grad_norm": 0.2325050031860467,
"learning_rate": 8.027383218626447e-05,
"loss": 1.6624,
"step": 738
},
{
"epoch": 0.11,
"grad_norm": 0.26482099963107963,
"learning_rate": 8.026065309708595e-05,
"loss": 1.6714,
"step": 739
},
{
"epoch": 0.11,
"grad_norm": 0.36679572880941774,
"learning_rate": 8.024747400790746e-05,
"loss": 1.6704,
"step": 740
},
{
"epoch": 0.11,
"grad_norm": 0.26232767280919794,
"learning_rate": 8.023429491872896e-05,
"loss": 1.663,
"step": 741
},
{
"epoch": 0.11,
"grad_norm": 0.21445846555925532,
"learning_rate": 8.022111582955045e-05,
"loss": 1.6525,
"step": 742
},
{
"epoch": 0.11,
"grad_norm": 0.2532638225714052,
"learning_rate": 8.020793674037194e-05,
"loss": 1.6642,
"step": 743
},
{
"epoch": 0.11,
"grad_norm": 0.2042942070651275,
"learning_rate": 8.019475765119345e-05,
"loss": 1.6598,
"step": 744
},
{
"epoch": 0.11,
"grad_norm": 0.171722618385623,
"learning_rate": 8.018157856201494e-05,
"loss": 1.6588,
"step": 745
},
{
"epoch": 0.11,
"grad_norm": 0.1858922147873516,
"learning_rate": 8.016839947283644e-05,
"loss": 1.6644,
"step": 746
},
{
"epoch": 0.11,
"grad_norm": 0.1800446333017018,
"learning_rate": 8.015522038365793e-05,
"loss": 1.6596,
"step": 747
},
{
"epoch": 0.11,
"grad_norm": 0.1803808001678191,
"learning_rate": 8.014204129447943e-05,
"loss": 1.672,
"step": 748
},
{
"epoch": 0.11,
"grad_norm": 0.1768537234586101,
"learning_rate": 8.012886220530092e-05,
"loss": 1.6657,
"step": 749
},
{
"epoch": 0.11,
"grad_norm": 0.19980216971257497,
"learning_rate": 8.011568311612243e-05,
"loss": 1.6534,
"step": 750
},
{
"epoch": 0.11,
"grad_norm": 0.1832264261976631,
"learning_rate": 8.010250402694392e-05,
"loss": 1.6591,
"step": 751
},
{
"epoch": 0.11,
"grad_norm": 0.16513639779797532,
"learning_rate": 8.008932493776541e-05,
"loss": 1.6648,
"step": 752
},
{
"epoch": 0.11,
"grad_norm": 0.17022762074604036,
"learning_rate": 8.007614584858692e-05,
"loss": 1.6591,
"step": 753
},
{
"epoch": 0.11,
"grad_norm": 0.1681936250352472,
"learning_rate": 8.006296675940841e-05,
"loss": 1.6571,
"step": 754
},
{
"epoch": 0.11,
"grad_norm": 0.21312758323221945,
"learning_rate": 8.004978767022991e-05,
"loss": 1.6562,
"step": 755
},
{
"epoch": 0.11,
"grad_norm": 0.2523673118943729,
"learning_rate": 8.00366085810514e-05,
"loss": 1.6663,
"step": 756
},
{
"epoch": 0.11,
"grad_norm": 0.2623904537775464,
"learning_rate": 8.00234294918729e-05,
"loss": 1.6674,
"step": 757
},
{
"epoch": 0.11,
"grad_norm": 0.24341343434762125,
"learning_rate": 8.001025040269439e-05,
"loss": 1.6713,
"step": 758
},
{
"epoch": 0.11,
"grad_norm": 0.217043521795163,
"learning_rate": 7.99970713135159e-05,
"loss": 1.6706,
"step": 759
},
{
"epoch": 0.11,
"grad_norm": 0.20902151519730236,
"learning_rate": 7.998389222433739e-05,
"loss": 1.664,
"step": 760
},
{
"epoch": 0.11,
"grad_norm": 0.22505288309338028,
"learning_rate": 7.997071313515888e-05,
"loss": 1.6588,
"step": 761
},
{
"epoch": 0.11,
"grad_norm": 0.22608418892993373,
"learning_rate": 7.995753404598039e-05,
"loss": 1.6617,
"step": 762
},
{
"epoch": 0.11,
"grad_norm": 0.2204380979289179,
"learning_rate": 7.994435495680187e-05,
"loss": 1.6573,
"step": 763
},
{
"epoch": 0.11,
"grad_norm": 0.1969471585321503,
"learning_rate": 7.993117586762337e-05,
"loss": 1.6601,
"step": 764
},
{
"epoch": 0.11,
"grad_norm": 0.20082564014850096,
"learning_rate": 7.991799677844487e-05,
"loss": 1.6617,
"step": 765
},
{
"epoch": 0.11,
"grad_norm": 0.20192437570777552,
"learning_rate": 7.990481768926637e-05,
"loss": 1.6544,
"step": 766
},
{
"epoch": 0.11,
"grad_norm": 0.2574830285462354,
"learning_rate": 7.989163860008786e-05,
"loss": 1.6533,
"step": 767
},
{
"epoch": 0.11,
"grad_norm": 0.23002776136831127,
"learning_rate": 7.987845951090937e-05,
"loss": 1.661,
"step": 768
},
{
"epoch": 0.11,
"grad_norm": 0.22713613412496464,
"learning_rate": 7.986528042173086e-05,
"loss": 1.6663,
"step": 769
},
{
"epoch": 0.11,
"grad_norm": 0.2042318478702029,
"learning_rate": 7.985210133255236e-05,
"loss": 1.6546,
"step": 770
},
{
"epoch": 0.11,
"grad_norm": 0.20073586989829498,
"learning_rate": 7.983892224337385e-05,
"loss": 1.6591,
"step": 771
},
{
"epoch": 0.11,
"grad_norm": 0.22156482786667436,
"learning_rate": 7.982574315419534e-05,
"loss": 1.6559,
"step": 772
},
{
"epoch": 0.11,
"grad_norm": 0.23204275860685752,
"learning_rate": 7.981256406501684e-05,
"loss": 1.654,
"step": 773
},
{
"epoch": 0.11,
"grad_norm": 0.22578907014380378,
"learning_rate": 7.979938497583834e-05,
"loss": 1.6598,
"step": 774
},
{
"epoch": 0.11,
"grad_norm": 0.22278741960348777,
"learning_rate": 7.978620588665984e-05,
"loss": 1.6521,
"step": 775
},
{
"epoch": 0.11,
"grad_norm": 0.21685600767937083,
"learning_rate": 7.977302679748133e-05,
"loss": 1.6607,
"step": 776
},
{
"epoch": 0.11,
"grad_norm": 0.22266066710897367,
"learning_rate": 7.975984770830284e-05,
"loss": 1.6487,
"step": 777
},
{
"epoch": 0.11,
"grad_norm": 0.19632860059659135,
"learning_rate": 7.974666861912433e-05,
"loss": 1.6613,
"step": 778
},
{
"epoch": 0.11,
"grad_norm": 0.18234394704028628,
"learning_rate": 7.973348952994583e-05,
"loss": 1.6574,
"step": 779
},
{
"epoch": 0.11,
"grad_norm": 0.177598112664479,
"learning_rate": 7.972031044076732e-05,
"loss": 1.6529,
"step": 780
},
{
"epoch": 0.11,
"grad_norm": 0.19095741463896163,
"learning_rate": 7.970713135158881e-05,
"loss": 1.654,
"step": 781
},
{
"epoch": 0.11,
"grad_norm": 0.23354657913440505,
"learning_rate": 7.969395226241031e-05,
"loss": 1.6568,
"step": 782
},
{
"epoch": 0.11,
"grad_norm": 0.23167577068773595,
"learning_rate": 7.968077317323181e-05,
"loss": 1.6554,
"step": 783
},
{
"epoch": 0.11,
"grad_norm": 0.21367008980339683,
"learning_rate": 7.966759408405331e-05,
"loss": 1.6495,
"step": 784
},
{
"epoch": 0.11,
"grad_norm": 0.2207178544949823,
"learning_rate": 7.96544149948748e-05,
"loss": 1.6548,
"step": 785
},
{
"epoch": 0.12,
"grad_norm": 0.21891851522878888,
"learning_rate": 7.964123590569631e-05,
"loss": 1.657,
"step": 786
},
{
"epoch": 0.12,
"grad_norm": 0.24695814979029723,
"learning_rate": 7.962805681651779e-05,
"loss": 1.6488,
"step": 787
},
{
"epoch": 0.12,
"grad_norm": 0.287912034486528,
"learning_rate": 7.961487772733928e-05,
"loss": 1.65,
"step": 788
},
{
"epoch": 0.12,
"grad_norm": 0.3399743120660956,
"learning_rate": 7.960169863816079e-05,
"loss": 1.6526,
"step": 789
},
{
"epoch": 0.12,
"grad_norm": 0.3616996652020229,
"learning_rate": 7.958851954898228e-05,
"loss": 1.6602,
"step": 790
},
{
"epoch": 0.12,
"grad_norm": 0.3063137729348896,
"learning_rate": 7.957534045980378e-05,
"loss": 1.6573,
"step": 791
},
{
"epoch": 0.12,
"grad_norm": 0.24333897069393048,
"learning_rate": 7.956216137062529e-05,
"loss": 1.6517,
"step": 792
},
{
"epoch": 0.12,
"grad_norm": 0.2536956392375361,
"learning_rate": 7.954898228144678e-05,
"loss": 1.6548,
"step": 793
},
{
"epoch": 0.12,
"grad_norm": 0.2992558453991358,
"learning_rate": 7.953580319226827e-05,
"loss": 1.6476,
"step": 794
},
{
"epoch": 0.12,
"grad_norm": 0.2919938016387346,
"learning_rate": 7.952262410308977e-05,
"loss": 1.658,
"step": 795
},
{
"epoch": 0.12,
"grad_norm": 0.22425446070873226,
"learning_rate": 7.950944501391126e-05,
"loss": 1.6561,
"step": 796
},
{
"epoch": 0.12,
"grad_norm": 0.17979423947699205,
"learning_rate": 7.949626592473275e-05,
"loss": 1.6511,
"step": 797
},
{
"epoch": 0.12,
"grad_norm": 0.2003220647484553,
"learning_rate": 7.948308683555426e-05,
"loss": 1.6584,
"step": 798
},
{
"epoch": 0.12,
"grad_norm": 0.21655839117724143,
"learning_rate": 7.946990774637576e-05,
"loss": 1.6413,
"step": 799
},
{
"epoch": 0.12,
"grad_norm": 0.19654884015176893,
"learning_rate": 7.945672865719725e-05,
"loss": 1.6478,
"step": 800
},
{
"epoch": 0.12,
"grad_norm": 0.20643622425199437,
"learning_rate": 7.944354956801876e-05,
"loss": 1.6487,
"step": 801
},
{
"epoch": 0.12,
"grad_norm": 0.2210426562771846,
"learning_rate": 7.943037047884025e-05,
"loss": 1.6417,
"step": 802
},
{
"epoch": 0.12,
"grad_norm": 0.2370399462767824,
"learning_rate": 7.941719138966174e-05,
"loss": 1.6526,
"step": 803
},
{
"epoch": 0.12,
"grad_norm": 0.27064884564726144,
"learning_rate": 7.940401230048324e-05,
"loss": 1.6452,
"step": 804
},
{
"epoch": 0.12,
"grad_norm": 0.24922057360329597,
"learning_rate": 7.939083321130473e-05,
"loss": 1.6476,
"step": 805
},
{
"epoch": 0.12,
"grad_norm": 0.22578777752481574,
"learning_rate": 7.937765412212622e-05,
"loss": 1.6512,
"step": 806
},
{
"epoch": 0.12,
"grad_norm": 0.2450981377837511,
"learning_rate": 7.936447503294773e-05,
"loss": 1.6447,
"step": 807
},
{
"epoch": 0.12,
"grad_norm": 0.274010641548682,
"learning_rate": 7.935129594376923e-05,
"loss": 1.6559,
"step": 808
},
{
"epoch": 0.12,
"grad_norm": 0.25880479404705736,
"learning_rate": 7.933811685459072e-05,
"loss": 1.6489,
"step": 809
},
{
"epoch": 0.12,
"grad_norm": 0.22048806582617783,
"learning_rate": 7.932493776541223e-05,
"loss": 1.6524,
"step": 810
},
{
"epoch": 0.12,
"grad_norm": 0.1887738283254872,
"learning_rate": 7.931175867623371e-05,
"loss": 1.64,
"step": 811
},
{
"epoch": 0.12,
"grad_norm": 0.18522517056579318,
"learning_rate": 7.92985795870552e-05,
"loss": 1.6494,
"step": 812
},
{
"epoch": 0.12,
"grad_norm": 0.22546381379315691,
"learning_rate": 7.928540049787671e-05,
"loss": 1.6464,
"step": 813
},
{
"epoch": 0.12,
"grad_norm": 0.2506628844421445,
"learning_rate": 7.92722214086982e-05,
"loss": 1.6542,
"step": 814
},
{
"epoch": 0.12,
"grad_norm": 0.23722684616063827,
"learning_rate": 7.92590423195197e-05,
"loss": 1.6462,
"step": 815
},
{
"epoch": 0.12,
"grad_norm": 0.21527026226593318,
"learning_rate": 7.92458632303412e-05,
"loss": 1.6429,
"step": 816
},
{
"epoch": 0.12,
"grad_norm": 0.20417560769017126,
"learning_rate": 7.92326841411627e-05,
"loss": 1.6463,
"step": 817
},
{
"epoch": 0.12,
"grad_norm": 0.25657787896551143,
"learning_rate": 7.921950505198419e-05,
"loss": 1.655,
"step": 818
},
{
"epoch": 0.12,
"grad_norm": 0.267753342657363,
"learning_rate": 7.920632596280568e-05,
"loss": 1.6519,
"step": 819
},
{
"epoch": 0.12,
"grad_norm": 0.24920909128098695,
"learning_rate": 7.919314687362718e-05,
"loss": 1.6473,
"step": 820
},
{
"epoch": 0.12,
"grad_norm": 0.2478367552587059,
"learning_rate": 7.917996778444867e-05,
"loss": 1.6426,
"step": 821
},
{
"epoch": 0.12,
"grad_norm": 0.2153066175677768,
"learning_rate": 7.916678869527018e-05,
"loss": 1.6491,
"step": 822
},
{
"epoch": 0.12,
"grad_norm": 0.2014632227972863,
"learning_rate": 7.915360960609167e-05,
"loss": 1.6505,
"step": 823
},
{
"epoch": 0.12,
"grad_norm": 0.2444739546695772,
"learning_rate": 7.914043051691317e-05,
"loss": 1.6481,
"step": 824
},
{
"epoch": 0.12,
"grad_norm": 0.24448366789416284,
"learning_rate": 7.912725142773467e-05,
"loss": 1.6536,
"step": 825
},
{
"epoch": 0.12,
"grad_norm": 0.18988494902818237,
"learning_rate": 7.911407233855617e-05,
"loss": 1.6527,
"step": 826
},
{
"epoch": 0.12,
"grad_norm": 0.1596982906421288,
"learning_rate": 7.910089324937766e-05,
"loss": 1.6434,
"step": 827
},
{
"epoch": 0.12,
"grad_norm": 0.18102812476693692,
"learning_rate": 7.908771416019915e-05,
"loss": 1.6518,
"step": 828
},
{
"epoch": 0.12,
"grad_norm": 0.2102523140397989,
"learning_rate": 7.907453507102065e-05,
"loss": 1.6498,
"step": 829
},
{
"epoch": 0.12,
"grad_norm": 0.23412540024706124,
"learning_rate": 7.906135598184214e-05,
"loss": 1.641,
"step": 830
},
{
"epoch": 0.12,
"grad_norm": 0.24005571074787757,
"learning_rate": 7.904817689266365e-05,
"loss": 1.6431,
"step": 831
},
{
"epoch": 0.12,
"grad_norm": 0.23238476130237243,
"learning_rate": 7.903499780348514e-05,
"loss": 1.6413,
"step": 832
},
{
"epoch": 0.12,
"grad_norm": 0.1970713398858913,
"learning_rate": 7.902181871430664e-05,
"loss": 1.6423,
"step": 833
},
{
"epoch": 0.12,
"grad_norm": 0.2242200272612161,
"learning_rate": 7.900863962512814e-05,
"loss": 1.6517,
"step": 834
},
{
"epoch": 0.12,
"grad_norm": 0.22807610187834748,
"learning_rate": 7.899546053594962e-05,
"loss": 1.6406,
"step": 835
},
{
"epoch": 0.12,
"grad_norm": 0.1972521968320005,
"learning_rate": 7.898228144677112e-05,
"loss": 1.6507,
"step": 836
},
{
"epoch": 0.12,
"grad_norm": 0.23394212314333337,
"learning_rate": 7.896910235759263e-05,
"loss": 1.6534,
"step": 837
},
{
"epoch": 0.12,
"grad_norm": 0.26703958427818103,
"learning_rate": 7.895592326841412e-05,
"loss": 1.6324,
"step": 838
},
{
"epoch": 0.12,
"grad_norm": 0.22223709057066937,
"learning_rate": 7.894274417923561e-05,
"loss": 1.6472,
"step": 839
},
{
"epoch": 0.12,
"grad_norm": 0.18015143412151977,
"learning_rate": 7.892956509005712e-05,
"loss": 1.6523,
"step": 840
},
{
"epoch": 0.12,
"grad_norm": 0.16776687777891955,
"learning_rate": 7.891638600087861e-05,
"loss": 1.6408,
"step": 841
},
{
"epoch": 0.12,
"grad_norm": 0.2110093032771735,
"learning_rate": 7.890320691170011e-05,
"loss": 1.6475,
"step": 842
},
{
"epoch": 0.12,
"grad_norm": 0.21006634515360098,
"learning_rate": 7.88900278225216e-05,
"loss": 1.6495,
"step": 843
},
{
"epoch": 0.12,
"grad_norm": 0.19405253109457415,
"learning_rate": 7.88768487333431e-05,
"loss": 1.6373,
"step": 844
},
{
"epoch": 0.12,
"grad_norm": 0.17598128896680296,
"learning_rate": 7.886366964416459e-05,
"loss": 1.6319,
"step": 845
},
{
"epoch": 0.12,
"grad_norm": 0.1649091128056297,
"learning_rate": 7.88504905549861e-05,
"loss": 1.6457,
"step": 846
},
{
"epoch": 0.12,
"grad_norm": 0.15971492084809902,
"learning_rate": 7.883731146580759e-05,
"loss": 1.6352,
"step": 847
},
{
"epoch": 0.12,
"grad_norm": 0.17712277192126102,
"learning_rate": 7.882413237662908e-05,
"loss": 1.6381,
"step": 848
},
{
"epoch": 0.12,
"grad_norm": 0.20906554492964108,
"learning_rate": 7.881095328745059e-05,
"loss": 1.6437,
"step": 849
},
{
"epoch": 0.12,
"grad_norm": 0.2142843140002136,
"learning_rate": 7.879777419827209e-05,
"loss": 1.6434,
"step": 850
},
{
"epoch": 0.12,
"grad_norm": 0.21006215293899588,
"learning_rate": 7.878459510909357e-05,
"loss": 1.6427,
"step": 851
},
{
"epoch": 0.12,
"grad_norm": 0.23775701204551145,
"learning_rate": 7.877141601991507e-05,
"loss": 1.6455,
"step": 852
},
{
"epoch": 0.12,
"grad_norm": 0.2707019175159962,
"learning_rate": 7.875823693073657e-05,
"loss": 1.6456,
"step": 853
},
{
"epoch": 0.13,
"grad_norm": 0.3127403618384618,
"learning_rate": 7.874505784155806e-05,
"loss": 1.6385,
"step": 854
},
{
"epoch": 0.13,
"grad_norm": 0.36233560236813683,
"learning_rate": 7.873187875237957e-05,
"loss": 1.6392,
"step": 855
},
{
"epoch": 0.13,
"grad_norm": 0.36917547935418177,
"learning_rate": 7.871869966320106e-05,
"loss": 1.643,
"step": 856
},
{
"epoch": 0.13,
"grad_norm": 0.2799689100614571,
"learning_rate": 7.870552057402255e-05,
"loss": 1.6364,
"step": 857
},
{
"epoch": 0.13,
"grad_norm": 0.1909288497464617,
"learning_rate": 7.869234148484406e-05,
"loss": 1.6442,
"step": 858
},
{
"epoch": 0.13,
"grad_norm": 0.22049646685021101,
"learning_rate": 7.867916239566554e-05,
"loss": 1.6381,
"step": 859
},
{
"epoch": 0.13,
"grad_norm": 0.2792850134812065,
"learning_rate": 7.866598330648704e-05,
"loss": 1.6439,
"step": 860
},
{
"epoch": 0.13,
"grad_norm": 0.2767586748336834,
"learning_rate": 7.865280421730854e-05,
"loss": 1.6406,
"step": 861
},
{
"epoch": 0.13,
"grad_norm": 0.1964234052122111,
"learning_rate": 7.863962512813004e-05,
"loss": 1.6403,
"step": 862
},
{
"epoch": 0.13,
"grad_norm": 0.21916883082393832,
"learning_rate": 7.862644603895153e-05,
"loss": 1.6433,
"step": 863
},
{
"epoch": 0.13,
"grad_norm": 0.2664436290112629,
"learning_rate": 7.861326694977304e-05,
"loss": 1.6422,
"step": 864
},
{
"epoch": 0.13,
"grad_norm": 0.21301889349155526,
"learning_rate": 7.860008786059453e-05,
"loss": 1.6416,
"step": 865
},
{
"epoch": 0.13,
"grad_norm": 0.17884914022978088,
"learning_rate": 7.858690877141603e-05,
"loss": 1.6312,
"step": 866
},
{
"epoch": 0.13,
"grad_norm": 0.18843102073542442,
"learning_rate": 7.857372968223752e-05,
"loss": 1.6416,
"step": 867
},
{
"epoch": 0.13,
"grad_norm": 0.18688924194495246,
"learning_rate": 7.856055059305901e-05,
"loss": 1.6441,
"step": 868
},
{
"epoch": 0.13,
"grad_norm": 0.1650145694966954,
"learning_rate": 7.854737150388051e-05,
"loss": 1.6454,
"step": 869
},
{
"epoch": 0.13,
"grad_norm": 0.1582803846188149,
"learning_rate": 7.853419241470201e-05,
"loss": 1.6359,
"step": 870
},
{
"epoch": 0.13,
"grad_norm": 0.1914936260863352,
"learning_rate": 7.852101332552351e-05,
"loss": 1.6346,
"step": 871
},
{
"epoch": 0.13,
"grad_norm": 0.2272665695903623,
"learning_rate": 7.8507834236345e-05,
"loss": 1.6424,
"step": 872
},
{
"epoch": 0.13,
"grad_norm": 0.2918408917481599,
"learning_rate": 7.849465514716651e-05,
"loss": 1.6362,
"step": 873
},
{
"epoch": 0.13,
"grad_norm": 0.33124958511818076,
"learning_rate": 7.8481476057988e-05,
"loss": 1.6457,
"step": 874
},
{
"epoch": 0.13,
"grad_norm": 0.28392389646236854,
"learning_rate": 7.846829696880948e-05,
"loss": 1.636,
"step": 875
},
{
"epoch": 0.13,
"grad_norm": 0.21187264881262416,
"learning_rate": 7.845511787963099e-05,
"loss": 1.6391,
"step": 876
},
{
"epoch": 0.13,
"grad_norm": 0.19512382269665135,
"learning_rate": 7.844193879045248e-05,
"loss": 1.6375,
"step": 877
},
{
"epoch": 0.13,
"grad_norm": 0.2315133095228095,
"learning_rate": 7.842875970127398e-05,
"loss": 1.6434,
"step": 878
},
{
"epoch": 0.13,
"grad_norm": 0.22455542676851648,
"learning_rate": 7.841558061209548e-05,
"loss": 1.6375,
"step": 879
},
{
"epoch": 0.13,
"grad_norm": 0.18274445518703517,
"learning_rate": 7.840240152291698e-05,
"loss": 1.6403,
"step": 880
},
{
"epoch": 0.13,
"grad_norm": 0.1680652375376494,
"learning_rate": 7.838922243373847e-05,
"loss": 1.6253,
"step": 881
},
{
"epoch": 0.13,
"grad_norm": 0.2029462350044378,
"learning_rate": 7.837604334455998e-05,
"loss": 1.6335,
"step": 882
},
{
"epoch": 0.13,
"grad_norm": 0.2385785209220654,
"learning_rate": 7.836286425538146e-05,
"loss": 1.636,
"step": 883
},
{
"epoch": 0.13,
"grad_norm": 0.22966225461134943,
"learning_rate": 7.834968516620295e-05,
"loss": 1.6351,
"step": 884
},
{
"epoch": 0.13,
"grad_norm": 0.20620730487272254,
"learning_rate": 7.833650607702446e-05,
"loss": 1.64,
"step": 885
},
{
"epoch": 0.13,
"grad_norm": 0.20815921246674304,
"learning_rate": 7.832332698784595e-05,
"loss": 1.642,
"step": 886
},
{
"epoch": 0.13,
"grad_norm": 0.21329180245890958,
"learning_rate": 7.831014789866745e-05,
"loss": 1.6289,
"step": 887
},
{
"epoch": 0.13,
"grad_norm": 0.20448951459967843,
"learning_rate": 7.829696880948896e-05,
"loss": 1.6327,
"step": 888
},
{
"epoch": 0.13,
"grad_norm": 0.19419304903131157,
"learning_rate": 7.828378972031045e-05,
"loss": 1.6322,
"step": 889
},
{
"epoch": 0.13,
"grad_norm": 0.18030150429004935,
"learning_rate": 7.827061063113194e-05,
"loss": 1.6331,
"step": 890
},
{
"epoch": 0.13,
"grad_norm": 0.1898733739259939,
"learning_rate": 7.825743154195344e-05,
"loss": 1.6342,
"step": 891
},
{
"epoch": 0.13,
"grad_norm": 0.21064065788890746,
"learning_rate": 7.824425245277493e-05,
"loss": 1.6407,
"step": 892
},
{
"epoch": 0.13,
"grad_norm": 0.20679616302698337,
"learning_rate": 7.823107336359642e-05,
"loss": 1.6328,
"step": 893
},
{
"epoch": 0.13,
"grad_norm": 0.21400981491846863,
"learning_rate": 7.821789427441793e-05,
"loss": 1.6426,
"step": 894
},
{
"epoch": 0.13,
"grad_norm": 0.20061186083901789,
"learning_rate": 7.820471518523943e-05,
"loss": 1.6359,
"step": 895
},
{
"epoch": 0.13,
"grad_norm": 0.18863815744505064,
"learning_rate": 7.819153609606092e-05,
"loss": 1.6307,
"step": 896
},
{
"epoch": 0.13,
"grad_norm": 0.19996719699171972,
"learning_rate": 7.817835700688243e-05,
"loss": 1.6299,
"step": 897
},
{
"epoch": 0.13,
"grad_norm": 0.2074855070501711,
"learning_rate": 7.816517791770392e-05,
"loss": 1.6431,
"step": 898
},
{
"epoch": 0.13,
"grad_norm": 0.18908061531345277,
"learning_rate": 7.81519988285254e-05,
"loss": 1.6359,
"step": 899
},
{
"epoch": 0.13,
"grad_norm": 0.1621341838335611,
"learning_rate": 7.813881973934691e-05,
"loss": 1.6334,
"step": 900
},
{
"epoch": 0.13,
"eval_loss": 1.5936576128005981,
"eval_runtime": 447.3124,
"eval_samples_per_second": 40.274,
"eval_steps_per_second": 0.06,
"step": 900
}
],
"logging_steps": 1,
"max_steps": 6829,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 900,
"total_flos": 2.133855112790016e+16,
"train_batch_size": 42,
"trial_name": null,
"trial_params": null
}