{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 492,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006097560975609756,
"grad_norm": 1.763520359992981,
"learning_rate": 0.0001,
"loss": 1.1695,
"step": 1
},
{
"epoch": 0.012195121951219513,
"grad_norm": 1.4942116737365723,
"learning_rate": 0.0001,
"loss": 1.0553,
"step": 2
},
{
"epoch": 0.018292682926829267,
"grad_norm": 1.345508337020874,
"learning_rate": 0.0001,
"loss": 1.0218,
"step": 3
},
{
"epoch": 0.024390243902439025,
"grad_norm": 1.123505711555481,
"learning_rate": 0.0001,
"loss": 0.9164,
"step": 4
},
{
"epoch": 0.03048780487804878,
"grad_norm": 0.823020875453949,
"learning_rate": 0.0001,
"loss": 0.9038,
"step": 5
},
{
"epoch": 0.036585365853658534,
"grad_norm": 0.763806939125061,
"learning_rate": 0.0001,
"loss": 0.9062,
"step": 6
},
{
"epoch": 0.042682926829268296,
"grad_norm": 0.7193121314048767,
"learning_rate": 0.0001,
"loss": 0.8466,
"step": 7
},
{
"epoch": 0.04878048780487805,
"grad_norm": 0.7080236077308655,
"learning_rate": 0.0001,
"loss": 0.82,
"step": 8
},
{
"epoch": 0.054878048780487805,
"grad_norm": 0.6981502175331116,
"learning_rate": 0.0001,
"loss": 0.8546,
"step": 9
},
{
"epoch": 0.06097560975609756,
"grad_norm": 0.7174396514892578,
"learning_rate": 0.0001,
"loss": 0.8438,
"step": 10
},
{
"epoch": 0.06707317073170732,
"grad_norm": 0.6729642152786255,
"learning_rate": 0.0001,
"loss": 0.8285,
"step": 11
},
{
"epoch": 0.07317073170731707,
"grad_norm": 0.6757375001907349,
"learning_rate": 0.0001,
"loss": 0.8068,
"step": 12
},
{
"epoch": 0.07926829268292683,
"grad_norm": 0.6743811368942261,
"learning_rate": 0.0001,
"loss": 0.7853,
"step": 13
},
{
"epoch": 0.08536585365853659,
"grad_norm": 0.6855434775352478,
"learning_rate": 0.0001,
"loss": 0.7305,
"step": 14
},
{
"epoch": 0.09146341463414634,
"grad_norm": 0.7576789259910583,
"learning_rate": 0.0001,
"loss": 0.7894,
"step": 15
},
{
"epoch": 0.0975609756097561,
"grad_norm": 0.6285218000411987,
"learning_rate": 0.0001,
"loss": 0.7821,
"step": 16
},
{
"epoch": 0.10365853658536585,
"grad_norm": 0.6224460005760193,
"learning_rate": 0.0001,
"loss": 0.806,
"step": 17
},
{
"epoch": 0.10975609756097561,
"grad_norm": 0.6526975631713867,
"learning_rate": 0.0001,
"loss": 0.7725,
"step": 18
},
{
"epoch": 0.11585365853658537,
"grad_norm": 0.7162805795669556,
"learning_rate": 0.0001,
"loss": 0.7637,
"step": 19
},
{
"epoch": 0.12195121951219512,
"grad_norm": 0.6594821214675903,
"learning_rate": 0.0001,
"loss": 0.7953,
"step": 20
},
{
"epoch": 0.12804878048780488,
"grad_norm": 0.6285718679428101,
"learning_rate": 0.0001,
"loss": 0.7673,
"step": 21
},
{
"epoch": 0.13414634146341464,
"grad_norm": 0.6275126338005066,
"learning_rate": 0.0001,
"loss": 0.7639,
"step": 22
},
{
"epoch": 0.1402439024390244,
"grad_norm": 0.6683803200721741,
"learning_rate": 0.0001,
"loss": 0.7598,
"step": 23
},
{
"epoch": 0.14634146341463414,
"grad_norm": 0.6154472231864929,
"learning_rate": 0.0001,
"loss": 0.7243,
"step": 24
},
{
"epoch": 0.1524390243902439,
"grad_norm": 0.6709151864051819,
"learning_rate": 0.0001,
"loss": 0.7869,
"step": 25
},
{
"epoch": 0.15853658536585366,
"grad_norm": 0.6176601648330688,
"learning_rate": 0.0001,
"loss": 0.7181,
"step": 26
},
{
"epoch": 0.16463414634146342,
"grad_norm": 0.5965794324874878,
"learning_rate": 0.0001,
"loss": 0.6955,
"step": 27
},
{
"epoch": 0.17073170731707318,
"grad_norm": 0.5950392484664917,
"learning_rate": 0.0001,
"loss": 0.6866,
"step": 28
},
{
"epoch": 0.17682926829268292,
"grad_norm": 0.5902345180511475,
"learning_rate": 0.0001,
"loss": 0.7488,
"step": 29
},
{
"epoch": 0.18292682926829268,
"grad_norm": 0.5966442227363586,
"learning_rate": 0.0001,
"loss": 0.766,
"step": 30
},
{
"epoch": 0.18902439024390244,
"grad_norm": 0.6065996289253235,
"learning_rate": 0.0001,
"loss": 0.7602,
"step": 31
},
{
"epoch": 0.1951219512195122,
"grad_norm": 0.6001562476158142,
"learning_rate": 0.0001,
"loss": 0.7307,
"step": 32
},
{
"epoch": 0.20121951219512196,
"grad_norm": 0.5457689166069031,
"learning_rate": 0.0001,
"loss": 0.7117,
"step": 33
},
{
"epoch": 0.2073170731707317,
"grad_norm": 0.5943721532821655,
"learning_rate": 0.0001,
"loss": 0.7419,
"step": 34
},
{
"epoch": 0.21341463414634146,
"grad_norm": 0.5822892785072327,
"learning_rate": 0.0001,
"loss": 0.7199,
"step": 35
},
{
"epoch": 0.21951219512195122,
"grad_norm": 0.5900689959526062,
"learning_rate": 0.0001,
"loss": 0.7001,
"step": 36
},
{
"epoch": 0.22560975609756098,
"grad_norm": 0.6492246389389038,
"learning_rate": 0.0001,
"loss": 0.7024,
"step": 37
},
{
"epoch": 0.23170731707317074,
"grad_norm": 0.5830572247505188,
"learning_rate": 0.0001,
"loss": 0.7311,
"step": 38
},
{
"epoch": 0.23780487804878048,
"grad_norm": 0.6123290061950684,
"learning_rate": 0.0001,
"loss": 0.7593,
"step": 39
},
{
"epoch": 0.24390243902439024,
"grad_norm": 0.6116678714752197,
"learning_rate": 0.0001,
"loss": 0.7079,
"step": 40
},
{
"epoch": 0.25,
"grad_norm": 0.6134564876556396,
"learning_rate": 0.0001,
"loss": 0.7426,
"step": 41
},
{
"epoch": 0.25609756097560976,
"grad_norm": 0.5981906652450562,
"learning_rate": 0.0001,
"loss": 0.7207,
"step": 42
},
{
"epoch": 0.2621951219512195,
"grad_norm": 0.6197260022163391,
"learning_rate": 0.0001,
"loss": 0.743,
"step": 43
},
{
"epoch": 0.2682926829268293,
"grad_norm": 0.5889937877655029,
"learning_rate": 0.0001,
"loss": 0.7111,
"step": 44
},
{
"epoch": 0.27439024390243905,
"grad_norm": 0.5781967639923096,
"learning_rate": 0.0001,
"loss": 0.7088,
"step": 45
},
{
"epoch": 0.2804878048780488,
"grad_norm": 0.5735342502593994,
"learning_rate": 0.0001,
"loss": 0.7264,
"step": 46
},
{
"epoch": 0.2865853658536585,
"grad_norm": 0.6068210005760193,
"learning_rate": 0.0001,
"loss": 0.7217,
"step": 47
},
{
"epoch": 0.2926829268292683,
"grad_norm": 0.584036648273468,
"learning_rate": 0.0001,
"loss": 0.6998,
"step": 48
},
{
"epoch": 0.29878048780487804,
"grad_norm": 0.5738788843154907,
"learning_rate": 0.0001,
"loss": 0.6662,
"step": 49
},
{
"epoch": 0.3048780487804878,
"grad_norm": 0.5746581554412842,
"learning_rate": 0.0001,
"loss": 0.6702,
"step": 50
},
{
"epoch": 0.31097560975609756,
"grad_norm": 0.5572565793991089,
"learning_rate": 0.0001,
"loss": 0.6766,
"step": 51
},
{
"epoch": 0.3170731707317073,
"grad_norm": 0.6274172067642212,
"learning_rate": 0.0001,
"loss": 0.7509,
"step": 52
},
{
"epoch": 0.3231707317073171,
"grad_norm": 0.5426685810089111,
"learning_rate": 0.0001,
"loss": 0.7065,
"step": 53
},
{
"epoch": 0.32926829268292684,
"grad_norm": 0.5456064343452454,
"learning_rate": 0.0001,
"loss": 0.6069,
"step": 54
},
{
"epoch": 0.3353658536585366,
"grad_norm": 0.5641257762908936,
"learning_rate": 0.0001,
"loss": 0.6862,
"step": 55
},
{
"epoch": 0.34146341463414637,
"grad_norm": 0.5878000259399414,
"learning_rate": 0.0001,
"loss": 0.6827,
"step": 56
},
{
"epoch": 0.3475609756097561,
"grad_norm": 0.5976933240890503,
"learning_rate": 0.0001,
"loss": 0.6838,
"step": 57
},
{
"epoch": 0.35365853658536583,
"grad_norm": 0.5872485041618347,
"learning_rate": 0.0001,
"loss": 0.7017,
"step": 58
},
{
"epoch": 0.3597560975609756,
"grad_norm": 0.5930238366127014,
"learning_rate": 0.0001,
"loss": 0.6735,
"step": 59
},
{
"epoch": 0.36585365853658536,
"grad_norm": 0.5682117342948914,
"learning_rate": 0.0001,
"loss": 0.6686,
"step": 60
},
{
"epoch": 0.3719512195121951,
"grad_norm": 0.5660499334335327,
"learning_rate": 0.0001,
"loss": 0.6579,
"step": 61
},
{
"epoch": 0.3780487804878049,
"grad_norm": 0.5715780854225159,
"learning_rate": 0.0001,
"loss": 0.7127,
"step": 62
},
{
"epoch": 0.38414634146341464,
"grad_norm": 0.5816344022750854,
"learning_rate": 0.0001,
"loss": 0.7158,
"step": 63
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.5610223412513733,
"learning_rate": 0.0001,
"loss": 0.6527,
"step": 64
},
{
"epoch": 0.39634146341463417,
"grad_norm": 0.5489451885223389,
"learning_rate": 0.0001,
"loss": 0.6902,
"step": 65
},
{
"epoch": 0.4024390243902439,
"grad_norm": 0.5633963942527771,
"learning_rate": 0.0001,
"loss": 0.729,
"step": 66
},
{
"epoch": 0.40853658536585363,
"grad_norm": 0.5687914490699768,
"learning_rate": 0.0001,
"loss": 0.6558,
"step": 67
},
{
"epoch": 0.4146341463414634,
"grad_norm": 0.5886531472206116,
"learning_rate": 0.0001,
"loss": 0.7309,
"step": 68
},
{
"epoch": 0.42073170731707316,
"grad_norm": 0.546073853969574,
"learning_rate": 0.0001,
"loss": 0.6625,
"step": 69
},
{
"epoch": 0.4268292682926829,
"grad_norm": 0.5979751348495483,
"learning_rate": 0.0001,
"loss": 0.6942,
"step": 70
},
{
"epoch": 0.4329268292682927,
"grad_norm": 0.5324491858482361,
"learning_rate": 0.0001,
"loss": 0.6274,
"step": 71
},
{
"epoch": 0.43902439024390244,
"grad_norm": 0.6174746751785278,
"learning_rate": 0.0001,
"loss": 0.7073,
"step": 72
},
{
"epoch": 0.4451219512195122,
"grad_norm": 0.5845648646354675,
"learning_rate": 0.0001,
"loss": 0.6871,
"step": 73
},
{
"epoch": 0.45121951219512196,
"grad_norm": 0.5905411839485168,
"learning_rate": 0.0001,
"loss": 0.6846,
"step": 74
},
{
"epoch": 0.4573170731707317,
"grad_norm": 0.5970960855484009,
"learning_rate": 0.0001,
"loss": 0.6588,
"step": 75
},
{
"epoch": 0.4634146341463415,
"grad_norm": 0.5933733582496643,
"learning_rate": 0.0001,
"loss": 0.678,
"step": 76
},
{
"epoch": 0.4695121951219512,
"grad_norm": 0.5747849941253662,
"learning_rate": 0.0001,
"loss": 0.683,
"step": 77
},
{
"epoch": 0.47560975609756095,
"grad_norm": 0.5409815311431885,
"learning_rate": 0.0001,
"loss": 0.6287,
"step": 78
},
{
"epoch": 0.4817073170731707,
"grad_norm": 0.6004408001899719,
"learning_rate": 0.0001,
"loss": 0.6356,
"step": 79
},
{
"epoch": 0.4878048780487805,
"grad_norm": 0.5724059343338013,
"learning_rate": 0.0001,
"loss": 0.6388,
"step": 80
},
{
"epoch": 0.49390243902439024,
"grad_norm": 0.6099798083305359,
"learning_rate": 0.0001,
"loss": 0.7115,
"step": 81
},
{
"epoch": 0.5,
"grad_norm": 0.5958842039108276,
"learning_rate": 0.0001,
"loss": 0.655,
"step": 82
},
{
"epoch": 0.5060975609756098,
"grad_norm": 0.6181111335754395,
"learning_rate": 0.0001,
"loss": 0.6391,
"step": 83
},
{
"epoch": 0.5121951219512195,
"grad_norm": 0.5894577503204346,
"learning_rate": 0.0001,
"loss": 0.6791,
"step": 84
},
{
"epoch": 0.5182926829268293,
"grad_norm": 0.5830883979797363,
"learning_rate": 0.0001,
"loss": 0.6582,
"step": 85
},
{
"epoch": 0.524390243902439,
"grad_norm": 0.5686275362968445,
"learning_rate": 0.0001,
"loss": 0.678,
"step": 86
},
{
"epoch": 0.5304878048780488,
"grad_norm": 0.6119154095649719,
"learning_rate": 0.0001,
"loss": 0.6714,
"step": 87
},
{
"epoch": 0.5365853658536586,
"grad_norm": 0.5826413035392761,
"learning_rate": 0.0001,
"loss": 0.6746,
"step": 88
},
{
"epoch": 0.5426829268292683,
"grad_norm": 0.6128208041191101,
"learning_rate": 0.0001,
"loss": 0.6851,
"step": 89
},
{
"epoch": 0.5487804878048781,
"grad_norm": 0.575299859046936,
"learning_rate": 0.0001,
"loss": 0.6439,
"step": 90
},
{
"epoch": 0.5548780487804879,
"grad_norm": 0.6011075377464294,
"learning_rate": 0.0001,
"loss": 0.689,
"step": 91
},
{
"epoch": 0.5609756097560976,
"grad_norm": 0.5696834325790405,
"learning_rate": 0.0001,
"loss": 0.688,
"step": 92
},
{
"epoch": 0.5670731707317073,
"grad_norm": 0.5776868462562561,
"learning_rate": 0.0001,
"loss": 0.6237,
"step": 93
},
{
"epoch": 0.573170731707317,
"grad_norm": 0.5697721242904663,
"learning_rate": 0.0001,
"loss": 0.6551,
"step": 94
},
{
"epoch": 0.5792682926829268,
"grad_norm": 0.5542324185371399,
"learning_rate": 0.0001,
"loss": 0.615,
"step": 95
},
{
"epoch": 0.5853658536585366,
"grad_norm": 0.5746421217918396,
"learning_rate": 0.0001,
"loss": 0.6814,
"step": 96
},
{
"epoch": 0.5914634146341463,
"grad_norm": 0.5714977383613586,
"learning_rate": 0.0001,
"loss": 0.6698,
"step": 97
},
{
"epoch": 0.5975609756097561,
"grad_norm": 0.5868296027183533,
"learning_rate": 0.0001,
"loss": 0.6482,
"step": 98
},
{
"epoch": 0.6036585365853658,
"grad_norm": 0.5577363967895508,
"learning_rate": 0.0001,
"loss": 0.663,
"step": 99
},
{
"epoch": 0.6097560975609756,
"grad_norm": 0.51622474193573,
"learning_rate": 0.0001,
"loss": 0.5813,
"step": 100
},
{
"epoch": 0.6158536585365854,
"grad_norm": 0.5596529245376587,
"learning_rate": 0.0001,
"loss": 0.6157,
"step": 101
},
{
"epoch": 0.6219512195121951,
"grad_norm": 0.585007905960083,
"learning_rate": 0.0001,
"loss": 0.6734,
"step": 102
},
{
"epoch": 0.6280487804878049,
"grad_norm": 0.5682265758514404,
"learning_rate": 0.0001,
"loss": 0.6231,
"step": 103
},
{
"epoch": 0.6341463414634146,
"grad_norm": 0.6157271265983582,
"learning_rate": 0.0001,
"loss": 0.6679,
"step": 104
},
{
"epoch": 0.6402439024390244,
"grad_norm": 0.5796582698822021,
"learning_rate": 0.0001,
"loss": 0.6091,
"step": 105
},
{
"epoch": 0.6463414634146342,
"grad_norm": 0.5919722318649292,
"learning_rate": 0.0001,
"loss": 0.6744,
"step": 106
},
{
"epoch": 0.6524390243902439,
"grad_norm": 0.5803415775299072,
"learning_rate": 0.0001,
"loss": 0.6316,
"step": 107
},
{
"epoch": 0.6585365853658537,
"grad_norm": 0.5573592782020569,
"learning_rate": 0.0001,
"loss": 0.6028,
"step": 108
},
{
"epoch": 0.6646341463414634,
"grad_norm": 0.5864866375923157,
"learning_rate": 0.0001,
"loss": 0.6442,
"step": 109
},
{
"epoch": 0.6707317073170732,
"grad_norm": 0.5456053018569946,
"learning_rate": 0.0001,
"loss": 0.6233,
"step": 110
},
{
"epoch": 0.676829268292683,
"grad_norm": 0.575710654258728,
"learning_rate": 0.0001,
"loss": 0.6303,
"step": 111
},
{
"epoch": 0.6829268292682927,
"grad_norm": 0.6122698783874512,
"learning_rate": 0.0001,
"loss": 0.6676,
"step": 112
},
{
"epoch": 0.6890243902439024,
"grad_norm": 0.5976404547691345,
"learning_rate": 0.0001,
"loss": 0.6533,
"step": 113
},
{
"epoch": 0.6951219512195121,
"grad_norm": 0.6462607979774475,
"learning_rate": 0.0001,
"loss": 0.7024,
"step": 114
},
{
"epoch": 0.7012195121951219,
"grad_norm": 0.5650457143783569,
"learning_rate": 0.0001,
"loss": 0.6667,
"step": 115
},
{
"epoch": 0.7073170731707317,
"grad_norm": 0.5858912467956543,
"learning_rate": 0.0001,
"loss": 0.6492,
"step": 116
},
{
"epoch": 0.7134146341463414,
"grad_norm": 0.5636318325996399,
"learning_rate": 0.0001,
"loss": 0.6112,
"step": 117
},
{
"epoch": 0.7195121951219512,
"grad_norm": 0.5599079728126526,
"learning_rate": 0.0001,
"loss": 0.6817,
"step": 118
},
{
"epoch": 0.725609756097561,
"grad_norm": 0.551928699016571,
"learning_rate": 0.0001,
"loss": 0.6534,
"step": 119
},
{
"epoch": 0.7317073170731707,
"grad_norm": 0.5585001707077026,
"learning_rate": 0.0001,
"loss": 0.6517,
"step": 120
},
{
"epoch": 0.7378048780487805,
"grad_norm": 0.5939499139785767,
"learning_rate": 0.0001,
"loss": 0.637,
"step": 121
},
{
"epoch": 0.7439024390243902,
"grad_norm": 0.6028351187705994,
"learning_rate": 0.0001,
"loss": 0.6497,
"step": 122
},
{
"epoch": 0.75,
"grad_norm": 0.6053422689437866,
"learning_rate": 0.0001,
"loss": 0.6606,
"step": 123
},
{
"epoch": 0.7560975609756098,
"grad_norm": 0.5626771450042725,
"learning_rate": 0.0001,
"loss": 0.6475,
"step": 124
},
{
"epoch": 0.7621951219512195,
"grad_norm": 0.5561665892601013,
"learning_rate": 0.0001,
"loss": 0.6126,
"step": 125
},
{
"epoch": 0.7682926829268293,
"grad_norm": 0.5361859202384949,
"learning_rate": 0.0001,
"loss": 0.6737,
"step": 126
},
{
"epoch": 0.774390243902439,
"grad_norm": 0.5999827980995178,
"learning_rate": 0.0001,
"loss": 0.627,
"step": 127
},
{
"epoch": 0.7804878048780488,
"grad_norm": 0.5717467665672302,
"learning_rate": 0.0001,
"loss": 0.7242,
"step": 128
},
{
"epoch": 0.7865853658536586,
"grad_norm": 0.5655209422111511,
"learning_rate": 0.0001,
"loss": 0.6072,
"step": 129
},
{
"epoch": 0.7926829268292683,
"grad_norm": 0.5843133926391602,
"learning_rate": 0.0001,
"loss": 0.6727,
"step": 130
},
{
"epoch": 0.7987804878048781,
"grad_norm": 0.5787593722343445,
"learning_rate": 0.0001,
"loss": 0.6394,
"step": 131
},
{
"epoch": 0.8048780487804879,
"grad_norm": 0.5661312341690063,
"learning_rate": 0.0001,
"loss": 0.6122,
"step": 132
},
{
"epoch": 0.8109756097560976,
"grad_norm": 0.602393388748169,
"learning_rate": 0.0001,
"loss": 0.6193,
"step": 133
},
{
"epoch": 0.8170731707317073,
"grad_norm": 0.630905032157898,
"learning_rate": 0.0001,
"loss": 0.6427,
"step": 134
},
{
"epoch": 0.823170731707317,
"grad_norm": 0.6203592419624329,
"learning_rate": 0.0001,
"loss": 0.6491,
"step": 135
},
{
"epoch": 0.8292682926829268,
"grad_norm": 0.5753608345985413,
"learning_rate": 0.0001,
"loss": 0.6295,
"step": 136
},
{
"epoch": 0.8353658536585366,
"grad_norm": 0.5919385552406311,
"learning_rate": 0.0001,
"loss": 0.6262,
"step": 137
},
{
"epoch": 0.8414634146341463,
"grad_norm": 0.564659833908081,
"learning_rate": 0.0001,
"loss": 0.6437,
"step": 138
},
{
"epoch": 0.8475609756097561,
"grad_norm": 0.5595895648002625,
"learning_rate": 0.0001,
"loss": 0.628,
"step": 139
},
{
"epoch": 0.8536585365853658,
"grad_norm": 0.5651856064796448,
"learning_rate": 0.0001,
"loss": 0.622,
"step": 140
},
{
"epoch": 0.8597560975609756,
"grad_norm": 0.5735089778900146,
"learning_rate": 0.0001,
"loss": 0.6313,
"step": 141
},
{
"epoch": 0.8658536585365854,
"grad_norm": 0.6084374189376831,
"learning_rate": 0.0001,
"loss": 0.6528,
"step": 142
},
{
"epoch": 0.8719512195121951,
"grad_norm": 0.5673129558563232,
"learning_rate": 0.0001,
"loss": 0.6163,
"step": 143
},
{
"epoch": 0.8780487804878049,
"grad_norm": 0.5617730021476746,
"learning_rate": 0.0001,
"loss": 0.6397,
"step": 144
},
{
"epoch": 0.8841463414634146,
"grad_norm": 0.5928285121917725,
"learning_rate": 0.0001,
"loss": 0.64,
"step": 145
},
{
"epoch": 0.8902439024390244,
"grad_norm": 0.5878246426582336,
"learning_rate": 0.0001,
"loss": 0.6691,
"step": 146
},
{
"epoch": 0.8963414634146342,
"grad_norm": 0.5934311747550964,
"learning_rate": 0.0001,
"loss": 0.6325,
"step": 147
},
{
"epoch": 0.9024390243902439,
"grad_norm": 0.5465561151504517,
"learning_rate": 0.0001,
"loss": 0.663,
"step": 148
},
{
"epoch": 0.9085365853658537,
"grad_norm": 0.5870200991630554,
"learning_rate": 0.0001,
"loss": 0.6104,
"step": 149
},
{
"epoch": 0.9146341463414634,
"grad_norm": 0.6161399483680725,
"learning_rate": 0.0001,
"loss": 0.6553,
"step": 150
},
{
"epoch": 0.9207317073170732,
"grad_norm": 0.5733305811882019,
"learning_rate": 0.0001,
"loss": 0.6167,
"step": 151
},
{
"epoch": 0.926829268292683,
"grad_norm": 0.595331072807312,
"learning_rate": 0.0001,
"loss": 0.6594,
"step": 152
},
{
"epoch": 0.9329268292682927,
"grad_norm": 0.5634722709655762,
"learning_rate": 0.0001,
"loss": 0.6435,
"step": 153
},
{
"epoch": 0.9390243902439024,
"grad_norm": 0.5649352073669434,
"learning_rate": 0.0001,
"loss": 0.6338,
"step": 154
},
{
"epoch": 0.9451219512195121,
"grad_norm": 0.5804089903831482,
"learning_rate": 0.0001,
"loss": 0.6151,
"step": 155
},
{
"epoch": 0.9512195121951219,
"grad_norm": 0.5910571217536926,
"learning_rate": 0.0001,
"loss": 0.6083,
"step": 156
},
{
"epoch": 0.9573170731707317,
"grad_norm": 0.6512947082519531,
"learning_rate": 0.0001,
"loss": 0.652,
"step": 157
},
{
"epoch": 0.9634146341463414,
"grad_norm": 0.6277866363525391,
"learning_rate": 0.0001,
"loss": 0.6363,
"step": 158
},
{
"epoch": 0.9695121951219512,
"grad_norm": 0.5870842933654785,
"learning_rate": 0.0001,
"loss": 0.6417,
"step": 159
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.546256422996521,
"learning_rate": 0.0001,
"loss": 0.5957,
"step": 160
},
{
"epoch": 0.9817073170731707,
"grad_norm": 0.5940456390380859,
"learning_rate": 0.0001,
"loss": 0.5774,
"step": 161
},
{
"epoch": 0.9878048780487805,
"grad_norm": 0.5390895009040833,
"learning_rate": 0.0001,
"loss": 0.6131,
"step": 162
},
{
"epoch": 0.9939024390243902,
"grad_norm": 0.5646426677703857,
"learning_rate": 0.0001,
"loss": 0.6247,
"step": 163
},
{
"epoch": 1.0,
"grad_norm": 0.5933319330215454,
"learning_rate": 0.0001,
"loss": 0.6107,
"step": 164
},
{
"epoch": 1.0060975609756098,
"grad_norm": 0.5555415749549866,
"learning_rate": 0.0001,
"loss": 0.5038,
"step": 165
},
{
"epoch": 1.0121951219512195,
"grad_norm": 0.5714491605758667,
"learning_rate": 0.0001,
"loss": 0.5403,
"step": 166
},
{
"epoch": 1.0182926829268293,
"grad_norm": 0.6099926829338074,
"learning_rate": 0.0001,
"loss": 0.4943,
"step": 167
},
{
"epoch": 1.024390243902439,
"grad_norm": 0.7038013339042664,
"learning_rate": 0.0001,
"loss": 0.4801,
"step": 168
},
{
"epoch": 1.0304878048780488,
"grad_norm": 0.6525987982749939,
"learning_rate": 0.0001,
"loss": 0.499,
"step": 169
},
{
"epoch": 1.0365853658536586,
"grad_norm": 0.5772536396980286,
"learning_rate": 0.0001,
"loss": 0.4899,
"step": 170
},
{
"epoch": 1.0426829268292683,
"grad_norm": 0.5953510999679565,
"learning_rate": 0.0001,
"loss": 0.5343,
"step": 171
},
{
"epoch": 1.048780487804878,
"grad_norm": 0.579450786113739,
"learning_rate": 0.0001,
"loss": 0.5222,
"step": 172
},
{
"epoch": 1.0548780487804879,
"grad_norm": 0.5960140228271484,
"learning_rate": 0.0001,
"loss": 0.4936,
"step": 173
},
{
"epoch": 1.0609756097560976,
"grad_norm": 0.5782721042633057,
"learning_rate": 0.0001,
"loss": 0.487,
"step": 174
},
{
"epoch": 1.0670731707317074,
"grad_norm": 0.6194652318954468,
"learning_rate": 0.0001,
"loss": 0.5045,
"step": 175
},
{
"epoch": 1.0731707317073171,
"grad_norm": 0.7137989401817322,
"learning_rate": 0.0001,
"loss": 0.5206,
"step": 176
},
{
"epoch": 1.079268292682927,
"grad_norm": 0.6591524481773376,
"learning_rate": 0.0001,
"loss": 0.5203,
"step": 177
},
{
"epoch": 1.0853658536585367,
"grad_norm": 0.5615283846855164,
"learning_rate": 0.0001,
"loss": 0.4845,
"step": 178
},
{
"epoch": 1.0914634146341464,
"grad_norm": 0.5729933381080627,
"learning_rate": 0.0001,
"loss": 0.5166,
"step": 179
},
{
"epoch": 1.0975609756097562,
"grad_norm": 0.5670926570892334,
"learning_rate": 0.0001,
"loss": 0.5311,
"step": 180
},
{
"epoch": 1.103658536585366,
"grad_norm": 0.5750375390052795,
"learning_rate": 0.0001,
"loss": 0.4739,
"step": 181
},
{
"epoch": 1.1097560975609757,
"grad_norm": 0.5616285800933838,
"learning_rate": 0.0001,
"loss": 0.513,
"step": 182
},
{
"epoch": 1.1158536585365855,
"grad_norm": 0.6150811910629272,
"learning_rate": 0.0001,
"loss": 0.53,
"step": 183
},
{
"epoch": 1.1219512195121952,
"grad_norm": 0.6283072233200073,
"learning_rate": 0.0001,
"loss": 0.5099,
"step": 184
},
{
"epoch": 1.1280487804878048,
"grad_norm": 0.5622886419296265,
"learning_rate": 0.0001,
"loss": 0.4663,
"step": 185
},
{
"epoch": 1.1341463414634148,
"grad_norm": 0.6202870607376099,
"learning_rate": 0.0001,
"loss": 0.5113,
"step": 186
},
{
"epoch": 1.1402439024390243,
"grad_norm": 0.5678901672363281,
"learning_rate": 0.0001,
"loss": 0.4595,
"step": 187
},
{
"epoch": 1.146341463414634,
"grad_norm": 0.6146119832992554,
"learning_rate": 0.0001,
"loss": 0.5248,
"step": 188
},
{
"epoch": 1.1524390243902438,
"grad_norm": 0.5726969838142395,
"learning_rate": 0.0001,
"loss": 0.5016,
"step": 189
},
{
"epoch": 1.1585365853658536,
"grad_norm": 0.5848289132118225,
"learning_rate": 0.0001,
"loss": 0.5236,
"step": 190
},
{
"epoch": 1.1646341463414633,
"grad_norm": 0.598795473575592,
"learning_rate": 0.0001,
"loss": 0.5444,
"step": 191
},
{
"epoch": 1.170731707317073,
"grad_norm": 0.5984260439872742,
"learning_rate": 0.0001,
"loss": 0.5291,
"step": 192
},
{
"epoch": 1.1768292682926829,
"grad_norm": 0.5640114545822144,
"learning_rate": 0.0001,
"loss": 0.5366,
"step": 193
},
{
"epoch": 1.1829268292682926,
"grad_norm": 0.5771395564079285,
"learning_rate": 0.0001,
"loss": 0.519,
"step": 194
},
{
"epoch": 1.1890243902439024,
"grad_norm": 0.5926110744476318,
"learning_rate": 0.0001,
"loss": 0.4945,
"step": 195
},
{
"epoch": 1.1951219512195121,
"grad_norm": 0.6406283974647522,
"learning_rate": 0.0001,
"loss": 0.5313,
"step": 196
},
{
"epoch": 1.201219512195122,
"grad_norm": 0.5671162009239197,
"learning_rate": 0.0001,
"loss": 0.4971,
"step": 197
},
{
"epoch": 1.2073170731707317,
"grad_norm": 0.5952590703964233,
"learning_rate": 0.0001,
"loss": 0.4886,
"step": 198
},
{
"epoch": 1.2134146341463414,
"grad_norm": 0.6368497014045715,
"learning_rate": 0.0001,
"loss": 0.4984,
"step": 199
},
{
"epoch": 1.2195121951219512,
"grad_norm": 0.6427241563796997,
"learning_rate": 0.0001,
"loss": 0.5201,
"step": 200
},
{
"epoch": 1.225609756097561,
"grad_norm": 0.5814225673675537,
"learning_rate": 0.0001,
"loss": 0.5021,
"step": 201
},
{
"epoch": 1.2317073170731707,
"grad_norm": 0.5985032916069031,
"learning_rate": 0.0001,
"loss": 0.4969,
"step": 202
},
{
"epoch": 1.2378048780487805,
"grad_norm": 0.5723533630371094,
"learning_rate": 0.0001,
"loss": 0.485,
"step": 203
},
{
"epoch": 1.2439024390243902,
"grad_norm": 0.598479688167572,
"learning_rate": 0.0001,
"loss": 0.496,
"step": 204
},
{
"epoch": 1.25,
"grad_norm": 0.6005733013153076,
"learning_rate": 0.0001,
"loss": 0.4746,
"step": 205
},
{
"epoch": 1.2560975609756098,
"grad_norm": 0.630957841873169,
"learning_rate": 0.0001,
"loss": 0.5069,
"step": 206
},
{
"epoch": 1.2621951219512195,
"grad_norm": 0.6369969248771667,
"learning_rate": 0.0001,
"loss": 0.4869,
"step": 207
},
{
"epoch": 1.2682926829268293,
"grad_norm": 0.6387524008750916,
"learning_rate": 0.0001,
"loss": 0.5133,
"step": 208
},
{
"epoch": 1.274390243902439,
"grad_norm": 0.6263754367828369,
"learning_rate": 0.0001,
"loss": 0.5444,
"step": 209
},
{
"epoch": 1.2804878048780488,
"grad_norm": 0.557532012462616,
"learning_rate": 0.0001,
"loss": 0.4726,
"step": 210
},
{
"epoch": 1.2865853658536586,
"grad_norm": 0.576702892780304,
"learning_rate": 0.0001,
"loss": 0.5325,
"step": 211
},
{
"epoch": 1.2926829268292683,
"grad_norm": 0.6313229203224182,
"learning_rate": 0.0001,
"loss": 0.5044,
"step": 212
},
{
"epoch": 1.298780487804878,
"grad_norm": 0.625912070274353,
"learning_rate": 0.0001,
"loss": 0.5381,
"step": 213
},
{
"epoch": 1.3048780487804879,
"grad_norm": 0.6148139238357544,
"learning_rate": 0.0001,
"loss": 0.4934,
"step": 214
},
{
"epoch": 1.3109756097560976,
"grad_norm": 0.6258604526519775,
"learning_rate": 0.0001,
"loss": 0.5239,
"step": 215
},
{
"epoch": 1.3170731707317074,
"grad_norm": 0.6130456924438477,
"learning_rate": 0.0001,
"loss": 0.5014,
"step": 216
},
{
"epoch": 1.3231707317073171,
"grad_norm": 0.606001615524292,
"learning_rate": 0.0001,
"loss": 0.5201,
"step": 217
},
{
"epoch": 1.329268292682927,
"grad_norm": 0.5635973215103149,
"learning_rate": 0.0001,
"loss": 0.4932,
"step": 218
},
{
"epoch": 1.3353658536585367,
"grad_norm": 0.5979434251785278,
"learning_rate": 0.0001,
"loss": 0.5142,
"step": 219
},
{
"epoch": 1.3414634146341464,
"grad_norm": 0.5663168430328369,
"learning_rate": 0.0001,
"loss": 0.524,
"step": 220
},
{
"epoch": 1.3475609756097562,
"grad_norm": 0.6072438955307007,
"learning_rate": 0.0001,
"loss": 0.4997,
"step": 221
},
{
"epoch": 1.3536585365853657,
"grad_norm": 0.601750373840332,
"learning_rate": 0.0001,
"loss": 0.4946,
"step": 222
},
{
"epoch": 1.3597560975609757,
"grad_norm": 0.6556447744369507,
"learning_rate": 0.0001,
"loss": 0.5114,
"step": 223
},
{
"epoch": 1.3658536585365852,
"grad_norm": 0.6329565048217773,
"learning_rate": 0.0001,
"loss": 0.512,
"step": 224
},
{
"epoch": 1.3719512195121952,
"grad_norm": 0.6002699136734009,
"learning_rate": 0.0001,
"loss": 0.494,
"step": 225
},
{
"epoch": 1.3780487804878048,
"grad_norm": 0.6447397470474243,
"learning_rate": 0.0001,
"loss": 0.548,
"step": 226
},
{
"epoch": 1.3841463414634148,
"grad_norm": 0.5840697288513184,
"learning_rate": 0.0001,
"loss": 0.5177,
"step": 227
},
{
"epoch": 1.3902439024390243,
"grad_norm": 0.5911181569099426,
"learning_rate": 0.0001,
"loss": 0.5183,
"step": 228
},
{
"epoch": 1.3963414634146343,
"grad_norm": 0.6022722125053406,
"learning_rate": 0.0001,
"loss": 0.476,
"step": 229
},
{
"epoch": 1.4024390243902438,
"grad_norm": 0.5788743495941162,
"learning_rate": 0.0001,
"loss": 0.5109,
"step": 230
},
{
"epoch": 1.4085365853658536,
"grad_norm": 0.5945917963981628,
"learning_rate": 0.0001,
"loss": 0.4869,
"step": 231
},
{
"epoch": 1.4146341463414633,
"grad_norm": 0.638956606388092,
"learning_rate": 0.0001,
"loss": 0.53,
"step": 232
},
{
"epoch": 1.420731707317073,
"grad_norm": 0.6204885840415955,
"learning_rate": 0.0001,
"loss": 0.5205,
"step": 233
},
{
"epoch": 1.4268292682926829,
"grad_norm": 0.5931024551391602,
"learning_rate": 0.0001,
"loss": 0.5167,
"step": 234
},
{
"epoch": 1.4329268292682926,
"grad_norm": 0.5996592044830322,
"learning_rate": 0.0001,
"loss": 0.4935,
"step": 235
},
{
"epoch": 1.4390243902439024,
"grad_norm": 0.6242860555648804,
"learning_rate": 0.0001,
"loss": 0.5047,
"step": 236
},
{
"epoch": 1.4451219512195121,
"grad_norm": 0.5914901494979858,
"learning_rate": 0.0001,
"loss": 0.5092,
"step": 237
},
{
"epoch": 1.451219512195122,
"grad_norm": 0.6710638999938965,
"learning_rate": 0.0001,
"loss": 0.5437,
"step": 238
},
{
"epoch": 1.4573170731707317,
"grad_norm": 0.6554276347160339,
"learning_rate": 0.0001,
"loss": 0.4906,
"step": 239
},
{
"epoch": 1.4634146341463414,
"grad_norm": 0.6532212495803833,
"learning_rate": 0.0001,
"loss": 0.5508,
"step": 240
},
{
"epoch": 1.4695121951219512,
"grad_norm": 0.5957479476928711,
"learning_rate": 0.0001,
"loss": 0.4902,
"step": 241
},
{
"epoch": 1.475609756097561,
"grad_norm": 0.5946776270866394,
"learning_rate": 0.0001,
"loss": 0.5085,
"step": 242
},
{
"epoch": 1.4817073170731707,
"grad_norm": 0.5819572806358337,
"learning_rate": 0.0001,
"loss": 0.4819,
"step": 243
},
{
"epoch": 1.4878048780487805,
"grad_norm": 0.6151570081710815,
"learning_rate": 0.0001,
"loss": 0.5058,
"step": 244
},
{
"epoch": 1.4939024390243902,
"grad_norm": 0.6580333709716797,
"learning_rate": 0.0001,
"loss": 0.506,
"step": 245
},
{
"epoch": 1.5,
"grad_norm": 0.6214548945426941,
"learning_rate": 0.0001,
"loss": 0.4739,
"step": 246
},
{
"epoch": 1.5060975609756098,
"grad_norm": 0.6240037083625793,
"learning_rate": 0.0001,
"loss": 0.4898,
"step": 247
},
{
"epoch": 1.5121951219512195,
"grad_norm": 0.6115790605545044,
"learning_rate": 0.0001,
"loss": 0.5143,
"step": 248
},
{
"epoch": 1.5182926829268293,
"grad_norm": 0.5654324293136597,
"learning_rate": 0.0001,
"loss": 0.4409,
"step": 249
},
{
"epoch": 1.524390243902439,
"grad_norm": 0.5737196207046509,
"learning_rate": 0.0001,
"loss": 0.4936,
"step": 250
},
{
"epoch": 1.5304878048780488,
"grad_norm": 0.6084273457527161,
"learning_rate": 0.0001,
"loss": 0.5182,
"step": 251
},
{
"epoch": 1.5365853658536586,
"grad_norm": 0.5695486664772034,
"learning_rate": 0.0001,
"loss": 0.4857,
"step": 252
},
{
"epoch": 1.5426829268292683,
"grad_norm": 0.5693416595458984,
"learning_rate": 0.0001,
"loss": 0.5028,
"step": 253
},
{
"epoch": 1.548780487804878,
"grad_norm": 0.5976539850234985,
"learning_rate": 0.0001,
"loss": 0.492,
"step": 254
},
{
"epoch": 1.5548780487804879,
"grad_norm": 0.6122463941574097,
"learning_rate": 0.0001,
"loss": 0.5412,
"step": 255
},
{
"epoch": 1.5609756097560976,
"grad_norm": 0.5977299213409424,
"learning_rate": 0.0001,
"loss": 0.5173,
"step": 256
},
{
"epoch": 1.5670731707317072,
"grad_norm": 0.5926475524902344,
"learning_rate": 0.0001,
"loss": 0.5037,
"step": 257
},
{
"epoch": 1.5731707317073171,
"grad_norm": 0.5920047163963318,
"learning_rate": 0.0001,
"loss": 0.4779,
"step": 258
},
{
"epoch": 1.5792682926829267,
"grad_norm": 0.5987219214439392,
"learning_rate": 0.0001,
"loss": 0.5132,
"step": 259
},
{
"epoch": 1.5853658536585367,
"grad_norm": 0.5943930149078369,
"learning_rate": 0.0001,
"loss": 0.4938,
"step": 260
},
{
"epoch": 1.5914634146341462,
"grad_norm": 0.6259720921516418,
"learning_rate": 0.0001,
"loss": 0.5295,
"step": 261
},
{
"epoch": 1.5975609756097562,
"grad_norm": 0.6168601512908936,
"learning_rate": 0.0001,
"loss": 0.4633,
"step": 262
},
{
"epoch": 1.6036585365853657,
"grad_norm": 0.6057328581809998,
"learning_rate": 0.0001,
"loss": 0.5074,
"step": 263
},
{
"epoch": 1.6097560975609757,
"grad_norm": 0.607790470123291,
"learning_rate": 0.0001,
"loss": 0.5068,
"step": 264
},
{
"epoch": 1.6158536585365852,
"grad_norm": 0.5669077634811401,
"learning_rate": 0.0001,
"loss": 0.4578,
"step": 265
},
{
"epoch": 1.6219512195121952,
"grad_norm": 0.58953458070755,
"learning_rate": 0.0001,
"loss": 0.512,
"step": 266
},
{
"epoch": 1.6280487804878048,
"grad_norm": 0.6138054728507996,
"learning_rate": 0.0001,
"loss": 0.5035,
"step": 267
},
{
"epoch": 1.6341463414634148,
"grad_norm": 0.6316951513290405,
"learning_rate": 0.0001,
"loss": 0.5374,
"step": 268
},
{
"epoch": 1.6402439024390243,
"grad_norm": 0.5779020190238953,
"learning_rate": 0.0001,
"loss": 0.4934,
"step": 269
},
{
"epoch": 1.6463414634146343,
"grad_norm": 0.6008270978927612,
"learning_rate": 0.0001,
"loss": 0.4628,
"step": 270
},
{
"epoch": 1.6524390243902438,
"grad_norm": 0.5894110202789307,
"learning_rate": 0.0001,
"loss": 0.5109,
"step": 271
},
{
"epoch": 1.6585365853658538,
"grad_norm": 0.5894849896430969,
"learning_rate": 0.0001,
"loss": 0.4861,
"step": 272
},
{
"epoch": 1.6646341463414633,
"grad_norm": 0.6085466146469116,
"learning_rate": 0.0001,
"loss": 0.5101,
"step": 273
},
{
"epoch": 1.6707317073170733,
"grad_norm": 0.6503622531890869,
"learning_rate": 0.0001,
"loss": 0.5508,
"step": 274
},
{
"epoch": 1.6768292682926829,
"grad_norm": 0.6089245676994324,
"learning_rate": 0.0001,
"loss": 0.4911,
"step": 275
},
{
"epoch": 1.6829268292682928,
"grad_norm": 0.6388260126113892,
"learning_rate": 0.0001,
"loss": 0.5165,
"step": 276
},
{
"epoch": 1.6890243902439024,
"grad_norm": 0.6048246622085571,
"learning_rate": 0.0001,
"loss": 0.5405,
"step": 277
},
{
"epoch": 1.6951219512195121,
"grad_norm": 0.5887222290039062,
"learning_rate": 0.0001,
"loss": 0.5205,
"step": 278
},
{
"epoch": 1.701219512195122,
"grad_norm": 0.6097093820571899,
"learning_rate": 0.0001,
"loss": 0.5139,
"step": 279
},
{
"epoch": 1.7073170731707317,
"grad_norm": 0.5547489523887634,
"learning_rate": 0.0001,
"loss": 0.4915,
"step": 280
},
{
"epoch": 1.7134146341463414,
"grad_norm": 0.6122882962226868,
"learning_rate": 0.0001,
"loss": 0.493,
"step": 281
},
{
"epoch": 1.7195121951219512,
"grad_norm": 0.6592060923576355,
"learning_rate": 0.0001,
"loss": 0.5314,
"step": 282
},
{
"epoch": 1.725609756097561,
"grad_norm": 0.6154331564903259,
"learning_rate": 0.0001,
"loss": 0.5025,
"step": 283
},
{
"epoch": 1.7317073170731707,
"grad_norm": 0.5997411608695984,
"learning_rate": 0.0001,
"loss": 0.5057,
"step": 284
},
{
"epoch": 1.7378048780487805,
"grad_norm": 0.615349292755127,
"learning_rate": 0.0001,
"loss": 0.5195,
"step": 285
},
{
"epoch": 1.7439024390243902,
"grad_norm": 0.6155688762664795,
"learning_rate": 0.0001,
"loss": 0.5194,
"step": 286
},
{
"epoch": 1.75,
"grad_norm": 0.5677372217178345,
"learning_rate": 0.0001,
"loss": 0.5433,
"step": 287
},
{
"epoch": 1.7560975609756098,
"grad_norm": 0.5937820672988892,
"learning_rate": 0.0001,
"loss": 0.5269,
"step": 288
},
{
"epoch": 1.7621951219512195,
"grad_norm": 0.5868131518363953,
"learning_rate": 0.0001,
"loss": 0.535,
"step": 289
},
{
"epoch": 1.7682926829268293,
"grad_norm": 0.6256383061408997,
"learning_rate": 0.0001,
"loss": 0.5196,
"step": 290
},
{
"epoch": 1.774390243902439,
"grad_norm": 0.6187792420387268,
"learning_rate": 0.0001,
"loss": 0.5027,
"step": 291
},
{
"epoch": 1.7804878048780488,
"grad_norm": 0.6260528564453125,
"learning_rate": 0.0001,
"loss": 0.5437,
"step": 292
},
{
"epoch": 1.7865853658536586,
"grad_norm": 0.5868582129478455,
"learning_rate": 0.0001,
"loss": 0.5133,
"step": 293
},
{
"epoch": 1.7926829268292683,
"grad_norm": 0.6079871654510498,
"learning_rate": 0.0001,
"loss": 0.5102,
"step": 294
},
{
"epoch": 1.798780487804878,
"grad_norm": 0.5693763494491577,
"learning_rate": 0.0001,
"loss": 0.4933,
"step": 295
},
{
"epoch": 1.8048780487804879,
"grad_norm": 0.6394689679145813,
"learning_rate": 0.0001,
"loss": 0.5452,
"step": 296
},
{
"epoch": 1.8109756097560976,
"grad_norm": 0.6318659782409668,
"learning_rate": 0.0001,
"loss": 0.5391,
"step": 297
},
{
"epoch": 1.8170731707317072,
"grad_norm": 0.5786278247833252,
"learning_rate": 0.0001,
"loss": 0.5129,
"step": 298
},
{
"epoch": 1.8231707317073171,
"grad_norm": 0.6378489136695862,
"learning_rate": 0.0001,
"loss": 0.4935,
"step": 299
},
{
"epoch": 1.8292682926829267,
"grad_norm": 0.637844979763031,
"learning_rate": 0.0001,
"loss": 0.5057,
"step": 300
},
{
"epoch": 1.8353658536585367,
"grad_norm": 0.6403583288192749,
"learning_rate": 0.0001,
"loss": 0.542,
"step": 301
},
{
"epoch": 1.8414634146341462,
"grad_norm": 0.6149348616600037,
"learning_rate": 0.0001,
"loss": 0.5108,
"step": 302
},
{
"epoch": 1.8475609756097562,
"grad_norm": 0.5945342779159546,
"learning_rate": 0.0001,
"loss": 0.496,
"step": 303
},
{
"epoch": 1.8536585365853657,
"grad_norm": 0.6346225142478943,
"learning_rate": 0.0001,
"loss": 0.5463,
"step": 304
},
{
"epoch": 1.8597560975609757,
"grad_norm": 0.590212881565094,
"learning_rate": 0.0001,
"loss": 0.5126,
"step": 305
},
{
"epoch": 1.8658536585365852,
"grad_norm": 0.5924628973007202,
"learning_rate": 0.0001,
"loss": 0.5096,
"step": 306
},
{
"epoch": 1.8719512195121952,
"grad_norm": 0.6342692375183105,
"learning_rate": 0.0001,
"loss": 0.5063,
"step": 307
},
{
"epoch": 1.8780487804878048,
"grad_norm": 0.6688621640205383,
"learning_rate": 0.0001,
"loss": 0.5534,
"step": 308
},
{
"epoch": 1.8841463414634148,
"grad_norm": 0.628839910030365,
"learning_rate": 0.0001,
"loss": 0.4975,
"step": 309
},
{
"epoch": 1.8902439024390243,
"grad_norm": 0.6141210794448853,
"learning_rate": 0.0001,
"loss": 0.4777,
"step": 310
},
{
"epoch": 1.8963414634146343,
"grad_norm": 0.6270496845245361,
"learning_rate": 0.0001,
"loss": 0.5019,
"step": 311
},
{
"epoch": 1.9024390243902438,
"grad_norm": 0.5861090421676636,
"learning_rate": 0.0001,
"loss": 0.5066,
"step": 312
},
{
"epoch": 1.9085365853658538,
"grad_norm": 0.5715667009353638,
"learning_rate": 0.0001,
"loss": 0.4766,
"step": 313
},
{
"epoch": 1.9146341463414633,
"grad_norm": 0.6288326978683472,
"learning_rate": 0.0001,
"loss": 0.5152,
"step": 314
},
{
"epoch": 1.9207317073170733,
"grad_norm": 0.5759385228157043,
"learning_rate": 0.0001,
"loss": 0.51,
"step": 315
},
{
"epoch": 1.9268292682926829,
"grad_norm": 0.6145620346069336,
"learning_rate": 0.0001,
"loss": 0.5104,
"step": 316
},
{
"epoch": 1.9329268292682928,
"grad_norm": 0.6138148903846741,
"learning_rate": 0.0001,
"loss": 0.4967,
"step": 317
},
{
"epoch": 1.9390243902439024,
"grad_norm": 0.6269311308860779,
"learning_rate": 0.0001,
"loss": 0.5479,
"step": 318
},
{
"epoch": 1.9451219512195121,
"grad_norm": 0.6406437754631042,
"learning_rate": 0.0001,
"loss": 0.5199,
"step": 319
},
{
"epoch": 1.951219512195122,
"grad_norm": 0.5639004707336426,
"learning_rate": 0.0001,
"loss": 0.4852,
"step": 320
},
{
"epoch": 1.9573170731707317,
"grad_norm": 0.5929526090621948,
"learning_rate": 0.0001,
"loss": 0.5253,
"step": 321
},
{
"epoch": 1.9634146341463414,
"grad_norm": 0.59356689453125,
"learning_rate": 0.0001,
"loss": 0.5094,
"step": 322
},
{
"epoch": 1.9695121951219512,
"grad_norm": 0.6183592677116394,
"learning_rate": 0.0001,
"loss": 0.495,
"step": 323
},
{
"epoch": 1.975609756097561,
"grad_norm": 0.5988680720329285,
"learning_rate": 0.0001,
"loss": 0.5017,
"step": 324
},
{
"epoch": 1.9817073170731707,
"grad_norm": 0.6253383159637451,
"learning_rate": 0.0001,
"loss": 0.5101,
"step": 325
},
{
"epoch": 1.9878048780487805,
"grad_norm": 0.6147765517234802,
"learning_rate": 0.0001,
"loss": 0.4952,
"step": 326
},
{
"epoch": 1.9939024390243902,
"grad_norm": 0.6041817665100098,
"learning_rate": 0.0001,
"loss": 0.5042,
"step": 327
},
{
"epoch": 2.0,
"grad_norm": 0.5927252769470215,
"learning_rate": 0.0001,
"loss": 0.4996,
"step": 328
},
{
"epoch": 2.0060975609756095,
"grad_norm": 0.6218935251235962,
"learning_rate": 0.0001,
"loss": 0.4171,
"step": 329
},
{
"epoch": 2.0121951219512195,
"grad_norm": 0.5569261312484741,
"learning_rate": 0.0001,
"loss": 0.3905,
"step": 330
},
{
"epoch": 2.018292682926829,
"grad_norm": 0.5948651432991028,
"learning_rate": 0.0001,
"loss": 0.3704,
"step": 331
},
{
"epoch": 2.024390243902439,
"grad_norm": 0.6893870830535889,
"learning_rate": 0.0001,
"loss": 0.3446,
"step": 332
},
{
"epoch": 2.0304878048780486,
"grad_norm": 0.6298575401306152,
"learning_rate": 0.0001,
"loss": 0.3657,
"step": 333
},
{
"epoch": 2.0365853658536586,
"grad_norm": 0.6463242173194885,
"learning_rate": 0.0001,
"loss": 0.3752,
"step": 334
},
{
"epoch": 2.042682926829268,
"grad_norm": 0.6220399141311646,
"learning_rate": 0.0001,
"loss": 0.4133,
"step": 335
},
{
"epoch": 2.048780487804878,
"grad_norm": 0.6175084710121155,
"learning_rate": 0.0001,
"loss": 0.3856,
"step": 336
},
{
"epoch": 2.0548780487804876,
"grad_norm": 0.5709812641143799,
"learning_rate": 0.0001,
"loss": 0.3791,
"step": 337
},
{
"epoch": 2.0609756097560976,
"grad_norm": 0.5842687487602234,
"learning_rate": 0.0001,
"loss": 0.3981,
"step": 338
},
{
"epoch": 2.067073170731707,
"grad_norm": 0.5711541771888733,
"learning_rate": 0.0001,
"loss": 0.3463,
"step": 339
},
{
"epoch": 2.073170731707317,
"grad_norm": 0.6160522103309631,
"learning_rate": 0.0001,
"loss": 0.3579,
"step": 340
},
{
"epoch": 2.0792682926829267,
"grad_norm": 0.6163449287414551,
"learning_rate": 0.0001,
"loss": 0.3651,
"step": 341
},
{
"epoch": 2.0853658536585367,
"grad_norm": 0.6386067271232605,
"learning_rate": 0.0001,
"loss": 0.4165,
"step": 342
},
{
"epoch": 2.091463414634146,
"grad_norm": 0.6074360609054565,
"learning_rate": 0.0001,
"loss": 0.383,
"step": 343
},
{
"epoch": 2.097560975609756,
"grad_norm": 0.5862374305725098,
"learning_rate": 0.0001,
"loss": 0.3658,
"step": 344
},
{
"epoch": 2.1036585365853657,
"grad_norm": 0.5639402270317078,
"learning_rate": 0.0001,
"loss": 0.3708,
"step": 345
},
{
"epoch": 2.1097560975609757,
"grad_norm": 0.5674434304237366,
"learning_rate": 0.0001,
"loss": 0.376,
"step": 346
},
{
"epoch": 2.1158536585365852,
"grad_norm": 0.641013503074646,
"learning_rate": 0.0001,
"loss": 0.3898,
"step": 347
},
{
"epoch": 2.1219512195121952,
"grad_norm": 0.6373003125190735,
"learning_rate": 0.0001,
"loss": 0.3998,
"step": 348
},
{
"epoch": 2.1280487804878048,
"grad_norm": 0.6026149392127991,
"learning_rate": 0.0001,
"loss": 0.3419,
"step": 349
},
{
"epoch": 2.1341463414634148,
"grad_norm": 0.5974167585372925,
"learning_rate": 0.0001,
"loss": 0.3501,
"step": 350
},
{
"epoch": 2.1402439024390243,
"grad_norm": 0.5709217190742493,
"learning_rate": 0.0001,
"loss": 0.4023,
"step": 351
},
{
"epoch": 2.1463414634146343,
"grad_norm": 0.6201815605163574,
"learning_rate": 0.0001,
"loss": 0.3801,
"step": 352
},
{
"epoch": 2.152439024390244,
"grad_norm": 0.5644124150276184,
"learning_rate": 0.0001,
"loss": 0.3536,
"step": 353
},
{
"epoch": 2.158536585365854,
"grad_norm": 0.5843915343284607,
"learning_rate": 0.0001,
"loss": 0.367,
"step": 354
},
{
"epoch": 2.1646341463414633,
"grad_norm": 0.6504707336425781,
"learning_rate": 0.0001,
"loss": 0.41,
"step": 355
},
{
"epoch": 2.1707317073170733,
"grad_norm": 0.6272132396697998,
"learning_rate": 0.0001,
"loss": 0.3642,
"step": 356
},
{
"epoch": 2.176829268292683,
"grad_norm": 0.6171401143074036,
"learning_rate": 0.0001,
"loss": 0.3709,
"step": 357
},
{
"epoch": 2.182926829268293,
"grad_norm": 0.5451359748840332,
"learning_rate": 0.0001,
"loss": 0.3699,
"step": 358
},
{
"epoch": 2.1890243902439024,
"grad_norm": 0.5557040572166443,
"learning_rate": 0.0001,
"loss": 0.3889,
"step": 359
},
{
"epoch": 2.1951219512195124,
"grad_norm": 0.5514318943023682,
"learning_rate": 0.0001,
"loss": 0.3595,
"step": 360
},
{
"epoch": 2.201219512195122,
"grad_norm": 0.6279582381248474,
"learning_rate": 0.0001,
"loss": 0.365,
"step": 361
},
{
"epoch": 2.207317073170732,
"grad_norm": 0.6362396478652954,
"learning_rate": 0.0001,
"loss": 0.3676,
"step": 362
},
{
"epoch": 2.2134146341463414,
"grad_norm": 0.6167373061180115,
"learning_rate": 0.0001,
"loss": 0.4047,
"step": 363
},
{
"epoch": 2.2195121951219514,
"grad_norm": 0.5988054871559143,
"learning_rate": 0.0001,
"loss": 0.3866,
"step": 364
},
{
"epoch": 2.225609756097561,
"grad_norm": 0.6260228753089905,
"learning_rate": 0.0001,
"loss": 0.3969,
"step": 365
},
{
"epoch": 2.231707317073171,
"grad_norm": 0.5669357180595398,
"learning_rate": 0.0001,
"loss": 0.3624,
"step": 366
},
{
"epoch": 2.2378048780487805,
"grad_norm": 0.5572336316108704,
"learning_rate": 0.0001,
"loss": 0.3802,
"step": 367
},
{
"epoch": 2.2439024390243905,
"grad_norm": 0.577407956123352,
"learning_rate": 0.0001,
"loss": 0.3814,
"step": 368
},
{
"epoch": 2.25,
"grad_norm": 0.5576046109199524,
"learning_rate": 0.0001,
"loss": 0.3529,
"step": 369
},
{
"epoch": 2.2560975609756095,
"grad_norm": 0.5899252891540527,
"learning_rate": 0.0001,
"loss": 0.361,
"step": 370
},
{
"epoch": 2.2621951219512195,
"grad_norm": 0.6026024222373962,
"learning_rate": 0.0001,
"loss": 0.3602,
"step": 371
},
{
"epoch": 2.2682926829268295,
"grad_norm": 0.651066780090332,
"learning_rate": 0.0001,
"loss": 0.3646,
"step": 372
},
{
"epoch": 2.274390243902439,
"grad_norm": 0.6255848407745361,
"learning_rate": 0.0001,
"loss": 0.3468,
"step": 373
},
{
"epoch": 2.2804878048780486,
"grad_norm": 0.6624294519424438,
"learning_rate": 0.0001,
"loss": 0.3928,
"step": 374
},
{
"epoch": 2.2865853658536586,
"grad_norm": 0.5514746308326721,
"learning_rate": 0.0001,
"loss": 0.374,
"step": 375
},
{
"epoch": 2.292682926829268,
"grad_norm": 0.5865519642829895,
"learning_rate": 0.0001,
"loss": 0.387,
"step": 376
},
{
"epoch": 2.298780487804878,
"grad_norm": 0.5901021957397461,
"learning_rate": 0.0001,
"loss": 0.3922,
"step": 377
},
{
"epoch": 2.3048780487804876,
"grad_norm": 0.5819031000137329,
"learning_rate": 0.0001,
"loss": 0.3825,
"step": 378
},
{
"epoch": 2.3109756097560976,
"grad_norm": 0.5795203447341919,
"learning_rate": 0.0001,
"loss": 0.3983,
"step": 379
},
{
"epoch": 2.317073170731707,
"grad_norm": 0.5817603468894958,
"learning_rate": 0.0001,
"loss": 0.3892,
"step": 380
},
{
"epoch": 2.323170731707317,
"grad_norm": 0.5905787348747253,
"learning_rate": 0.0001,
"loss": 0.3662,
"step": 381
},
{
"epoch": 2.3292682926829267,
"grad_norm": 0.6160801649093628,
"learning_rate": 0.0001,
"loss": 0.382,
"step": 382
},
{
"epoch": 2.3353658536585367,
"grad_norm": 0.6367721557617188,
"learning_rate": 0.0001,
"loss": 0.3684,
"step": 383
},
{
"epoch": 2.341463414634146,
"grad_norm": 0.6236375570297241,
"learning_rate": 0.0001,
"loss": 0.3671,
"step": 384
},
{
"epoch": 2.347560975609756,
"grad_norm": 0.5669872164726257,
"learning_rate": 0.0001,
"loss": 0.3634,
"step": 385
},
{
"epoch": 2.3536585365853657,
"grad_norm": 0.5991116166114807,
"learning_rate": 0.0001,
"loss": 0.3628,
"step": 386
},
{
"epoch": 2.3597560975609757,
"grad_norm": 0.5670086145401001,
"learning_rate": 0.0001,
"loss": 0.3635,
"step": 387
},
{
"epoch": 2.3658536585365852,
"grad_norm": 0.629401683807373,
"learning_rate": 0.0001,
"loss": 0.3925,
"step": 388
},
{
"epoch": 2.3719512195121952,
"grad_norm": 0.6248301267623901,
"learning_rate": 0.0001,
"loss": 0.3825,
"step": 389
},
{
"epoch": 2.3780487804878048,
"grad_norm": 0.5823646187782288,
"learning_rate": 0.0001,
"loss": 0.3775,
"step": 390
},
{
"epoch": 2.3841463414634148,
"grad_norm": 0.6670135855674744,
"learning_rate": 0.0001,
"loss": 0.3927,
"step": 391
},
{
"epoch": 2.3902439024390243,
"grad_norm": 0.6390913128852844,
"learning_rate": 0.0001,
"loss": 0.4057,
"step": 392
},
{
"epoch": 2.3963414634146343,
"grad_norm": 0.5848169922828674,
"learning_rate": 0.0001,
"loss": 0.3712,
"step": 393
},
{
"epoch": 2.402439024390244,
"grad_norm": 0.5966094732284546,
"learning_rate": 0.0001,
"loss": 0.3713,
"step": 394
},
{
"epoch": 2.408536585365854,
"grad_norm": 0.6144512891769409,
"learning_rate": 0.0001,
"loss": 0.3698,
"step": 395
},
{
"epoch": 2.4146341463414633,
"grad_norm": 0.5988245010375977,
"learning_rate": 0.0001,
"loss": 0.3686,
"step": 396
},
{
"epoch": 2.4207317073170733,
"grad_norm": 0.6109009981155396,
"learning_rate": 0.0001,
"loss": 0.3921,
"step": 397
},
{
"epoch": 2.426829268292683,
"grad_norm": 0.6432120203971863,
"learning_rate": 0.0001,
"loss": 0.4231,
"step": 398
},
{
"epoch": 2.432926829268293,
"grad_norm": 0.5902109742164612,
"learning_rate": 0.0001,
"loss": 0.3699,
"step": 399
},
{
"epoch": 2.4390243902439024,
"grad_norm": 0.6081752777099609,
"learning_rate": 0.0001,
"loss": 0.3836,
"step": 400
},
{
"epoch": 2.4451219512195124,
"grad_norm": 0.6146216988563538,
"learning_rate": 0.0001,
"loss": 0.3785,
"step": 401
},
{
"epoch": 2.451219512195122,
"grad_norm": 0.6472842693328857,
"learning_rate": 0.0001,
"loss": 0.373,
"step": 402
},
{
"epoch": 2.457317073170732,
"grad_norm": 0.60771644115448,
"learning_rate": 0.0001,
"loss": 0.3685,
"step": 403
},
{
"epoch": 2.4634146341463414,
"grad_norm": 0.6457931995391846,
"learning_rate": 0.0001,
"loss": 0.3746,
"step": 404
},
{
"epoch": 2.4695121951219514,
"grad_norm": 0.5895772576332092,
"learning_rate": 0.0001,
"loss": 0.3758,
"step": 405
},
{
"epoch": 2.475609756097561,
"grad_norm": 0.6693524718284607,
"learning_rate": 0.0001,
"loss": 0.3904,
"step": 406
},
{
"epoch": 2.4817073170731705,
"grad_norm": 0.6366068124771118,
"learning_rate": 0.0001,
"loss": 0.3923,
"step": 407
},
{
"epoch": 2.4878048780487805,
"grad_norm": 0.6241960525512695,
"learning_rate": 0.0001,
"loss": 0.3559,
"step": 408
},
{
"epoch": 2.4939024390243905,
"grad_norm": 0.6247851252555847,
"learning_rate": 0.0001,
"loss": 0.3881,
"step": 409
},
{
"epoch": 2.5,
"grad_norm": 0.6421067714691162,
"learning_rate": 0.0001,
"loss": 0.4021,
"step": 410
},
{
"epoch": 2.5060975609756095,
"grad_norm": 0.7222415804862976,
"learning_rate": 0.0001,
"loss": 0.391,
"step": 411
},
{
"epoch": 2.5121951219512195,
"grad_norm": 0.6274811625480652,
"learning_rate": 0.0001,
"loss": 0.3817,
"step": 412
},
{
"epoch": 2.5182926829268295,
"grad_norm": 0.5927621126174927,
"learning_rate": 0.0001,
"loss": 0.3595,
"step": 413
},
{
"epoch": 2.524390243902439,
"grad_norm": 0.5889265537261963,
"learning_rate": 0.0001,
"loss": 0.3684,
"step": 414
},
{
"epoch": 2.5304878048780486,
"grad_norm": 0.6477332711219788,
"learning_rate": 0.0001,
"loss": 0.4308,
"step": 415
},
{
"epoch": 2.5365853658536586,
"grad_norm": 0.6162149906158447,
"learning_rate": 0.0001,
"loss": 0.4087,
"step": 416
},
{
"epoch": 2.5426829268292686,
"grad_norm": 0.6609845757484436,
"learning_rate": 0.0001,
"loss": 0.4028,
"step": 417
},
{
"epoch": 2.548780487804878,
"grad_norm": 0.6425780057907104,
"learning_rate": 0.0001,
"loss": 0.3832,
"step": 418
},
{
"epoch": 2.5548780487804876,
"grad_norm": 0.6117408275604248,
"learning_rate": 0.0001,
"loss": 0.368,
"step": 419
},
{
"epoch": 2.5609756097560976,
"grad_norm": 0.6596407890319824,
"learning_rate": 0.0001,
"loss": 0.3848,
"step": 420
},
{
"epoch": 2.567073170731707,
"grad_norm": 0.6080613136291504,
"learning_rate": 0.0001,
"loss": 0.3862,
"step": 421
},
{
"epoch": 2.573170731707317,
"grad_norm": 0.6160922646522522,
"learning_rate": 0.0001,
"loss": 0.3797,
"step": 422
},
{
"epoch": 2.5792682926829267,
"grad_norm": 0.6346991658210754,
"learning_rate": 0.0001,
"loss": 0.3702,
"step": 423
},
{
"epoch": 2.5853658536585367,
"grad_norm": 0.6169600486755371,
"learning_rate": 0.0001,
"loss": 0.3931,
"step": 424
},
{
"epoch": 2.591463414634146,
"grad_norm": 0.6396271586418152,
"learning_rate": 0.0001,
"loss": 0.4133,
"step": 425
},
{
"epoch": 2.597560975609756,
"grad_norm": 0.5953004360198975,
"learning_rate": 0.0001,
"loss": 0.3732,
"step": 426
},
{
"epoch": 2.6036585365853657,
"grad_norm": 0.6704226732254028,
"learning_rate": 0.0001,
"loss": 0.3924,
"step": 427
},
{
"epoch": 2.6097560975609757,
"grad_norm": 0.6755167245864868,
"learning_rate": 0.0001,
"loss": 0.3891,
"step": 428
},
{
"epoch": 2.6158536585365852,
"grad_norm": 0.6189351677894592,
"learning_rate": 0.0001,
"loss": 0.4072,
"step": 429
},
{
"epoch": 2.6219512195121952,
"grad_norm": 0.6409624218940735,
"learning_rate": 0.0001,
"loss": 0.382,
"step": 430
},
{
"epoch": 2.6280487804878048,
"grad_norm": 0.629356324672699,
"learning_rate": 0.0001,
"loss": 0.3783,
"step": 431
},
{
"epoch": 2.6341463414634148,
"grad_norm": 0.6259102821350098,
"learning_rate": 0.0001,
"loss": 0.3837,
"step": 432
},
{
"epoch": 2.6402439024390243,
"grad_norm": 0.6589633822441101,
"learning_rate": 0.0001,
"loss": 0.3958,
"step": 433
},
{
"epoch": 2.6463414634146343,
"grad_norm": 0.6646971702575684,
"learning_rate": 0.0001,
"loss": 0.3948,
"step": 434
},
{
"epoch": 2.652439024390244,
"grad_norm": 0.6579565405845642,
"learning_rate": 0.0001,
"loss": 0.3749,
"step": 435
},
{
"epoch": 2.658536585365854,
"grad_norm": 0.6253348588943481,
"learning_rate": 0.0001,
"loss": 0.3737,
"step": 436
},
{
"epoch": 2.6646341463414633,
"grad_norm": 0.6139116287231445,
"learning_rate": 0.0001,
"loss": 0.4165,
"step": 437
},
{
"epoch": 2.6707317073170733,
"grad_norm": 0.6256686449050903,
"learning_rate": 0.0001,
"loss": 0.3838,
"step": 438
},
{
"epoch": 2.676829268292683,
"grad_norm": 0.6139652729034424,
"learning_rate": 0.0001,
"loss": 0.3751,
"step": 439
},
{
"epoch": 2.682926829268293,
"grad_norm": 0.6227155923843384,
"learning_rate": 0.0001,
"loss": 0.3752,
"step": 440
},
{
"epoch": 2.6890243902439024,
"grad_norm": 0.590382993221283,
"learning_rate": 0.0001,
"loss": 0.3896,
"step": 441
},
{
"epoch": 2.6951219512195124,
"grad_norm": 0.6084756255149841,
"learning_rate": 0.0001,
"loss": 0.3725,
"step": 442
},
{
"epoch": 2.701219512195122,
"grad_norm": 0.6576021909713745,
"learning_rate": 0.0001,
"loss": 0.4095,
"step": 443
},
{
"epoch": 2.7073170731707314,
"grad_norm": 0.6265486478805542,
"learning_rate": 0.0001,
"loss": 0.3868,
"step": 444
},
{
"epoch": 2.7134146341463414,
"grad_norm": 0.651096761226654,
"learning_rate": 0.0001,
"loss": 0.4042,
"step": 445
},
{
"epoch": 2.7195121951219514,
"grad_norm": 0.6373317241668701,
"learning_rate": 0.0001,
"loss": 0.4209,
"step": 446
},
{
"epoch": 2.725609756097561,
"grad_norm": 0.6040897965431213,
"learning_rate": 0.0001,
"loss": 0.4084,
"step": 447
},
{
"epoch": 2.7317073170731705,
"grad_norm": 0.6254827976226807,
"learning_rate": 0.0001,
"loss": 0.3646,
"step": 448
},
{
"epoch": 2.7378048780487805,
"grad_norm": 0.6285514831542969,
"learning_rate": 0.0001,
"loss": 0.3711,
"step": 449
},
{
"epoch": 2.7439024390243905,
"grad_norm": 0.675573468208313,
"learning_rate": 0.0001,
"loss": 0.4191,
"step": 450
},
{
"epoch": 2.75,
"grad_norm": 0.6126376390457153,
"learning_rate": 0.0001,
"loss": 0.3782,
"step": 451
},
{
"epoch": 2.7560975609756095,
"grad_norm": 0.6281729340553284,
"learning_rate": 0.0001,
"loss": 0.3778,
"step": 452
},
{
"epoch": 2.7621951219512195,
"grad_norm": 0.5908406376838684,
"learning_rate": 0.0001,
"loss": 0.3927,
"step": 453
},
{
"epoch": 2.7682926829268295,
"grad_norm": 0.6050170660018921,
"learning_rate": 0.0001,
"loss": 0.431,
"step": 454
},
{
"epoch": 2.774390243902439,
"grad_norm": 0.624231219291687,
"learning_rate": 0.0001,
"loss": 0.3774,
"step": 455
},
{
"epoch": 2.7804878048780486,
"grad_norm": 0.6320463418960571,
"learning_rate": 0.0001,
"loss": 0.4062,
"step": 456
},
{
"epoch": 2.7865853658536586,
"grad_norm": 0.6329071521759033,
"learning_rate": 0.0001,
"loss": 0.3962,
"step": 457
},
{
"epoch": 2.7926829268292686,
"grad_norm": 0.6450055241584778,
"learning_rate": 0.0001,
"loss": 0.4096,
"step": 458
},
{
"epoch": 2.798780487804878,
"grad_norm": 0.6559624671936035,
"learning_rate": 0.0001,
"loss": 0.4015,
"step": 459
},
{
"epoch": 2.8048780487804876,
"grad_norm": 0.5944327116012573,
"learning_rate": 0.0001,
"loss": 0.3864,
"step": 460
},
{
"epoch": 2.8109756097560976,
"grad_norm": 0.6524405479431152,
"learning_rate": 0.0001,
"loss": 0.4245,
"step": 461
},
{
"epoch": 2.817073170731707,
"grad_norm": 0.6659778952598572,
"learning_rate": 0.0001,
"loss": 0.419,
"step": 462
},
{
"epoch": 2.823170731707317,
"grad_norm": 0.6520142555236816,
"learning_rate": 0.0001,
"loss": 0.4163,
"step": 463
},
{
"epoch": 2.8292682926829267,
"grad_norm": 0.6226247549057007,
"learning_rate": 0.0001,
"loss": 0.3898,
"step": 464
},
{
"epoch": 2.8353658536585367,
"grad_norm": 0.6132051348686218,
"learning_rate": 0.0001,
"loss": 0.3854,
"step": 465
},
{
"epoch": 2.841463414634146,
"grad_norm": 0.6409340500831604,
"learning_rate": 0.0001,
"loss": 0.3663,
"step": 466
},
{
"epoch": 2.847560975609756,
"grad_norm": 0.638858437538147,
"learning_rate": 0.0001,
"loss": 0.381,
"step": 467
},
{
"epoch": 2.8536585365853657,
"grad_norm": 0.6682012677192688,
"learning_rate": 0.0001,
"loss": 0.4027,
"step": 468
},
{
"epoch": 2.8597560975609757,
"grad_norm": 0.6829751133918762,
"learning_rate": 0.0001,
"loss": 0.4232,
"step": 469
},
{
"epoch": 2.8658536585365852,
"grad_norm": 0.6196625232696533,
"learning_rate": 0.0001,
"loss": 0.3629,
"step": 470
},
{
"epoch": 2.8719512195121952,
"grad_norm": 0.6654703617095947,
"learning_rate": 0.0001,
"loss": 0.4071,
"step": 471
},
{
"epoch": 2.8780487804878048,
"grad_norm": 0.6258810758590698,
"learning_rate": 0.0001,
"loss": 0.3893,
"step": 472
},
{
"epoch": 2.8841463414634148,
"grad_norm": 0.6281041502952576,
"learning_rate": 0.0001,
"loss": 0.3978,
"step": 473
},
{
"epoch": 2.8902439024390243,
"grad_norm": 0.6136834621429443,
"learning_rate": 0.0001,
"loss": 0.4258,
"step": 474
},
{
"epoch": 2.8963414634146343,
"grad_norm": 0.6135198473930359,
"learning_rate": 0.0001,
"loss": 0.3793,
"step": 475
},
{
"epoch": 2.902439024390244,
"grad_norm": 0.6039949059486389,
"learning_rate": 0.0001,
"loss": 0.4034,
"step": 476
},
{
"epoch": 2.908536585365854,
"grad_norm": 0.6059561967849731,
"learning_rate": 0.0001,
"loss": 0.3997,
"step": 477
},
{
"epoch": 2.9146341463414633,
"grad_norm": 0.6142321825027466,
"learning_rate": 0.0001,
"loss": 0.3778,
"step": 478
},
{
"epoch": 2.9207317073170733,
"grad_norm": 0.6661014556884766,
"learning_rate": 0.0001,
"loss": 0.4241,
"step": 479
},
{
"epoch": 2.926829268292683,
"grad_norm": 0.6781815886497498,
"learning_rate": 0.0001,
"loss": 0.3969,
"step": 480
},
{
"epoch": 2.932926829268293,
"grad_norm": 0.6294031739234924,
"learning_rate": 0.0001,
"loss": 0.3768,
"step": 481
},
{
"epoch": 2.9390243902439024,
"grad_norm": 0.6458147764205933,
"learning_rate": 0.0001,
"loss": 0.393,
"step": 482
},
{
"epoch": 2.9451219512195124,
"grad_norm": 0.5952702760696411,
"learning_rate": 0.0001,
"loss": 0.3844,
"step": 483
},
{
"epoch": 2.951219512195122,
"grad_norm": 0.5768480896949768,
"learning_rate": 0.0001,
"loss": 0.3893,
"step": 484
},
{
"epoch": 2.9573170731707314,
"grad_norm": 0.6429164409637451,
"learning_rate": 0.0001,
"loss": 0.4078,
"step": 485
},
{
"epoch": 2.9634146341463414,
"grad_norm": 0.5966724753379822,
"learning_rate": 0.0001,
"loss": 0.3689,
"step": 486
},
{
"epoch": 2.9695121951219514,
"grad_norm": 0.6305826306343079,
"learning_rate": 0.0001,
"loss": 0.3982,
"step": 487
},
{
"epoch": 2.975609756097561,
"grad_norm": 0.6368945240974426,
"learning_rate": 0.0001,
"loss": 0.4033,
"step": 488
},
{
"epoch": 2.9817073170731705,
"grad_norm": 0.6413828730583191,
"learning_rate": 0.0001,
"loss": 0.392,
"step": 489
},
{
"epoch": 2.9878048780487805,
"grad_norm": 0.626516580581665,
"learning_rate": 0.0001,
"loss": 0.3908,
"step": 490
},
{
"epoch": 2.9939024390243905,
"grad_norm": 0.6416463255882263,
"learning_rate": 0.0001,
"loss": 0.397,
"step": 491
},
{
"epoch": 3.0,
"grad_norm": 0.6507825255393982,
"learning_rate": 0.0001,
"loss": 0.3931,
"step": 492
}
],
"logging_steps": 1,
"max_steps": 492,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.340951084078203e+17,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}