camembertv2-base-xnli / trainer_state.json
wissamantoun's picture
Upload folder using huggingface_hub
c53d8e3 verified
{
"best_metric": 0.8285140562248996,
"best_model_checkpoint": "/scratch/camembertv2/runs/results/xnli/camembertv2-base-bf16-p2-17000/max_seq_length-160-gradient_accumulation_steps-4-precision-fp32-learning_rate-1e-05-epochs-10-lr_scheduler-cosine-warmup_steps-0.1/SEED-666/checkpoint-61360",
"epoch": 10.0,
"eval_steps": 500,
"global_step": 122720,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008148631029986962,
"grad_norm": 12.58836841583252,
"learning_rate": 8.148631029986963e-08,
"loss": 1.1012,
"step": 100
},
{
"epoch": 0.016297262059973925,
"grad_norm": 1.359410285949707,
"learning_rate": 1.6297262059973925e-07,
"loss": 1.1011,
"step": 200
},
{
"epoch": 0.024445893089960886,
"grad_norm": 1.128892183303833,
"learning_rate": 2.4445893089960885e-07,
"loss": 1.0978,
"step": 300
},
{
"epoch": 0.03259452411994785,
"grad_norm": 1.3794234991073608,
"learning_rate": 3.259452411994785e-07,
"loss": 1.0999,
"step": 400
},
{
"epoch": 0.04074315514993481,
"grad_norm": 1.3247599601745605,
"learning_rate": 4.0743155149934816e-07,
"loss": 1.0984,
"step": 500
},
{
"epoch": 0.04889178617992177,
"grad_norm": 0.9611015319824219,
"learning_rate": 4.889178617992177e-07,
"loss": 1.1001,
"step": 600
},
{
"epoch": 0.05704041720990873,
"grad_norm": 0.9682479500770569,
"learning_rate": 5.704041720990874e-07,
"loss": 1.0985,
"step": 700
},
{
"epoch": 0.0651890482398957,
"grad_norm": 1.950333833694458,
"learning_rate": 6.51890482398957e-07,
"loss": 1.0989,
"step": 800
},
{
"epoch": 0.07333767926988266,
"grad_norm": 1.4916733503341675,
"learning_rate": 7.333767926988267e-07,
"loss": 1.0964,
"step": 900
},
{
"epoch": 0.08148631029986962,
"grad_norm": 1.1135200262069702,
"learning_rate": 8.148631029986963e-07,
"loss": 1.096,
"step": 1000
},
{
"epoch": 0.08963494132985658,
"grad_norm": 1.773497462272644,
"learning_rate": 8.963494132985659e-07,
"loss": 1.094,
"step": 1100
},
{
"epoch": 0.09778357235984354,
"grad_norm": 1.5511926412582397,
"learning_rate": 9.778357235984354e-07,
"loss": 1.093,
"step": 1200
},
{
"epoch": 0.1059322033898305,
"grad_norm": 1.389298915863037,
"learning_rate": 1.059322033898305e-06,
"loss": 1.0871,
"step": 1300
},
{
"epoch": 0.11408083441981746,
"grad_norm": 2.486689329147339,
"learning_rate": 1.1408083441981747e-06,
"loss": 1.0751,
"step": 1400
},
{
"epoch": 0.12222946544980444,
"grad_norm": 2.697650194168091,
"learning_rate": 1.2222946544980446e-06,
"loss": 1.0505,
"step": 1500
},
{
"epoch": 0.1303780964797914,
"grad_norm": 3.557525157928467,
"learning_rate": 1.303780964797914e-06,
"loss": 1.0393,
"step": 1600
},
{
"epoch": 0.13852672750977835,
"grad_norm": 4.691379070281982,
"learning_rate": 1.3852672750977837e-06,
"loss": 1.0147,
"step": 1700
},
{
"epoch": 0.14667535853976532,
"grad_norm": 5.234630107879639,
"learning_rate": 1.4667535853976533e-06,
"loss": 0.9971,
"step": 1800
},
{
"epoch": 0.15482398956975227,
"grad_norm": 6.027713298797607,
"learning_rate": 1.5482398956975228e-06,
"loss": 1.0007,
"step": 1900
},
{
"epoch": 0.16297262059973924,
"grad_norm": 13.33498477935791,
"learning_rate": 1.6297262059973926e-06,
"loss": 0.984,
"step": 2000
},
{
"epoch": 0.17112125162972622,
"grad_norm": 9.432430267333984,
"learning_rate": 1.7112125162972623e-06,
"loss": 0.9633,
"step": 2100
},
{
"epoch": 0.17926988265971316,
"grad_norm": 7.303864479064941,
"learning_rate": 1.7926988265971317e-06,
"loss": 0.9463,
"step": 2200
},
{
"epoch": 0.18741851368970014,
"grad_norm": 6.125274181365967,
"learning_rate": 1.8741851368970016e-06,
"loss": 0.9336,
"step": 2300
},
{
"epoch": 0.19556714471968709,
"grad_norm": 6.614850044250488,
"learning_rate": 1.955671447196871e-06,
"loss": 0.9388,
"step": 2400
},
{
"epoch": 0.20371577574967406,
"grad_norm": 7.883510589599609,
"learning_rate": 2.037157757496741e-06,
"loss": 0.9122,
"step": 2500
},
{
"epoch": 0.211864406779661,
"grad_norm": 6.615538597106934,
"learning_rate": 2.11864406779661e-06,
"loss": 0.8907,
"step": 2600
},
{
"epoch": 0.22001303780964798,
"grad_norm": 6.040781021118164,
"learning_rate": 2.20013037809648e-06,
"loss": 0.8725,
"step": 2700
},
{
"epoch": 0.22816166883963493,
"grad_norm": 9.688776016235352,
"learning_rate": 2.2816166883963494e-06,
"loss": 0.8674,
"step": 2800
},
{
"epoch": 0.2363102998696219,
"grad_norm": 15.747467994689941,
"learning_rate": 2.363102998696219e-06,
"loss": 0.8199,
"step": 2900
},
{
"epoch": 0.24445893089960888,
"grad_norm": 9.381732940673828,
"learning_rate": 2.444589308996089e-06,
"loss": 0.831,
"step": 3000
},
{
"epoch": 0.2526075619295958,
"grad_norm": 8.603889465332031,
"learning_rate": 2.5260756192959584e-06,
"loss": 0.811,
"step": 3100
},
{
"epoch": 0.2607561929595828,
"grad_norm": 11.614546775817871,
"learning_rate": 2.607561929595828e-06,
"loss": 0.789,
"step": 3200
},
{
"epoch": 0.2689048239895698,
"grad_norm": 7.733945846557617,
"learning_rate": 2.689048239895698e-06,
"loss": 0.7947,
"step": 3300
},
{
"epoch": 0.2770534550195567,
"grad_norm": 14.573506355285645,
"learning_rate": 2.7705345501955674e-06,
"loss": 0.7913,
"step": 3400
},
{
"epoch": 0.28520208604954367,
"grad_norm": 11.938140869140625,
"learning_rate": 2.852020860495437e-06,
"loss": 0.793,
"step": 3500
},
{
"epoch": 0.29335071707953064,
"grad_norm": 9.235187530517578,
"learning_rate": 2.9335071707953067e-06,
"loss": 0.7538,
"step": 3600
},
{
"epoch": 0.3014993481095176,
"grad_norm": 9.092159271240234,
"learning_rate": 3.0149934810951763e-06,
"loss": 0.7547,
"step": 3700
},
{
"epoch": 0.30964797913950454,
"grad_norm": 11.72921371459961,
"learning_rate": 3.0964797913950456e-06,
"loss": 0.7461,
"step": 3800
},
{
"epoch": 0.3177966101694915,
"grad_norm": 15.118708610534668,
"learning_rate": 3.1779661016949152e-06,
"loss": 0.7171,
"step": 3900
},
{
"epoch": 0.3259452411994785,
"grad_norm": 17.719839096069336,
"learning_rate": 3.2594524119947853e-06,
"loss": 0.7027,
"step": 4000
},
{
"epoch": 0.33409387222946546,
"grad_norm": 10.063789367675781,
"learning_rate": 3.340938722294655e-06,
"loss": 0.7229,
"step": 4100
},
{
"epoch": 0.34224250325945244,
"grad_norm": 8.052227020263672,
"learning_rate": 3.4224250325945246e-06,
"loss": 0.7218,
"step": 4200
},
{
"epoch": 0.35039113428943935,
"grad_norm": 9.68342399597168,
"learning_rate": 3.503911342894394e-06,
"loss": 0.6873,
"step": 4300
},
{
"epoch": 0.35853976531942633,
"grad_norm": 9.140670776367188,
"learning_rate": 3.5853976531942635e-06,
"loss": 0.702,
"step": 4400
},
{
"epoch": 0.3666883963494133,
"grad_norm": 8.805059432983398,
"learning_rate": 3.666883963494133e-06,
"loss": 0.7245,
"step": 4500
},
{
"epoch": 0.3748370273794003,
"grad_norm": 7.228201389312744,
"learning_rate": 3.748370273794003e-06,
"loss": 0.6651,
"step": 4600
},
{
"epoch": 0.3829856584093872,
"grad_norm": 8.284133911132812,
"learning_rate": 3.829856584093872e-06,
"loss": 0.6956,
"step": 4700
},
{
"epoch": 0.39113428943937417,
"grad_norm": 8.938249588012695,
"learning_rate": 3.911342894393742e-06,
"loss": 0.6777,
"step": 4800
},
{
"epoch": 0.39928292046936115,
"grad_norm": 10.810254096984863,
"learning_rate": 3.992829204693612e-06,
"loss": 0.6803,
"step": 4900
},
{
"epoch": 0.4074315514993481,
"grad_norm": 11.629922866821289,
"learning_rate": 4.074315514993482e-06,
"loss": 0.6659,
"step": 5000
},
{
"epoch": 0.4155801825293351,
"grad_norm": 7.82265043258667,
"learning_rate": 4.1558018252933515e-06,
"loss": 0.6842,
"step": 5100
},
{
"epoch": 0.423728813559322,
"grad_norm": 9.290712356567383,
"learning_rate": 4.23728813559322e-06,
"loss": 0.6711,
"step": 5200
},
{
"epoch": 0.431877444589309,
"grad_norm": 10.643411636352539,
"learning_rate": 4.31877444589309e-06,
"loss": 0.6521,
"step": 5300
},
{
"epoch": 0.44002607561929596,
"grad_norm": 8.533503532409668,
"learning_rate": 4.40026075619296e-06,
"loss": 0.6613,
"step": 5400
},
{
"epoch": 0.44817470664928294,
"grad_norm": 12.260805130004883,
"learning_rate": 4.48174706649283e-06,
"loss": 0.6512,
"step": 5500
},
{
"epoch": 0.45632333767926986,
"grad_norm": 7.977556228637695,
"learning_rate": 4.563233376792699e-06,
"loss": 0.6499,
"step": 5600
},
{
"epoch": 0.46447196870925683,
"grad_norm": 7.418649673461914,
"learning_rate": 4.6447196870925686e-06,
"loss": 0.6591,
"step": 5700
},
{
"epoch": 0.4726205997392438,
"grad_norm": 10.594202995300293,
"learning_rate": 4.726205997392438e-06,
"loss": 0.6497,
"step": 5800
},
{
"epoch": 0.4807692307692308,
"grad_norm": 11.133523941040039,
"learning_rate": 4.807692307692308e-06,
"loss": 0.6538,
"step": 5900
},
{
"epoch": 0.48891786179921776,
"grad_norm": 12.108560562133789,
"learning_rate": 4.889178617992178e-06,
"loss": 0.6195,
"step": 6000
},
{
"epoch": 0.4970664928292047,
"grad_norm": 9.70545482635498,
"learning_rate": 4.970664928292047e-06,
"loss": 0.6351,
"step": 6100
},
{
"epoch": 0.5052151238591917,
"grad_norm": 12.699902534484863,
"learning_rate": 5.052151238591917e-06,
"loss": 0.6557,
"step": 6200
},
{
"epoch": 0.5133637548891786,
"grad_norm": 10.324420928955078,
"learning_rate": 5.1336375488917865e-06,
"loss": 0.6415,
"step": 6300
},
{
"epoch": 0.5215123859191656,
"grad_norm": 10.3858642578125,
"learning_rate": 5.215123859191656e-06,
"loss": 0.624,
"step": 6400
},
{
"epoch": 0.5296610169491526,
"grad_norm": 13.573092460632324,
"learning_rate": 5.296610169491526e-06,
"loss": 0.6622,
"step": 6500
},
{
"epoch": 0.5378096479791395,
"grad_norm": 8.366503715515137,
"learning_rate": 5.378096479791396e-06,
"loss": 0.6166,
"step": 6600
},
{
"epoch": 0.5459582790091264,
"grad_norm": 6.413454532623291,
"learning_rate": 5.459582790091264e-06,
"loss": 0.6315,
"step": 6700
},
{
"epoch": 0.5541069100391134,
"grad_norm": 7.670026779174805,
"learning_rate": 5.541069100391135e-06,
"loss": 0.612,
"step": 6800
},
{
"epoch": 0.5622555410691004,
"grad_norm": 10.53145694732666,
"learning_rate": 5.622555410691004e-06,
"loss": 0.6167,
"step": 6900
},
{
"epoch": 0.5704041720990873,
"grad_norm": 6.5404462814331055,
"learning_rate": 5.704041720990874e-06,
"loss": 0.6226,
"step": 7000
},
{
"epoch": 0.5785528031290743,
"grad_norm": 9.084834098815918,
"learning_rate": 5.785528031290744e-06,
"loss": 0.6214,
"step": 7100
},
{
"epoch": 0.5867014341590613,
"grad_norm": 9.231087684631348,
"learning_rate": 5.867014341590613e-06,
"loss": 0.6245,
"step": 7200
},
{
"epoch": 0.5948500651890483,
"grad_norm": 8.526376724243164,
"learning_rate": 5.948500651890483e-06,
"loss": 0.6205,
"step": 7300
},
{
"epoch": 0.6029986962190352,
"grad_norm": 9.337794303894043,
"learning_rate": 6.029986962190353e-06,
"loss": 0.6156,
"step": 7400
},
{
"epoch": 0.6111473272490222,
"grad_norm": 8.846671104431152,
"learning_rate": 6.111473272490222e-06,
"loss": 0.6142,
"step": 7500
},
{
"epoch": 0.6192959582790091,
"grad_norm": 8.68179988861084,
"learning_rate": 6.192959582790091e-06,
"loss": 0.6218,
"step": 7600
},
{
"epoch": 0.627444589308996,
"grad_norm": 9.76940631866455,
"learning_rate": 6.274445893089961e-06,
"loss": 0.587,
"step": 7700
},
{
"epoch": 0.635593220338983,
"grad_norm": 7.811220169067383,
"learning_rate": 6.3559322033898304e-06,
"loss": 0.6002,
"step": 7800
},
{
"epoch": 0.64374185136897,
"grad_norm": 8.950928688049316,
"learning_rate": 6.4374185136897e-06,
"loss": 0.6032,
"step": 7900
},
{
"epoch": 0.651890482398957,
"grad_norm": 6.704097270965576,
"learning_rate": 6.518904823989571e-06,
"loss": 0.5993,
"step": 8000
},
{
"epoch": 0.660039113428944,
"grad_norm": 11.18411922454834,
"learning_rate": 6.60039113428944e-06,
"loss": 0.6035,
"step": 8100
},
{
"epoch": 0.6681877444589309,
"grad_norm": 8.417338371276855,
"learning_rate": 6.68187744458931e-06,
"loss": 0.624,
"step": 8200
},
{
"epoch": 0.6763363754889179,
"grad_norm": 9.916496276855469,
"learning_rate": 6.7633637548891795e-06,
"loss": 0.6275,
"step": 8300
},
{
"epoch": 0.6844850065189049,
"grad_norm": 8.701171875,
"learning_rate": 6.844850065189049e-06,
"loss": 0.5773,
"step": 8400
},
{
"epoch": 0.6926336375488917,
"grad_norm": 10.245955467224121,
"learning_rate": 6.926336375488918e-06,
"loss": 0.6139,
"step": 8500
},
{
"epoch": 0.7007822685788787,
"grad_norm": 6.190640926361084,
"learning_rate": 7.007822685788788e-06,
"loss": 0.5833,
"step": 8600
},
{
"epoch": 0.7089308996088657,
"grad_norm": 10.875850677490234,
"learning_rate": 7.089308996088657e-06,
"loss": 0.6,
"step": 8700
},
{
"epoch": 0.7170795306388527,
"grad_norm": 8.644452095031738,
"learning_rate": 7.170795306388527e-06,
"loss": 0.6097,
"step": 8800
},
{
"epoch": 0.7252281616688396,
"grad_norm": 8.089356422424316,
"learning_rate": 7.252281616688397e-06,
"loss": 0.583,
"step": 8900
},
{
"epoch": 0.7333767926988266,
"grad_norm": 12.513883590698242,
"learning_rate": 7.333767926988266e-06,
"loss": 0.5669,
"step": 9000
},
{
"epoch": 0.7415254237288136,
"grad_norm": 9.404706001281738,
"learning_rate": 7.415254237288137e-06,
"loss": 0.5833,
"step": 9100
},
{
"epoch": 0.7496740547588006,
"grad_norm": 6.789037227630615,
"learning_rate": 7.496740547588006e-06,
"loss": 0.5985,
"step": 9200
},
{
"epoch": 0.7578226857887875,
"grad_norm": 7.355409145355225,
"learning_rate": 7.578226857887876e-06,
"loss": 0.5686,
"step": 9300
},
{
"epoch": 0.7659713168187744,
"grad_norm": 7.175694465637207,
"learning_rate": 7.659713168187744e-06,
"loss": 0.5991,
"step": 9400
},
{
"epoch": 0.7741199478487614,
"grad_norm": 6.2600274085998535,
"learning_rate": 7.741199478487615e-06,
"loss": 0.5803,
"step": 9500
},
{
"epoch": 0.7822685788787483,
"grad_norm": 11.514883995056152,
"learning_rate": 7.822685788787483e-06,
"loss": 0.5802,
"step": 9600
},
{
"epoch": 0.7904172099087353,
"grad_norm": 6.594653129577637,
"learning_rate": 7.904172099087354e-06,
"loss": 0.5772,
"step": 9700
},
{
"epoch": 0.7985658409387223,
"grad_norm": 10.59202766418457,
"learning_rate": 7.985658409387224e-06,
"loss": 0.5848,
"step": 9800
},
{
"epoch": 0.8067144719687093,
"grad_norm": 7.8735151290893555,
"learning_rate": 8.067144719687093e-06,
"loss": 0.5813,
"step": 9900
},
{
"epoch": 0.8148631029986962,
"grad_norm": 9.064979553222656,
"learning_rate": 8.148631029986964e-06,
"loss": 0.5792,
"step": 10000
},
{
"epoch": 0.8230117340286832,
"grad_norm": 10.0288667678833,
"learning_rate": 8.230117340286832e-06,
"loss": 0.5622,
"step": 10100
},
{
"epoch": 0.8311603650586702,
"grad_norm": 8.7724609375,
"learning_rate": 8.311603650586703e-06,
"loss": 0.5767,
"step": 10200
},
{
"epoch": 0.8393089960886571,
"grad_norm": 8.127886772155762,
"learning_rate": 8.393089960886572e-06,
"loss": 0.5721,
"step": 10300
},
{
"epoch": 0.847457627118644,
"grad_norm": 7.77069616317749,
"learning_rate": 8.47457627118644e-06,
"loss": 0.5925,
"step": 10400
},
{
"epoch": 0.855606258148631,
"grad_norm": 7.864415645599365,
"learning_rate": 8.556062581486311e-06,
"loss": 0.5805,
"step": 10500
},
{
"epoch": 0.863754889178618,
"grad_norm": 7.0319952964782715,
"learning_rate": 8.63754889178618e-06,
"loss": 0.577,
"step": 10600
},
{
"epoch": 0.871903520208605,
"grad_norm": 7.513912677764893,
"learning_rate": 8.71903520208605e-06,
"loss": 0.5978,
"step": 10700
},
{
"epoch": 0.8800521512385919,
"grad_norm": 8.28197193145752,
"learning_rate": 8.80052151238592e-06,
"loss": 0.5912,
"step": 10800
},
{
"epoch": 0.8882007822685789,
"grad_norm": 7.632150650024414,
"learning_rate": 8.88200782268579e-06,
"loss": 0.5706,
"step": 10900
},
{
"epoch": 0.8963494132985659,
"grad_norm": 7.691524028778076,
"learning_rate": 8.96349413298566e-06,
"loss": 0.5612,
"step": 11000
},
{
"epoch": 0.9044980443285529,
"grad_norm": 8.549062728881836,
"learning_rate": 9.044980443285529e-06,
"loss": 0.5494,
"step": 11100
},
{
"epoch": 0.9126466753585397,
"grad_norm": 10.64492416381836,
"learning_rate": 9.126466753585398e-06,
"loss": 0.5629,
"step": 11200
},
{
"epoch": 0.9207953063885267,
"grad_norm": 7.610856056213379,
"learning_rate": 9.207953063885268e-06,
"loss": 0.5627,
"step": 11300
},
{
"epoch": 0.9289439374185137,
"grad_norm": 10.41044807434082,
"learning_rate": 9.289439374185137e-06,
"loss": 0.5756,
"step": 11400
},
{
"epoch": 0.9370925684485006,
"grad_norm": 6.464520454406738,
"learning_rate": 9.370925684485008e-06,
"loss": 0.5817,
"step": 11500
},
{
"epoch": 0.9452411994784876,
"grad_norm": 12.031845092773438,
"learning_rate": 9.452411994784876e-06,
"loss": 0.5761,
"step": 11600
},
{
"epoch": 0.9533898305084746,
"grad_norm": 8.345417022705078,
"learning_rate": 9.533898305084747e-06,
"loss": 0.5789,
"step": 11700
},
{
"epoch": 0.9615384615384616,
"grad_norm": 8.58055305480957,
"learning_rate": 9.615384615384616e-06,
"loss": 0.5745,
"step": 11800
},
{
"epoch": 0.9696870925684485,
"grad_norm": 5.948461532592773,
"learning_rate": 9.696870925684486e-06,
"loss": 0.5695,
"step": 11900
},
{
"epoch": 0.9778357235984355,
"grad_norm": 8.523883819580078,
"learning_rate": 9.778357235984357e-06,
"loss": 0.575,
"step": 12000
},
{
"epoch": 0.9859843546284224,
"grad_norm": 8.530996322631836,
"learning_rate": 9.859843546284224e-06,
"loss": 0.5496,
"step": 12100
},
{
"epoch": 0.9941329856584094,
"grad_norm": 8.197943687438965,
"learning_rate": 9.941329856584094e-06,
"loss": 0.5929,
"step": 12200
},
{
"epoch": 1.0,
"eval_accuracy": 0.8028112449799196,
"eval_loss": 0.5100582242012024,
"eval_runtime": 7.5718,
"eval_samples_per_second": 328.853,
"eval_steps_per_second": 41.206,
"step": 12272
},
{
"epoch": 1.0022816166883963,
"grad_norm": 8.692102432250977,
"learning_rate": 9.999998414230423e-06,
"loss": 0.5456,
"step": 12300
},
{
"epoch": 1.0104302477183833,
"grad_norm": 6.663279056549072,
"learning_rate": 9.999966860686959e-06,
"loss": 0.546,
"step": 12400
},
{
"epoch": 1.0185788787483703,
"grad_norm": 7.53484582901001,
"learning_rate": 9.999894854131206e-06,
"loss": 0.5182,
"step": 12500
},
{
"epoch": 1.0267275097783573,
"grad_norm": 6.181861877441406,
"learning_rate": 9.999782395145752e-06,
"loss": 0.5093,
"step": 12600
},
{
"epoch": 1.0348761408083442,
"grad_norm": 9.323958396911621,
"learning_rate": 9.999629484640457e-06,
"loss": 0.528,
"step": 12700
},
{
"epoch": 1.0430247718383312,
"grad_norm": 10.739737510681152,
"learning_rate": 9.999436123852473e-06,
"loss": 0.5423,
"step": 12800
},
{
"epoch": 1.0511734028683182,
"grad_norm": 7.653073787689209,
"learning_rate": 9.99920231434621e-06,
"loss": 0.5215,
"step": 12900
},
{
"epoch": 1.0593220338983051,
"grad_norm": 6.83660888671875,
"learning_rate": 9.998928058013346e-06,
"loss": 0.5134,
"step": 13000
},
{
"epoch": 1.0674706649282921,
"grad_norm": 10.44430923461914,
"learning_rate": 9.99861335707279e-06,
"loss": 0.5466,
"step": 13100
},
{
"epoch": 1.075619295958279,
"grad_norm": 7.2710280418396,
"learning_rate": 9.998258214070683e-06,
"loss": 0.5364,
"step": 13200
},
{
"epoch": 1.083767926988266,
"grad_norm": 5.829804420471191,
"learning_rate": 9.997862631880362e-06,
"loss": 0.5146,
"step": 13300
},
{
"epoch": 1.0919165580182528,
"grad_norm": 8.52145767211914,
"learning_rate": 9.997426613702348e-06,
"loss": 0.5105,
"step": 13400
},
{
"epoch": 1.1000651890482398,
"grad_norm": 6.255794525146484,
"learning_rate": 9.996950163064313e-06,
"loss": 0.532,
"step": 13500
},
{
"epoch": 1.1082138200782268,
"grad_norm": 8.463394165039062,
"learning_rate": 9.996433283821057e-06,
"loss": 0.5265,
"step": 13600
},
{
"epoch": 1.1163624511082137,
"grad_norm": 9.939913749694824,
"learning_rate": 9.995875980154468e-06,
"loss": 0.5297,
"step": 13700
},
{
"epoch": 1.1245110821382007,
"grad_norm": 10.322543144226074,
"learning_rate": 9.995278256573504e-06,
"loss": 0.5413,
"step": 13800
},
{
"epoch": 1.1326597131681877,
"grad_norm": 7.6662445068359375,
"learning_rate": 9.994640117914139e-06,
"loss": 0.5197,
"step": 13900
},
{
"epoch": 1.1408083441981747,
"grad_norm": 12.555916786193848,
"learning_rate": 9.99396156933933e-06,
"loss": 0.5472,
"step": 14000
},
{
"epoch": 1.1489569752281616,
"grad_norm": 12.246332168579102,
"learning_rate": 9.993242616338983e-06,
"loss": 0.5296,
"step": 14100
},
{
"epoch": 1.1571056062581486,
"grad_norm": 11.406452178955078,
"learning_rate": 9.992483264729902e-06,
"loss": 0.5266,
"step": 14200
},
{
"epoch": 1.1652542372881356,
"grad_norm": 7.620953559875488,
"learning_rate": 9.991683520655735e-06,
"loss": 0.5267,
"step": 14300
},
{
"epoch": 1.1734028683181226,
"grad_norm": 7.820069789886475,
"learning_rate": 9.990843390586938e-06,
"loss": 0.5384,
"step": 14400
},
{
"epoch": 1.1815514993481095,
"grad_norm": 8.187140464782715,
"learning_rate": 9.989962881320714e-06,
"loss": 0.5071,
"step": 14500
},
{
"epoch": 1.1897001303780965,
"grad_norm": 8.322758674621582,
"learning_rate": 9.989041999980964e-06,
"loss": 0.5342,
"step": 14600
},
{
"epoch": 1.1978487614080835,
"grad_norm": 9.802703857421875,
"learning_rate": 9.988080754018218e-06,
"loss": 0.5205,
"step": 14700
},
{
"epoch": 1.2059973924380705,
"grad_norm": 9.249838829040527,
"learning_rate": 9.987079151209588e-06,
"loss": 0.5069,
"step": 14800
},
{
"epoch": 1.2141460234680574,
"grad_norm": 4.855494022369385,
"learning_rate": 9.986037199658698e-06,
"loss": 0.5107,
"step": 14900
},
{
"epoch": 1.2222946544980444,
"grad_norm": 9.250731468200684,
"learning_rate": 9.984954907795619e-06,
"loss": 0.5093,
"step": 15000
},
{
"epoch": 1.2304432855280312,
"grad_norm": 5.86234712600708,
"learning_rate": 9.983832284376804e-06,
"loss": 0.5539,
"step": 15100
},
{
"epoch": 1.2385919165580184,
"grad_norm": 13.074224472045898,
"learning_rate": 9.982669338485012e-06,
"loss": 0.5248,
"step": 15200
},
{
"epoch": 1.2467405475880051,
"grad_norm": 12.13022518157959,
"learning_rate": 9.981466079529236e-06,
"loss": 0.5415,
"step": 15300
},
{
"epoch": 1.254889178617992,
"grad_norm": 9.259481430053711,
"learning_rate": 9.980222517244633e-06,
"loss": 0.5224,
"step": 15400
},
{
"epoch": 1.263037809647979,
"grad_norm": 7.281178951263428,
"learning_rate": 9.978938661692439e-06,
"loss": 0.5363,
"step": 15500
},
{
"epoch": 1.271186440677966,
"grad_norm": 12.429268836975098,
"learning_rate": 9.977614523259884e-06,
"loss": 0.5257,
"step": 15600
},
{
"epoch": 1.279335071707953,
"grad_norm": 8.357499122619629,
"learning_rate": 9.97625011266012e-06,
"loss": 0.5151,
"step": 15700
},
{
"epoch": 1.28748370273794,
"grad_norm": 7.741194725036621,
"learning_rate": 9.974845440932121e-06,
"loss": 0.4973,
"step": 15800
},
{
"epoch": 1.295632333767927,
"grad_norm": 12.34659481048584,
"learning_rate": 9.973400519440605e-06,
"loss": 0.5275,
"step": 15900
},
{
"epoch": 1.303780964797914,
"grad_norm": 7.972919940948486,
"learning_rate": 9.971915359875935e-06,
"loss": 0.5196,
"step": 16000
},
{
"epoch": 1.311929595827901,
"grad_norm": 6.398066520690918,
"learning_rate": 9.970389974254025e-06,
"loss": 0.5239,
"step": 16100
},
{
"epoch": 1.320078226857888,
"grad_norm": 9.441793441772461,
"learning_rate": 9.968824374916245e-06,
"loss": 0.5141,
"step": 16200
},
{
"epoch": 1.3282268578878749,
"grad_norm": 8.154695510864258,
"learning_rate": 9.967218574529323e-06,
"loss": 0.5179,
"step": 16300
},
{
"epoch": 1.3363754889178618,
"grad_norm": 9.219006538391113,
"learning_rate": 9.965572586085235e-06,
"loss": 0.4859,
"step": 16400
},
{
"epoch": 1.3445241199478488,
"grad_norm": 7.020698070526123,
"learning_rate": 9.96388642290111e-06,
"loss": 0.5128,
"step": 16500
},
{
"epoch": 1.3526727509778358,
"grad_norm": 7.134260654449463,
"learning_rate": 9.96216009861911e-06,
"loss": 0.5067,
"step": 16600
},
{
"epoch": 1.3608213820078228,
"grad_norm": 6.663614273071289,
"learning_rate": 9.96039362720634e-06,
"loss": 0.5352,
"step": 16700
},
{
"epoch": 1.3689700130378095,
"grad_norm": 7.817680358886719,
"learning_rate": 9.958587022954704e-06,
"loss": 0.5143,
"step": 16800
},
{
"epoch": 1.3771186440677967,
"grad_norm": 8.092264175415039,
"learning_rate": 9.956740300480818e-06,
"loss": 0.5111,
"step": 16900
},
{
"epoch": 1.3852672750977835,
"grad_norm": 7.305174350738525,
"learning_rate": 9.954853474725878e-06,
"loss": 0.5432,
"step": 17000
},
{
"epoch": 1.3934159061277707,
"grad_norm": 7.337920188903809,
"learning_rate": 9.952926560955547e-06,
"loss": 0.5279,
"step": 17100
},
{
"epoch": 1.4015645371577574,
"grad_norm": 8.824036598205566,
"learning_rate": 9.950959574759815e-06,
"loss": 0.5073,
"step": 17200
},
{
"epoch": 1.4097131681877444,
"grad_norm": 5.825498580932617,
"learning_rate": 9.948952532052895e-06,
"loss": 0.5208,
"step": 17300
},
{
"epoch": 1.4178617992177314,
"grad_norm": 6.746844291687012,
"learning_rate": 9.946905449073077e-06,
"loss": 0.5245,
"step": 17400
},
{
"epoch": 1.4260104302477183,
"grad_norm": 9.570401191711426,
"learning_rate": 9.944818342382607e-06,
"loss": 0.5056,
"step": 17500
},
{
"epoch": 1.4341590612777053,
"grad_norm": 8.143331527709961,
"learning_rate": 9.942691228867548e-06,
"loss": 0.5066,
"step": 17600
},
{
"epoch": 1.4423076923076923,
"grad_norm": 8.18307113647461,
"learning_rate": 9.940524125737641e-06,
"loss": 0.4933,
"step": 17700
},
{
"epoch": 1.4504563233376793,
"grad_norm": 9.306159019470215,
"learning_rate": 9.938317050526173e-06,
"loss": 0.5092,
"step": 17800
},
{
"epoch": 1.4586049543676662,
"grad_norm": 7.026943206787109,
"learning_rate": 9.936070021089834e-06,
"loss": 0.5071,
"step": 17900
},
{
"epoch": 1.4667535853976532,
"grad_norm": 8.45121955871582,
"learning_rate": 9.933783055608562e-06,
"loss": 0.5193,
"step": 18000
},
{
"epoch": 1.4749022164276402,
"grad_norm": 5.932709217071533,
"learning_rate": 9.93145617258541e-06,
"loss": 0.5311,
"step": 18100
},
{
"epoch": 1.4830508474576272,
"grad_norm": 8.077872276306152,
"learning_rate": 9.929089390846389e-06,
"loss": 0.4887,
"step": 18200
},
{
"epoch": 1.4911994784876141,
"grad_norm": 10.298677444458008,
"learning_rate": 9.926682729540313e-06,
"loss": 0.5006,
"step": 18300
},
{
"epoch": 1.4993481095176011,
"grad_norm": 7.896773815155029,
"learning_rate": 9.924236208138656e-06,
"loss": 0.4828,
"step": 18400
},
{
"epoch": 1.5074967405475879,
"grad_norm": 10.591178894042969,
"learning_rate": 9.921749846435375e-06,
"loss": 0.4936,
"step": 18500
},
{
"epoch": 1.515645371577575,
"grad_norm": 8.356033325195312,
"learning_rate": 9.919223664546774e-06,
"loss": 0.5271,
"step": 18600
},
{
"epoch": 1.5237940026075618,
"grad_norm": 9.826644897460938,
"learning_rate": 9.916657682911317e-06,
"loss": 0.5115,
"step": 18700
},
{
"epoch": 1.531942633637549,
"grad_norm": 7.742495536804199,
"learning_rate": 9.914051922289482e-06,
"loss": 0.5037,
"step": 18800
},
{
"epoch": 1.5400912646675358,
"grad_norm": 6.355010032653809,
"learning_rate": 9.91140640376358e-06,
"loss": 0.5047,
"step": 18900
},
{
"epoch": 1.548239895697523,
"grad_norm": 11.718524932861328,
"learning_rate": 9.908721148737591e-06,
"loss": 0.5074,
"step": 19000
},
{
"epoch": 1.5563885267275097,
"grad_norm": 6.173713207244873,
"learning_rate": 9.905996178936991e-06,
"loss": 0.5367,
"step": 19100
},
{
"epoch": 1.5645371577574967,
"grad_norm": 10.962457656860352,
"learning_rate": 9.903231516408576e-06,
"loss": 0.4991,
"step": 19200
},
{
"epoch": 1.5726857887874837,
"grad_norm": 6.949578285217285,
"learning_rate": 9.900427183520276e-06,
"loss": 0.4935,
"step": 19300
},
{
"epoch": 1.5808344198174706,
"grad_norm": 6.240306854248047,
"learning_rate": 9.897583202960985e-06,
"loss": 0.5136,
"step": 19400
},
{
"epoch": 1.5889830508474576,
"grad_norm": 6.609454154968262,
"learning_rate": 9.89469959774037e-06,
"loss": 0.4972,
"step": 19500
},
{
"epoch": 1.5971316818774446,
"grad_norm": 8.191039085388184,
"learning_rate": 9.891776391188694e-06,
"loss": 0.5202,
"step": 19600
},
{
"epoch": 1.6052803129074316,
"grad_norm": 7.624372959136963,
"learning_rate": 9.888813606956612e-06,
"loss": 0.515,
"step": 19700
},
{
"epoch": 1.6134289439374185,
"grad_norm": 8.45014476776123,
"learning_rate": 9.885811269014992e-06,
"loss": 0.517,
"step": 19800
},
{
"epoch": 1.6215775749674055,
"grad_norm": 6.690873146057129,
"learning_rate": 9.882769401654719e-06,
"loss": 0.5153,
"step": 19900
},
{
"epoch": 1.6297262059973925,
"grad_norm": 6.8720808029174805,
"learning_rate": 9.879688029486496e-06,
"loss": 0.5288,
"step": 20000
},
{
"epoch": 1.6378748370273795,
"grad_norm": 9.76561164855957,
"learning_rate": 9.876567177440645e-06,
"loss": 0.509,
"step": 20100
},
{
"epoch": 1.6460234680573662,
"grad_norm": 12.810523986816406,
"learning_rate": 9.873406870766906e-06,
"loss": 0.5144,
"step": 20200
},
{
"epoch": 1.6541720990873534,
"grad_norm": 6.44625997543335,
"learning_rate": 9.870207135034235e-06,
"loss": 0.5237,
"step": 20300
},
{
"epoch": 1.6623207301173402,
"grad_norm": 9.6302490234375,
"learning_rate": 9.86696799613059e-06,
"loss": 0.5094,
"step": 20400
},
{
"epoch": 1.6704693611473274,
"grad_norm": 10.308381080627441,
"learning_rate": 9.863689480262734e-06,
"loss": 0.498,
"step": 20500
},
{
"epoch": 1.6786179921773141,
"grad_norm": 11.594625473022461,
"learning_rate": 9.860371613956008e-06,
"loss": 0.5224,
"step": 20600
},
{
"epoch": 1.6867666232073013,
"grad_norm": 7.823093414306641,
"learning_rate": 9.85701442405413e-06,
"loss": 0.515,
"step": 20700
},
{
"epoch": 1.694915254237288,
"grad_norm": 6.978199481964111,
"learning_rate": 9.853617937718966e-06,
"loss": 0.5103,
"step": 20800
},
{
"epoch": 1.7030638852672753,
"grad_norm": 9.50684928894043,
"learning_rate": 9.850182182430322e-06,
"loss": 0.4876,
"step": 20900
},
{
"epoch": 1.711212516297262,
"grad_norm": 9.167742729187012,
"learning_rate": 9.84670718598571e-06,
"loss": 0.521,
"step": 21000
},
{
"epoch": 1.719361147327249,
"grad_norm": 9.103960990905762,
"learning_rate": 9.843192976500131e-06,
"loss": 0.4987,
"step": 21100
},
{
"epoch": 1.727509778357236,
"grad_norm": 7.777735233306885,
"learning_rate": 9.83963958240585e-06,
"loss": 0.4838,
"step": 21200
},
{
"epoch": 1.735658409387223,
"grad_norm": 3.7518503665924072,
"learning_rate": 9.83604703245215e-06,
"loss": 0.5019,
"step": 21300
},
{
"epoch": 1.74380704041721,
"grad_norm": 8.239873886108398,
"learning_rate": 9.832415355705118e-06,
"loss": 0.5119,
"step": 21400
},
{
"epoch": 1.7519556714471969,
"grad_norm": 7.265876293182373,
"learning_rate": 9.828744581547407e-06,
"loss": 0.4681,
"step": 21500
},
{
"epoch": 1.7601043024771839,
"grad_norm": 9.064807891845703,
"learning_rate": 9.825034739677984e-06,
"loss": 0.4737,
"step": 21600
},
{
"epoch": 1.7682529335071708,
"grad_norm": 6.92955207824707,
"learning_rate": 9.821285860111903e-06,
"loss": 0.4968,
"step": 21700
},
{
"epoch": 1.7764015645371578,
"grad_norm": 10.282632827758789,
"learning_rate": 9.817497973180062e-06,
"loss": 0.4986,
"step": 21800
},
{
"epoch": 1.7845501955671446,
"grad_norm": 5.6930084228515625,
"learning_rate": 9.813671109528949e-06,
"loss": 0.5135,
"step": 21900
},
{
"epoch": 1.7926988265971318,
"grad_norm": 6.911000728607178,
"learning_rate": 9.809805300120403e-06,
"loss": 0.5046,
"step": 22000
},
{
"epoch": 1.8008474576271185,
"grad_norm": 6.411030292510986,
"learning_rate": 9.805900576231358e-06,
"loss": 0.4926,
"step": 22100
},
{
"epoch": 1.8089960886571057,
"grad_norm": 6.620294570922852,
"learning_rate": 9.801956969453592e-06,
"loss": 0.4788,
"step": 22200
},
{
"epoch": 1.8171447196870925,
"grad_norm": 6.77543830871582,
"learning_rate": 9.797974511693471e-06,
"loss": 0.4896,
"step": 22300
},
{
"epoch": 1.8252933507170797,
"grad_norm": 7.471630573272705,
"learning_rate": 9.793953235171694e-06,
"loss": 0.4979,
"step": 22400
},
{
"epoch": 1.8334419817470664,
"grad_norm": 6.550878524780273,
"learning_rate": 9.789893172423021e-06,
"loss": 0.5081,
"step": 22500
},
{
"epoch": 1.8415906127770536,
"grad_norm": 9.887825965881348,
"learning_rate": 9.78579435629603e-06,
"loss": 0.5089,
"step": 22600
},
{
"epoch": 1.8497392438070404,
"grad_norm": 7.088003158569336,
"learning_rate": 9.781656819952826e-06,
"loss": 0.4811,
"step": 22700
},
{
"epoch": 1.8578878748370273,
"grad_norm": 6.524052619934082,
"learning_rate": 9.777480596868796e-06,
"loss": 0.5018,
"step": 22800
},
{
"epoch": 1.8660365058670143,
"grad_norm": 7.965360164642334,
"learning_rate": 9.773265720832324e-06,
"loss": 0.5144,
"step": 22900
},
{
"epoch": 1.8741851368970013,
"grad_norm": 7.510045051574707,
"learning_rate": 9.769012225944521e-06,
"loss": 0.5002,
"step": 23000
},
{
"epoch": 1.8823337679269883,
"grad_norm": 11.717968940734863,
"learning_rate": 9.764720146618955e-06,
"loss": 0.5003,
"step": 23100
},
{
"epoch": 1.8904823989569752,
"grad_norm": 5.974288463592529,
"learning_rate": 9.760389517581362e-06,
"loss": 0.4912,
"step": 23200
},
{
"epoch": 1.8986310299869622,
"grad_norm": 5.159633159637451,
"learning_rate": 9.75602037386937e-06,
"loss": 0.4861,
"step": 23300
},
{
"epoch": 1.9067796610169492,
"grad_norm": 6.651115417480469,
"learning_rate": 9.75161275083222e-06,
"loss": 0.5153,
"step": 23400
},
{
"epoch": 1.9149282920469362,
"grad_norm": 7.513479709625244,
"learning_rate": 9.747166684130474e-06,
"loss": 0.4931,
"step": 23500
},
{
"epoch": 1.9230769230769231,
"grad_norm": 8.77505874633789,
"learning_rate": 9.742682209735727e-06,
"loss": 0.501,
"step": 23600
},
{
"epoch": 1.93122555410691,
"grad_norm": 6.932135581970215,
"learning_rate": 9.738159363930324e-06,
"loss": 0.52,
"step": 23700
},
{
"epoch": 1.9393741851368969,
"grad_norm": 10.359477996826172,
"learning_rate": 9.73359818330705e-06,
"loss": 0.4877,
"step": 23800
},
{
"epoch": 1.947522816166884,
"grad_norm": 8.781031608581543,
"learning_rate": 9.72899870476885e-06,
"loss": 0.4891,
"step": 23900
},
{
"epoch": 1.9556714471968708,
"grad_norm": 8.263874053955078,
"learning_rate": 9.724360965528523e-06,
"loss": 0.5061,
"step": 24000
},
{
"epoch": 1.963820078226858,
"grad_norm": 7.766465663909912,
"learning_rate": 9.719685003108423e-06,
"loss": 0.4902,
"step": 24100
},
{
"epoch": 1.9719687092568448,
"grad_norm": 4.978456974029541,
"learning_rate": 9.714970855340152e-06,
"loss": 0.4873,
"step": 24200
},
{
"epoch": 1.980117340286832,
"grad_norm": 7.918380260467529,
"learning_rate": 9.71021856036426e-06,
"loss": 0.4941,
"step": 24300
},
{
"epoch": 1.9882659713168187,
"grad_norm": 8.015583038330078,
"learning_rate": 9.705428156629933e-06,
"loss": 0.4833,
"step": 24400
},
{
"epoch": 1.996414602346806,
"grad_norm": 7.768013954162598,
"learning_rate": 9.700599682894675e-06,
"loss": 0.4932,
"step": 24500
},
{
"epoch": 2.0,
"eval_accuracy": 0.8008032128514057,
"eval_loss": 0.5220404267311096,
"eval_runtime": 6.9482,
"eval_samples_per_second": 358.366,
"eval_steps_per_second": 44.904,
"step": 24544
},
{
"epoch": 2.0045632333767927,
"grad_norm": 7.617489337921143,
"learning_rate": 9.695733178224009e-06,
"loss": 0.4491,
"step": 24600
},
{
"epoch": 2.01271186440678,
"grad_norm": 8.741541862487793,
"learning_rate": 9.690828681991153e-06,
"loss": 0.4068,
"step": 24700
},
{
"epoch": 2.0208604954367666,
"grad_norm": 11.999881744384766,
"learning_rate": 9.685886233876696e-06,
"loss": 0.4138,
"step": 24800
},
{
"epoch": 2.029009126466754,
"grad_norm": 9.766683578491211,
"learning_rate": 9.680905873868287e-06,
"loss": 0.3986,
"step": 24900
},
{
"epoch": 2.0371577574967406,
"grad_norm": 6.533343315124512,
"learning_rate": 9.675887642260306e-06,
"loss": 0.4024,
"step": 25000
},
{
"epoch": 2.0453063885267273,
"grad_norm": 9.137768745422363,
"learning_rate": 9.670831579653539e-06,
"loss": 0.4436,
"step": 25100
},
{
"epoch": 2.0534550195567145,
"grad_norm": 9.635496139526367,
"learning_rate": 9.665737726954852e-06,
"loss": 0.4019,
"step": 25200
},
{
"epoch": 2.0616036505867013,
"grad_norm": 7.93952751159668,
"learning_rate": 9.66060612537685e-06,
"loss": 0.4221,
"step": 25300
},
{
"epoch": 2.0697522816166884,
"grad_norm": 9.508652687072754,
"learning_rate": 9.65543681643756e-06,
"loss": 0.4221,
"step": 25400
},
{
"epoch": 2.077900912646675,
"grad_norm": 9.136526107788086,
"learning_rate": 9.650229841960084e-06,
"loss": 0.4239,
"step": 25500
},
{
"epoch": 2.0860495436766624,
"grad_norm": 11.71844482421875,
"learning_rate": 9.644985244072258e-06,
"loss": 0.4047,
"step": 25600
},
{
"epoch": 2.094198174706649,
"grad_norm": 4.190426826477051,
"learning_rate": 9.639703065206323e-06,
"loss": 0.4209,
"step": 25700
},
{
"epoch": 2.1023468057366363,
"grad_norm": 11.736051559448242,
"learning_rate": 9.63438334809857e-06,
"loss": 0.4086,
"step": 25800
},
{
"epoch": 2.110495436766623,
"grad_norm": 7.024579048156738,
"learning_rate": 9.629026135789002e-06,
"loss": 0.4346,
"step": 25900
},
{
"epoch": 2.1186440677966103,
"grad_norm": 10.942073822021484,
"learning_rate": 9.62363147162098e-06,
"loss": 0.4242,
"step": 26000
},
{
"epoch": 2.126792698826597,
"grad_norm": 12.155450820922852,
"learning_rate": 9.618199399240876e-06,
"loss": 0.4706,
"step": 26100
},
{
"epoch": 2.1349413298565842,
"grad_norm": 6.733283519744873,
"learning_rate": 9.612729962597721e-06,
"loss": 0.4406,
"step": 26200
},
{
"epoch": 2.143089960886571,
"grad_norm": 7.309271335601807,
"learning_rate": 9.607223205942845e-06,
"loss": 0.4169,
"step": 26300
},
{
"epoch": 2.151238591916558,
"grad_norm": 7.154285907745361,
"learning_rate": 9.601679173829522e-06,
"loss": 0.4406,
"step": 26400
},
{
"epoch": 2.159387222946545,
"grad_norm": 8.043559074401855,
"learning_rate": 9.596097911112609e-06,
"loss": 0.4264,
"step": 26500
},
{
"epoch": 2.167535853976532,
"grad_norm": 9.203978538513184,
"learning_rate": 9.590479462948185e-06,
"loss": 0.4173,
"step": 26600
},
{
"epoch": 2.175684485006519,
"grad_norm": 7.716718673706055,
"learning_rate": 9.58482387479318e-06,
"loss": 0.412,
"step": 26700
},
{
"epoch": 2.1838331160365057,
"grad_norm": 10.910019874572754,
"learning_rate": 9.57913119240501e-06,
"loss": 0.3844,
"step": 26800
},
{
"epoch": 2.191981747066493,
"grad_norm": 7.980166435241699,
"learning_rate": 9.573401461841218e-06,
"loss": 0.4441,
"step": 26900
},
{
"epoch": 2.2001303780964796,
"grad_norm": 7.328435897827148,
"learning_rate": 9.567634729459076e-06,
"loss": 0.4118,
"step": 27000
},
{
"epoch": 2.208279009126467,
"grad_norm": 7.026157379150391,
"learning_rate": 9.561831041915238e-06,
"loss": 0.4258,
"step": 27100
},
{
"epoch": 2.2164276401564535,
"grad_norm": 10.100348472595215,
"learning_rate": 9.555990446165339e-06,
"loss": 0.4368,
"step": 27200
},
{
"epoch": 2.2245762711864407,
"grad_norm": 11.21714973449707,
"learning_rate": 9.550112989463633e-06,
"loss": 0.4253,
"step": 27300
},
{
"epoch": 2.2327249022164275,
"grad_norm": 7.6962127685546875,
"learning_rate": 9.5441987193626e-06,
"loss": 0.4273,
"step": 27400
},
{
"epoch": 2.2408735332464147,
"grad_norm": 13.219654083251953,
"learning_rate": 9.538247683712567e-06,
"loss": 0.4369,
"step": 27500
},
{
"epoch": 2.2490221642764014,
"grad_norm": 5.536248683929443,
"learning_rate": 9.532259930661315e-06,
"loss": 0.4493,
"step": 27600
},
{
"epoch": 2.2571707953063886,
"grad_norm": 7.112065315246582,
"learning_rate": 9.526235508653694e-06,
"loss": 0.4325,
"step": 27700
},
{
"epoch": 2.2653194263363754,
"grad_norm": 6.064886093139648,
"learning_rate": 9.520174466431235e-06,
"loss": 0.4353,
"step": 27800
},
{
"epoch": 2.2734680573663626,
"grad_norm": 7.9532318115234375,
"learning_rate": 9.51407685303174e-06,
"loss": 0.4358,
"step": 27900
},
{
"epoch": 2.2816166883963493,
"grad_norm": 8.64626693725586,
"learning_rate": 9.507942717788907e-06,
"loss": 0.4489,
"step": 28000
},
{
"epoch": 2.2897653194263365,
"grad_norm": 9.648942947387695,
"learning_rate": 9.50177211033191e-06,
"loss": 0.4498,
"step": 28100
},
{
"epoch": 2.2979139504563233,
"grad_norm": 7.498199939727783,
"learning_rate": 9.495565080585017e-06,
"loss": 0.4086,
"step": 28200
},
{
"epoch": 2.3060625814863105,
"grad_norm": 8.632119178771973,
"learning_rate": 9.489321678767167e-06,
"loss": 0.4207,
"step": 28300
},
{
"epoch": 2.3142112125162972,
"grad_norm": 8.807448387145996,
"learning_rate": 9.48304195539158e-06,
"loss": 0.428,
"step": 28400
},
{
"epoch": 2.322359843546284,
"grad_norm": 7.809271812438965,
"learning_rate": 9.476725961265332e-06,
"loss": 0.4546,
"step": 28500
},
{
"epoch": 2.330508474576271,
"grad_norm": 8.758193969726562,
"learning_rate": 9.470373747488966e-06,
"loss": 0.432,
"step": 28600
},
{
"epoch": 2.3386571056062584,
"grad_norm": 8.046852111816406,
"learning_rate": 9.463985365456057e-06,
"loss": 0.4169,
"step": 28700
},
{
"epoch": 2.346805736636245,
"grad_norm": 12.665115356445312,
"learning_rate": 9.457560866852805e-06,
"loss": 0.4242,
"step": 28800
},
{
"epoch": 2.354954367666232,
"grad_norm": 10.333826065063477,
"learning_rate": 9.45110030365762e-06,
"loss": 0.4603,
"step": 28900
},
{
"epoch": 2.363102998696219,
"grad_norm": 8.857953071594238,
"learning_rate": 9.444603728140698e-06,
"loss": 0.454,
"step": 29000
},
{
"epoch": 2.371251629726206,
"grad_norm": 9.135393142700195,
"learning_rate": 9.438071192863596e-06,
"loss": 0.4574,
"step": 29100
},
{
"epoch": 2.379400260756193,
"grad_norm": 6.3214921951293945,
"learning_rate": 9.43150275067881e-06,
"loss": 0.4299,
"step": 29200
},
{
"epoch": 2.38754889178618,
"grad_norm": 7.322382926940918,
"learning_rate": 9.42489845472935e-06,
"loss": 0.4265,
"step": 29300
},
{
"epoch": 2.395697522816167,
"grad_norm": 11.1491060256958,
"learning_rate": 9.418258358448298e-06,
"loss": 0.4233,
"step": 29400
},
{
"epoch": 2.4038461538461537,
"grad_norm": 7.171163082122803,
"learning_rate": 9.411582515558391e-06,
"loss": 0.4271,
"step": 29500
},
{
"epoch": 2.411994784876141,
"grad_norm": 5.758033275604248,
"learning_rate": 9.404870980071579e-06,
"loss": 0.4463,
"step": 29600
},
{
"epoch": 2.4201434159061277,
"grad_norm": 11.001411437988281,
"learning_rate": 9.398123806288588e-06,
"loss": 0.42,
"step": 29700
},
{
"epoch": 2.428292046936115,
"grad_norm": 6.28535795211792,
"learning_rate": 9.39134104879848e-06,
"loss": 0.4188,
"step": 29800
},
{
"epoch": 2.4364406779661016,
"grad_norm": 6.2432861328125,
"learning_rate": 9.38452276247821e-06,
"loss": 0.4242,
"step": 29900
},
{
"epoch": 2.444589308996089,
"grad_norm": 9.474976539611816,
"learning_rate": 9.377669002492193e-06,
"loss": 0.43,
"step": 30000
},
{
"epoch": 2.4527379400260756,
"grad_norm": 7.984436988830566,
"learning_rate": 9.37077982429184e-06,
"loss": 0.4328,
"step": 30100
},
{
"epoch": 2.4608865710560623,
"grad_norm": 8.237207412719727,
"learning_rate": 9.363855283615124e-06,
"loss": 0.4166,
"step": 30200
},
{
"epoch": 2.4690352020860495,
"grad_norm": 7.6592936515808105,
"learning_rate": 9.356895436486122e-06,
"loss": 0.4253,
"step": 30300
},
{
"epoch": 2.4771838331160367,
"grad_norm": 5.206706523895264,
"learning_rate": 9.349900339214564e-06,
"loss": 0.4414,
"step": 30400
},
{
"epoch": 2.4853324641460235,
"grad_norm": 10.161866188049316,
"learning_rate": 9.342870048395376e-06,
"loss": 0.415,
"step": 30500
},
{
"epoch": 2.4934810951760102,
"grad_norm": 4.225031852722168,
"learning_rate": 9.335804620908222e-06,
"loss": 0.4243,
"step": 30600
},
{
"epoch": 2.5016297262059974,
"grad_norm": 7.489659786224365,
"learning_rate": 9.328704113917046e-06,
"loss": 0.4417,
"step": 30700
},
{
"epoch": 2.509778357235984,
"grad_norm": 8.180109977722168,
"learning_rate": 9.32156858486961e-06,
"loss": 0.4217,
"step": 30800
},
{
"epoch": 2.5179269882659714,
"grad_norm": 9.16032886505127,
"learning_rate": 9.314398091497024e-06,
"loss": 0.4297,
"step": 30900
},
{
"epoch": 2.526075619295958,
"grad_norm": 8.16234302520752,
"learning_rate": 9.307192691813285e-06,
"loss": 0.4319,
"step": 31000
},
{
"epoch": 2.5342242503259453,
"grad_norm": 10.111699104309082,
"learning_rate": 9.299952444114802e-06,
"loss": 0.4186,
"step": 31100
},
{
"epoch": 2.542372881355932,
"grad_norm": 6.305666923522949,
"learning_rate": 9.29267740697993e-06,
"loss": 0.4382,
"step": 31200
},
{
"epoch": 2.5505215123859193,
"grad_norm": 9.985565185546875,
"learning_rate": 9.285367639268492e-06,
"loss": 0.4272,
"step": 31300
},
{
"epoch": 2.558670143415906,
"grad_norm": 10.670126914978027,
"learning_rate": 9.278023200121305e-06,
"loss": 0.4228,
"step": 31400
},
{
"epoch": 2.5668187744458932,
"grad_norm": 7.42661714553833,
"learning_rate": 9.2706441489597e-06,
"loss": 0.4314,
"step": 31500
},
{
"epoch": 2.57496740547588,
"grad_norm": 6.457535266876221,
"learning_rate": 9.263230545485044e-06,
"loss": 0.4401,
"step": 31600
},
{
"epoch": 2.583116036505867,
"grad_norm": 11.822875022888184,
"learning_rate": 9.25578244967825e-06,
"loss": 0.3865,
"step": 31700
},
{
"epoch": 2.591264667535854,
"grad_norm": 12.4473295211792,
"learning_rate": 9.2482999217993e-06,
"loss": 0.4272,
"step": 31800
},
{
"epoch": 2.5994132985658407,
"grad_norm": 5.283376693725586,
"learning_rate": 9.240783022386757e-06,
"loss": 0.4084,
"step": 31900
},
{
"epoch": 2.607561929595828,
"grad_norm": 8.190621376037598,
"learning_rate": 9.233231812257266e-06,
"loss": 0.4257,
"step": 32000
},
{
"epoch": 2.615710560625815,
"grad_norm": 6.570192813873291,
"learning_rate": 9.225646352505071e-06,
"loss": 0.4464,
"step": 32100
},
{
"epoch": 2.623859191655802,
"grad_norm": 10.470175743103027,
"learning_rate": 9.218026704501519e-06,
"loss": 0.4245,
"step": 32200
},
{
"epoch": 2.6320078226857886,
"grad_norm": 7.662964820861816,
"learning_rate": 9.210372929894561e-06,
"loss": 0.4265,
"step": 32300
},
{
"epoch": 2.640156453715776,
"grad_norm": 7.74278450012207,
"learning_rate": 9.202685090608256e-06,
"loss": 0.4293,
"step": 32400
},
{
"epoch": 2.648305084745763,
"grad_norm": 6.661880970001221,
"learning_rate": 9.194963248842266e-06,
"loss": 0.4592,
"step": 32500
},
{
"epoch": 2.6564537157757497,
"grad_norm": 8.020112991333008,
"learning_rate": 9.18720746707136e-06,
"loss": 0.4229,
"step": 32600
},
{
"epoch": 2.6646023468057365,
"grad_norm": 5.921052932739258,
"learning_rate": 9.179417808044897e-06,
"loss": 0.4141,
"step": 32700
},
{
"epoch": 2.6727509778357237,
"grad_norm": 10.444842338562012,
"learning_rate": 9.17159433478633e-06,
"loss": 0.4437,
"step": 32800
},
{
"epoch": 2.6808996088657104,
"grad_norm": 7.524814605712891,
"learning_rate": 9.163737110592697e-06,
"loss": 0.4128,
"step": 32900
},
{
"epoch": 2.6890482398956976,
"grad_norm": 10.936373710632324,
"learning_rate": 9.155846199034086e-06,
"loss": 0.4273,
"step": 33000
},
{
"epoch": 2.6971968709256844,
"grad_norm": 7.02941370010376,
"learning_rate": 9.147921663953157e-06,
"loss": 0.4433,
"step": 33100
},
{
"epoch": 2.7053455019556716,
"grad_norm": 10.595579147338867,
"learning_rate": 9.139963569464593e-06,
"loss": 0.4264,
"step": 33200
},
{
"epoch": 2.7134941329856583,
"grad_norm": 5.312283992767334,
"learning_rate": 9.131971979954603e-06,
"loss": 0.4149,
"step": 33300
},
{
"epoch": 2.7216427640156455,
"grad_norm": 7.464469909667969,
"learning_rate": 9.123946960080387e-06,
"loss": 0.4368,
"step": 33400
},
{
"epoch": 2.7297913950456323,
"grad_norm": 7.507636547088623,
"learning_rate": 9.115888574769623e-06,
"loss": 0.4344,
"step": 33500
},
{
"epoch": 2.737940026075619,
"grad_norm": 7.984206676483154,
"learning_rate": 9.107796889219933e-06,
"loss": 0.4165,
"step": 33600
},
{
"epoch": 2.7460886571056062,
"grad_norm": 9.600481986999512,
"learning_rate": 9.099671968898362e-06,
"loss": 0.4212,
"step": 33700
},
{
"epoch": 2.7542372881355934,
"grad_norm": 6.417558670043945,
"learning_rate": 9.091513879540845e-06,
"loss": 0.41,
"step": 33800
},
{
"epoch": 2.76238591916558,
"grad_norm": 7.52598762512207,
"learning_rate": 9.08332268715168e-06,
"loss": 0.4443,
"step": 33900
},
{
"epoch": 2.770534550195567,
"grad_norm": 8.766283988952637,
"learning_rate": 9.075098458002988e-06,
"loss": 0.4552,
"step": 34000
},
{
"epoch": 2.778683181225554,
"grad_norm": 7.127804756164551,
"learning_rate": 9.066841258634177e-06,
"loss": 0.426,
"step": 34100
},
{
"epoch": 2.7868318122555413,
"grad_norm": 8.190874099731445,
"learning_rate": 9.058551155851405e-06,
"loss": 0.4374,
"step": 34200
},
{
"epoch": 2.794980443285528,
"grad_norm": 7.887624740600586,
"learning_rate": 9.050228216727046e-06,
"loss": 0.437,
"step": 34300
},
{
"epoch": 2.803129074315515,
"grad_norm": 10.439249038696289,
"learning_rate": 9.041872508599136e-06,
"loss": 0.4165,
"step": 34400
},
{
"epoch": 2.811277705345502,
"grad_norm": 9.891864776611328,
"learning_rate": 9.033484099070839e-06,
"loss": 0.4336,
"step": 34500
},
{
"epoch": 2.819426336375489,
"grad_norm": 10.03987979888916,
"learning_rate": 9.025063056009886e-06,
"loss": 0.4365,
"step": 34600
},
{
"epoch": 2.827574967405476,
"grad_norm": 6.188653469085693,
"learning_rate": 9.016609447548046e-06,
"loss": 0.41,
"step": 34700
},
{
"epoch": 2.8357235984354627,
"grad_norm": 11.486917495727539,
"learning_rate": 9.008123342080553e-06,
"loss": 0.4343,
"step": 34800
},
{
"epoch": 2.84387222946545,
"grad_norm": 9.972556114196777,
"learning_rate": 8.99960480826557e-06,
"loss": 0.4282,
"step": 34900
},
{
"epoch": 2.8520208604954367,
"grad_norm": 7.771157741546631,
"learning_rate": 8.991053915023625e-06,
"loss": 0.4086,
"step": 35000
},
{
"epoch": 2.860169491525424,
"grad_norm": 5.989213943481445,
"learning_rate": 8.982470731537054e-06,
"loss": 0.4647,
"step": 35100
},
{
"epoch": 2.8683181225554106,
"grad_norm": 7.19948148727417,
"learning_rate": 8.973855327249442e-06,
"loss": 0.4086,
"step": 35200
},
{
"epoch": 2.8764667535853974,
"grad_norm": 7.22706937789917,
"learning_rate": 8.965207771865061e-06,
"loss": 0.4225,
"step": 35300
},
{
"epoch": 2.8846153846153846,
"grad_norm": 11.344962120056152,
"learning_rate": 8.95652813534831e-06,
"loss": 0.4275,
"step": 35400
},
{
"epoch": 2.8927640156453718,
"grad_norm": 10.637499809265137,
"learning_rate": 8.947816487923143e-06,
"loss": 0.4347,
"step": 35500
},
{
"epoch": 2.9009126466753585,
"grad_norm": 7.946286678314209,
"learning_rate": 8.939072900072501e-06,
"loss": 0.4218,
"step": 35600
},
{
"epoch": 2.9090612777053453,
"grad_norm": 6.058999061584473,
"learning_rate": 8.930297442537747e-06,
"loss": 0.4212,
"step": 35700
},
{
"epoch": 2.9172099087353325,
"grad_norm": 10.35421371459961,
"learning_rate": 8.921490186318092e-06,
"loss": 0.4028,
"step": 35800
},
{
"epoch": 2.9253585397653197,
"grad_norm": 8.85345458984375,
"learning_rate": 8.912651202670013e-06,
"loss": 0.4455,
"step": 35900
},
{
"epoch": 2.9335071707953064,
"grad_norm": 7.476600646972656,
"learning_rate": 8.90378056310669e-06,
"loss": 0.4212,
"step": 36000
},
{
"epoch": 2.941655801825293,
"grad_norm": 8.27695369720459,
"learning_rate": 8.894878339397416e-06,
"loss": 0.4186,
"step": 36100
},
{
"epoch": 2.9498044328552804,
"grad_norm": 8.344620704650879,
"learning_rate": 8.885944603567023e-06,
"loss": 0.4242,
"step": 36200
},
{
"epoch": 2.957953063885267,
"grad_norm": 8.976387023925781,
"learning_rate": 8.876979427895291e-06,
"loss": 0.4359,
"step": 36300
},
{
"epoch": 2.9661016949152543,
"grad_norm": 10.581543922424316,
"learning_rate": 8.867982884916377e-06,
"loss": 0.4171,
"step": 36400
},
{
"epoch": 2.974250325945241,
"grad_norm": 6.423446178436279,
"learning_rate": 8.858955047418217e-06,
"loss": 0.4248,
"step": 36500
},
{
"epoch": 2.9823989569752283,
"grad_norm": 6.647116184234619,
"learning_rate": 8.849895988441933e-06,
"loss": 0.4272,
"step": 36600
},
{
"epoch": 2.990547588005215,
"grad_norm": 11.199699401855469,
"learning_rate": 8.840805781281261e-06,
"loss": 0.4336,
"step": 36700
},
{
"epoch": 2.9986962190352022,
"grad_norm": 6.946083068847656,
"learning_rate": 8.831684499481941e-06,
"loss": 0.4278,
"step": 36800
},
{
"epoch": 3.0,
"eval_accuracy": 0.8088353413654619,
"eval_loss": 0.5133101940155029,
"eval_runtime": 6.8742,
"eval_samples_per_second": 362.222,
"eval_steps_per_second": 45.387,
"step": 36816
},
{
"epoch": 3.006844850065189,
"grad_norm": 8.117693901062012,
"learning_rate": 8.822532216841124e-06,
"loss": 0.3563,
"step": 36900
},
{
"epoch": 3.014993481095176,
"grad_norm": 8.939483642578125,
"learning_rate": 8.813349007406785e-06,
"loss": 0.3693,
"step": 37000
},
{
"epoch": 3.023142112125163,
"grad_norm": 5.619213104248047,
"learning_rate": 8.80413494547711e-06,
"loss": 0.359,
"step": 37100
},
{
"epoch": 3.03129074315515,
"grad_norm": 7.458463191986084,
"learning_rate": 8.794890105599905e-06,
"loss": 0.3631,
"step": 37200
},
{
"epoch": 3.039439374185137,
"grad_norm": 8.206454277038574,
"learning_rate": 8.785614562571991e-06,
"loss": 0.3513,
"step": 37300
},
{
"epoch": 3.047588005215124,
"grad_norm": 8.663100242614746,
"learning_rate": 8.776308391438597e-06,
"loss": 0.3348,
"step": 37400
},
{
"epoch": 3.055736636245111,
"grad_norm": 8.638208389282227,
"learning_rate": 8.766971667492754e-06,
"loss": 0.3618,
"step": 37500
},
{
"epoch": 3.0638852672750976,
"grad_norm": 8.416321754455566,
"learning_rate": 8.757604466274683e-06,
"loss": 0.3671,
"step": 37600
},
{
"epoch": 3.0720338983050848,
"grad_norm": 10.002084732055664,
"learning_rate": 8.748206863571188e-06,
"loss": 0.3462,
"step": 37700
},
{
"epoch": 3.0801825293350715,
"grad_norm": 8.242202758789062,
"learning_rate": 8.73877893541504e-06,
"loss": 0.3524,
"step": 37800
},
{
"epoch": 3.0883311603650587,
"grad_norm": 9.762850761413574,
"learning_rate": 8.729320758084363e-06,
"loss": 0.3844,
"step": 37900
},
{
"epoch": 3.0964797913950455,
"grad_norm": 13.008197784423828,
"learning_rate": 8.719832408102017e-06,
"loss": 0.3489,
"step": 38000
},
{
"epoch": 3.1046284224250327,
"grad_norm": 9.61468505859375,
"learning_rate": 8.71031396223498e-06,
"loss": 0.3386,
"step": 38100
},
{
"epoch": 3.1127770534550194,
"grad_norm": 9.158555030822754,
"learning_rate": 8.700765497493723e-06,
"loss": 0.3542,
"step": 38200
},
{
"epoch": 3.1209256844850066,
"grad_norm": 11.94726276397705,
"learning_rate": 8.69118709113159e-06,
"loss": 0.3591,
"step": 38300
},
{
"epoch": 3.1290743155149934,
"grad_norm": 9.813300132751465,
"learning_rate": 8.681578820644173e-06,
"loss": 0.3625,
"step": 38400
},
{
"epoch": 3.1372229465449806,
"grad_norm": 8.50658130645752,
"learning_rate": 8.671940763768682e-06,
"loss": 0.3789,
"step": 38500
},
{
"epoch": 3.1453715775749673,
"grad_norm": 6.037990570068359,
"learning_rate": 8.662272998483323e-06,
"loss": 0.3635,
"step": 38600
},
{
"epoch": 3.1535202086049545,
"grad_norm": 11.817001342773438,
"learning_rate": 8.65257560300666e-06,
"loss": 0.3526,
"step": 38700
},
{
"epoch": 3.1616688396349413,
"grad_norm": 4.690389156341553,
"learning_rate": 8.642848655796985e-06,
"loss": 0.3634,
"step": 38800
},
{
"epoch": 3.1698174706649285,
"grad_norm": 12.257222175598145,
"learning_rate": 8.633092235551679e-06,
"loss": 0.3626,
"step": 38900
},
{
"epoch": 3.1779661016949152,
"grad_norm": 7.710871696472168,
"learning_rate": 8.623306421206588e-06,
"loss": 0.3571,
"step": 39000
},
{
"epoch": 3.1861147327249024,
"grad_norm": 6.811945915222168,
"learning_rate": 8.613491291935365e-06,
"loss": 0.351,
"step": 39100
},
{
"epoch": 3.194263363754889,
"grad_norm": 19.7229061126709,
"learning_rate": 8.60364692714885e-06,
"loss": 0.3348,
"step": 39200
},
{
"epoch": 3.2024119947848764,
"grad_norm": 9.32421875,
"learning_rate": 8.59377340649441e-06,
"loss": 0.3437,
"step": 39300
},
{
"epoch": 3.210560625814863,
"grad_norm": 9.309675216674805,
"learning_rate": 8.583870809855306e-06,
"loss": 0.3687,
"step": 39400
},
{
"epoch": 3.21870925684485,
"grad_norm": 5.458558559417725,
"learning_rate": 8.573939217350043e-06,
"loss": 0.3584,
"step": 39500
},
{
"epoch": 3.226857887874837,
"grad_norm": 8.717120170593262,
"learning_rate": 8.563978709331717e-06,
"loss": 0.3473,
"step": 39600
},
{
"epoch": 3.235006518904824,
"grad_norm": 6.542947769165039,
"learning_rate": 8.553989366387376e-06,
"loss": 0.3806,
"step": 39700
},
{
"epoch": 3.243155149934811,
"grad_norm": 11.504007339477539,
"learning_rate": 8.543971269337355e-06,
"loss": 0.3606,
"step": 39800
},
{
"epoch": 3.2513037809647978,
"grad_norm": 9.393417358398438,
"learning_rate": 8.533924499234633e-06,
"loss": 0.3532,
"step": 39900
},
{
"epoch": 3.259452411994785,
"grad_norm": 8.129273414611816,
"learning_rate": 8.523849137364175e-06,
"loss": 0.3473,
"step": 40000
},
{
"epoch": 3.2676010430247717,
"grad_norm": 12.241875648498535,
"learning_rate": 8.513745265242263e-06,
"loss": 0.3576,
"step": 40100
},
{
"epoch": 3.275749674054759,
"grad_norm": 9.895030975341797,
"learning_rate": 8.503612964615858e-06,
"loss": 0.3458,
"step": 40200
},
{
"epoch": 3.2838983050847457,
"grad_norm": 5.42219877243042,
"learning_rate": 8.493452317461914e-06,
"loss": 0.3772,
"step": 40300
},
{
"epoch": 3.292046936114733,
"grad_norm": 8.165868759155273,
"learning_rate": 8.483263405986735e-06,
"loss": 0.3561,
"step": 40400
},
{
"epoch": 3.3001955671447196,
"grad_norm": 13.24457836151123,
"learning_rate": 8.4730463126253e-06,
"loss": 0.3587,
"step": 40500
},
{
"epoch": 3.308344198174707,
"grad_norm": 12.287585258483887,
"learning_rate": 8.462801120040595e-06,
"loss": 0.3432,
"step": 40600
},
{
"epoch": 3.3164928292046936,
"grad_norm": 8.932402610778809,
"learning_rate": 8.452527911122953e-06,
"loss": 0.3696,
"step": 40700
},
{
"epoch": 3.3246414602346808,
"grad_norm": 8.847443580627441,
"learning_rate": 8.442226768989373e-06,
"loss": 0.362,
"step": 40800
},
{
"epoch": 3.3327900912646675,
"grad_norm": 13.20019245147705,
"learning_rate": 8.431897776982851e-06,
"loss": 0.3543,
"step": 40900
},
{
"epoch": 3.3409387222946547,
"grad_norm": 8.375232696533203,
"learning_rate": 8.421541018671712e-06,
"loss": 0.3741,
"step": 41000
},
{
"epoch": 3.3490873533246415,
"grad_norm": 7.601521968841553,
"learning_rate": 8.411156577848927e-06,
"loss": 0.3518,
"step": 41100
},
{
"epoch": 3.3572359843546282,
"grad_norm": 5.853700637817383,
"learning_rate": 8.400744538531431e-06,
"loss": 0.3556,
"step": 41200
},
{
"epoch": 3.3653846153846154,
"grad_norm": 15.7562837600708,
"learning_rate": 8.390304984959455e-06,
"loss": 0.3591,
"step": 41300
},
{
"epoch": 3.373533246414602,
"grad_norm": 7.048288822174072,
"learning_rate": 8.379838001595837e-06,
"loss": 0.3774,
"step": 41400
},
{
"epoch": 3.3816818774445894,
"grad_norm": 8.532382011413574,
"learning_rate": 8.369343673125339e-06,
"loss": 0.3482,
"step": 41500
},
{
"epoch": 3.389830508474576,
"grad_norm": 5.468735218048096,
"learning_rate": 8.358822084453964e-06,
"loss": 0.3637,
"step": 41600
},
{
"epoch": 3.3979791395045633,
"grad_norm": 7.324248313903809,
"learning_rate": 8.348273320708269e-06,
"loss": 0.365,
"step": 41700
},
{
"epoch": 3.40612777053455,
"grad_norm": 8.06946849822998,
"learning_rate": 8.33769746723467e-06,
"loss": 0.3661,
"step": 41800
},
{
"epoch": 3.4142764015645373,
"grad_norm": 11.85434341430664,
"learning_rate": 8.32709460959876e-06,
"loss": 0.3542,
"step": 41900
},
{
"epoch": 3.422425032594524,
"grad_norm": 8.629081726074219,
"learning_rate": 8.316464833584618e-06,
"loss": 0.3476,
"step": 42000
},
{
"epoch": 3.430573663624511,
"grad_norm": 7.888760566711426,
"learning_rate": 8.305808225194103e-06,
"loss": 0.3752,
"step": 42100
},
{
"epoch": 3.438722294654498,
"grad_norm": 8.756083488464355,
"learning_rate": 8.295124870646168e-06,
"loss": 0.359,
"step": 42200
},
{
"epoch": 3.446870925684485,
"grad_norm": 8.682005882263184,
"learning_rate": 8.284414856376161e-06,
"loss": 0.3607,
"step": 42300
},
{
"epoch": 3.455019556714472,
"grad_norm": 14.85304069519043,
"learning_rate": 8.273678269035126e-06,
"loss": 0.3417,
"step": 42400
},
{
"epoch": 3.463168187744459,
"grad_norm": 10.479057312011719,
"learning_rate": 8.262915195489097e-06,
"loss": 0.3571,
"step": 42500
},
{
"epoch": 3.471316818774446,
"grad_norm": 9.107665061950684,
"learning_rate": 8.2521257228184e-06,
"loss": 0.3655,
"step": 42600
},
{
"epoch": 3.479465449804433,
"grad_norm": 10.01933765411377,
"learning_rate": 8.241309938316947e-06,
"loss": 0.363,
"step": 42700
},
{
"epoch": 3.48761408083442,
"grad_norm": 7.9999189376831055,
"learning_rate": 8.230467929491533e-06,
"loss": 0.3753,
"step": 42800
},
{
"epoch": 3.4957627118644066,
"grad_norm": 9.211396217346191,
"learning_rate": 8.219599784061124e-06,
"loss": 0.3389,
"step": 42900
},
{
"epoch": 3.5039113428943938,
"grad_norm": 9.140076637268066,
"learning_rate": 8.20870558995614e-06,
"loss": 0.3683,
"step": 43000
},
{
"epoch": 3.512059973924381,
"grad_norm": 9.534440040588379,
"learning_rate": 8.197785435317766e-06,
"loss": 0.3585,
"step": 43100
},
{
"epoch": 3.5202086049543677,
"grad_norm": 10.818157196044922,
"learning_rate": 8.186839408497213e-06,
"loss": 0.3546,
"step": 43200
},
{
"epoch": 3.5283572359843545,
"grad_norm": 11.54218578338623,
"learning_rate": 8.175867598055021e-06,
"loss": 0.3818,
"step": 43300
},
{
"epoch": 3.5365058670143417,
"grad_norm": 10.037505149841309,
"learning_rate": 8.164870092760336e-06,
"loss": 0.347,
"step": 43400
},
{
"epoch": 3.5446544980443284,
"grad_norm": 11.143013000488281,
"learning_rate": 8.153846981590191e-06,
"loss": 0.3633,
"step": 43500
},
{
"epoch": 3.5528031290743156,
"grad_norm": 9.558606147766113,
"learning_rate": 8.142798353728786e-06,
"loss": 0.373,
"step": 43600
},
{
"epoch": 3.5609517601043024,
"grad_norm": 13.201570510864258,
"learning_rate": 8.131724298566767e-06,
"loss": 0.3611,
"step": 43700
},
{
"epoch": 3.5691003911342896,
"grad_norm": 10.490971565246582,
"learning_rate": 8.120624905700511e-06,
"loss": 0.3292,
"step": 43800
},
{
"epoch": 3.5772490221642763,
"grad_norm": 3.778831958770752,
"learning_rate": 8.109500264931387e-06,
"loss": 0.3731,
"step": 43900
},
{
"epoch": 3.5853976531942635,
"grad_norm": 10.723892211914062,
"learning_rate": 8.098350466265034e-06,
"loss": 0.3783,
"step": 44000
},
{
"epoch": 3.5935462842242503,
"grad_norm": 9.849285125732422,
"learning_rate": 8.087175599910642e-06,
"loss": 0.337,
"step": 44100
},
{
"epoch": 3.601694915254237,
"grad_norm": 11.700067520141602,
"learning_rate": 8.07597575628021e-06,
"loss": 0.3639,
"step": 44200
},
{
"epoch": 3.609843546284224,
"grad_norm": 37.506065368652344,
"learning_rate": 8.064751025987822e-06,
"loss": 0.3644,
"step": 44300
},
{
"epoch": 3.6179921773142114,
"grad_norm": 9.770977973937988,
"learning_rate": 8.053501499848907e-06,
"loss": 0.3838,
"step": 44400
},
{
"epoch": 3.626140808344198,
"grad_norm": 14.631871223449707,
"learning_rate": 8.042227268879516e-06,
"loss": 0.3732,
"step": 44500
},
{
"epoch": 3.634289439374185,
"grad_norm": 7.656193256378174,
"learning_rate": 8.030928424295572e-06,
"loss": 0.358,
"step": 44600
},
{
"epoch": 3.642438070404172,
"grad_norm": 9.974722862243652,
"learning_rate": 8.019605057512144e-06,
"loss": 0.3588,
"step": 44700
},
{
"epoch": 3.6505867014341593,
"grad_norm": 12.311222076416016,
"learning_rate": 8.008257260142693e-06,
"loss": 0.362,
"step": 44800
},
{
"epoch": 3.658735332464146,
"grad_norm": 11.374334335327148,
"learning_rate": 7.99688512399835e-06,
"loss": 0.385,
"step": 44900
},
{
"epoch": 3.666883963494133,
"grad_norm": 7.951153755187988,
"learning_rate": 7.985488741087153e-06,
"loss": 0.352,
"step": 45000
},
{
"epoch": 3.67503259452412,
"grad_norm": 5.6287384033203125,
"learning_rate": 7.97406820361332e-06,
"loss": 0.3763,
"step": 45100
},
{
"epoch": 3.6831812255541068,
"grad_norm": 9.33438777923584,
"learning_rate": 7.962623603976491e-06,
"loss": 0.3852,
"step": 45200
},
{
"epoch": 3.691329856584094,
"grad_norm": 12.365875244140625,
"learning_rate": 7.951155034770983e-06,
"loss": 0.3775,
"step": 45300
},
{
"epoch": 3.6994784876140807,
"grad_norm": 9.91942024230957,
"learning_rate": 7.93966258878505e-06,
"loss": 0.3678,
"step": 45400
},
{
"epoch": 3.707627118644068,
"grad_norm": 9.160215377807617,
"learning_rate": 7.928146359000117e-06,
"loss": 0.36,
"step": 45500
},
{
"epoch": 3.7157757496740547,
"grad_norm": 11.565260887145996,
"learning_rate": 7.91660643859004e-06,
"loss": 0.3531,
"step": 45600
},
{
"epoch": 3.723924380704042,
"grad_norm": 4.027003765106201,
"learning_rate": 7.905042920920344e-06,
"loss": 0.3722,
"step": 45700
},
{
"epoch": 3.7320730117340286,
"grad_norm": 13.809627532958984,
"learning_rate": 7.893455899547476e-06,
"loss": 0.3524,
"step": 45800
},
{
"epoch": 3.740221642764016,
"grad_norm": 13.452054023742676,
"learning_rate": 7.881845468218039e-06,
"loss": 0.375,
"step": 45900
},
{
"epoch": 3.7483702737940026,
"grad_norm": 9.63260269165039,
"learning_rate": 7.87021172086804e-06,
"loss": 0.3636,
"step": 46000
},
{
"epoch": 3.7565189048239898,
"grad_norm": 8.539379119873047,
"learning_rate": 7.85855475162213e-06,
"loss": 0.3687,
"step": 46100
},
{
"epoch": 3.7646675358539765,
"grad_norm": 7.635307788848877,
"learning_rate": 7.846874654792835e-06,
"loss": 0.3709,
"step": 46200
},
{
"epoch": 3.7728161668839633,
"grad_norm": 8.707938194274902,
"learning_rate": 7.835171524879805e-06,
"loss": 0.3466,
"step": 46300
},
{
"epoch": 3.7809647979139505,
"grad_norm": 6.248547077178955,
"learning_rate": 7.823445456569036e-06,
"loss": 0.3706,
"step": 46400
},
{
"epoch": 3.7891134289439377,
"grad_norm": 11.434155464172363,
"learning_rate": 7.811696544732115e-06,
"loss": 0.3907,
"step": 46500
},
{
"epoch": 3.7972620599739244,
"grad_norm": 5.250894546508789,
"learning_rate": 7.799924884425447e-06,
"loss": 0.377,
"step": 46600
},
{
"epoch": 3.805410691003911,
"grad_norm": 6.875328063964844,
"learning_rate": 7.788130570889488e-06,
"loss": 0.3569,
"step": 46700
},
{
"epoch": 3.8135593220338984,
"grad_norm": 8.773159980773926,
"learning_rate": 7.776313699547971e-06,
"loss": 0.3635,
"step": 46800
},
{
"epoch": 3.821707953063885,
"grad_norm": 4.8134002685546875,
"learning_rate": 7.764474366007138e-06,
"loss": 0.345,
"step": 46900
},
{
"epoch": 3.8298565840938723,
"grad_norm": 6.085391998291016,
"learning_rate": 7.752612666054963e-06,
"loss": 0.3699,
"step": 47000
},
{
"epoch": 3.838005215123859,
"grad_norm": 8.958887100219727,
"learning_rate": 7.740728695660389e-06,
"loss": 0.3407,
"step": 47100
},
{
"epoch": 3.8461538461538463,
"grad_norm": 6.2288994789123535,
"learning_rate": 7.728822550972523e-06,
"loss": 0.3633,
"step": 47200
},
{
"epoch": 3.854302477183833,
"grad_norm": 9.540541648864746,
"learning_rate": 7.716894328319893e-06,
"loss": 0.3476,
"step": 47300
},
{
"epoch": 3.86245110821382,
"grad_norm": 5.929731369018555,
"learning_rate": 7.704944124209645e-06,
"loss": 0.3929,
"step": 47400
},
{
"epoch": 3.870599739243807,
"grad_norm": 7.797017574310303,
"learning_rate": 7.692972035326772e-06,
"loss": 0.3728,
"step": 47500
},
{
"epoch": 3.878748370273794,
"grad_norm": 14.781734466552734,
"learning_rate": 7.680978158533324e-06,
"loss": 0.3546,
"step": 47600
},
{
"epoch": 3.886897001303781,
"grad_norm": 10.41878890991211,
"learning_rate": 7.668962590867636e-06,
"loss": 0.3603,
"step": 47700
},
{
"epoch": 3.895045632333768,
"grad_norm": 8.300308227539062,
"learning_rate": 7.656925429543531e-06,
"loss": 0.3546,
"step": 47800
},
{
"epoch": 3.903194263363755,
"grad_norm": 9.709467887878418,
"learning_rate": 7.644866771949544e-06,
"loss": 0.3575,
"step": 47900
},
{
"epoch": 3.9113428943937416,
"grad_norm": 7.606164455413818,
"learning_rate": 7.632786715648128e-06,
"loss": 0.3658,
"step": 48000
},
{
"epoch": 3.919491525423729,
"grad_norm": 11.461851119995117,
"learning_rate": 7.62068535837486e-06,
"loss": 0.3653,
"step": 48100
},
{
"epoch": 3.927640156453716,
"grad_norm": 11.35883617401123,
"learning_rate": 7.608562798037662e-06,
"loss": 0.3672,
"step": 48200
},
{
"epoch": 3.9357887874837028,
"grad_norm": 9.994701385498047,
"learning_rate": 7.596419132715997e-06,
"loss": 0.3601,
"step": 48300
},
{
"epoch": 3.9439374185136895,
"grad_norm": 12.242551803588867,
"learning_rate": 7.584254460660092e-06,
"loss": 0.3552,
"step": 48400
},
{
"epoch": 3.9520860495436767,
"grad_norm": 11.628976821899414,
"learning_rate": 7.572068880290118e-06,
"loss": 0.3644,
"step": 48500
},
{
"epoch": 3.960234680573664,
"grad_norm": 9.713350296020508,
"learning_rate": 7.559862490195418e-06,
"loss": 0.3463,
"step": 48600
},
{
"epoch": 3.9683833116036507,
"grad_norm": 5.648345470428467,
"learning_rate": 7.547635389133694e-06,
"loss": 0.3483,
"step": 48700
},
{
"epoch": 3.9765319426336374,
"grad_norm": 15.131999015808105,
"learning_rate": 7.535387676030222e-06,
"loss": 0.366,
"step": 48800
},
{
"epoch": 3.9846805736636246,
"grad_norm": 8.72270393371582,
"learning_rate": 7.523119449977028e-06,
"loss": 0.3567,
"step": 48900
},
{
"epoch": 3.9928292046936114,
"grad_norm": 10.733074188232422,
"learning_rate": 7.510830810232112e-06,
"loss": 0.37,
"step": 49000
},
{
"epoch": 4.0,
"eval_accuracy": 0.8188755020080322,
"eval_loss": 0.529120922088623,
"eval_runtime": 6.8942,
"eval_samples_per_second": 361.175,
"eval_steps_per_second": 45.256,
"step": 49088
},
{
"epoch": 4.0009778357235986,
"grad_norm": 7.13838529586792,
"learning_rate": 7.498521856218637e-06,
"loss": 0.355,
"step": 49100
},
{
"epoch": 4.009126466753585,
"grad_norm": 5.439541816711426,
"learning_rate": 7.486192687524112e-06,
"loss": 0.3005,
"step": 49200
},
{
"epoch": 4.017275097783572,
"grad_norm": 17.687950134277344,
"learning_rate": 7.4738434038996e-06,
"loss": 0.2864,
"step": 49300
},
{
"epoch": 4.02542372881356,
"grad_norm": 11.162871360778809,
"learning_rate": 7.461474105258911e-06,
"loss": 0.3025,
"step": 49400
},
{
"epoch": 4.0335723598435465,
"grad_norm": 9.104811668395996,
"learning_rate": 7.449084891677785e-06,
"loss": 0.2846,
"step": 49500
},
{
"epoch": 4.041720990873533,
"grad_norm": 11.716981887817383,
"learning_rate": 7.436675863393086e-06,
"loss": 0.2984,
"step": 49600
},
{
"epoch": 4.04986962190352,
"grad_norm": 6.521731376647949,
"learning_rate": 7.424247120801997e-06,
"loss": 0.2979,
"step": 49700
},
{
"epoch": 4.058018252933508,
"grad_norm": 6.5696539878845215,
"learning_rate": 7.4117987644611985e-06,
"loss": 0.2898,
"step": 49800
},
{
"epoch": 4.066166883963494,
"grad_norm": 9.98416805267334,
"learning_rate": 7.399330895086061e-06,
"loss": 0.3115,
"step": 49900
},
{
"epoch": 4.074315514993481,
"grad_norm": 6.788928985595703,
"learning_rate": 7.386843613549828e-06,
"loss": 0.3158,
"step": 50000
},
{
"epoch": 4.082464146023468,
"grad_norm": 9.002969741821289,
"learning_rate": 7.374337020882798e-06,
"loss": 0.2964,
"step": 50100
},
{
"epoch": 4.090612777053455,
"grad_norm": 8.216889381408691,
"learning_rate": 7.3618112182715115e-06,
"loss": 0.3194,
"step": 50200
},
{
"epoch": 4.098761408083442,
"grad_norm": 17.576051712036133,
"learning_rate": 7.349266307057932e-06,
"loss": 0.3093,
"step": 50300
},
{
"epoch": 4.106910039113429,
"grad_norm": 14.113720893859863,
"learning_rate": 7.336702388738619e-06,
"loss": 0.2656,
"step": 50400
},
{
"epoch": 4.115058670143416,
"grad_norm": 13.906309127807617,
"learning_rate": 7.324119564963915e-06,
"loss": 0.2977,
"step": 50500
},
{
"epoch": 4.1232073011734025,
"grad_norm": 9.152776718139648,
"learning_rate": 7.311517937537122e-06,
"loss": 0.3067,
"step": 50600
},
{
"epoch": 4.13135593220339,
"grad_norm": 10.242730140686035,
"learning_rate": 7.29889760841367e-06,
"loss": 0.301,
"step": 50700
},
{
"epoch": 4.139504563233377,
"grad_norm": 11.567678451538086,
"learning_rate": 7.2862586797003046e-06,
"loss": 0.2997,
"step": 50800
},
{
"epoch": 4.147653194263364,
"grad_norm": 6.842143535614014,
"learning_rate": 7.27360125365425e-06,
"loss": 0.3004,
"step": 50900
},
{
"epoch": 4.15580182529335,
"grad_norm": 12.490499496459961,
"learning_rate": 7.260925432682386e-06,
"loss": 0.2959,
"step": 51000
},
{
"epoch": 4.163950456323338,
"grad_norm": 7.078547477722168,
"learning_rate": 7.248231319340422e-06,
"loss": 0.2966,
"step": 51100
},
{
"epoch": 4.172099087353325,
"grad_norm": 17.07299041748047,
"learning_rate": 7.235519016332064e-06,
"loss": 0.3241,
"step": 51200
},
{
"epoch": 4.1802477183833116,
"grad_norm": 14.579496383666992,
"learning_rate": 7.222788626508184e-06,
"loss": 0.294,
"step": 51300
},
{
"epoch": 4.188396349413298,
"grad_norm": 16.198028564453125,
"learning_rate": 7.210040252865984e-06,
"loss": 0.3049,
"step": 51400
},
{
"epoch": 4.196544980443286,
"grad_norm": 12.001542091369629,
"learning_rate": 7.197273998548174e-06,
"loss": 0.2932,
"step": 51500
},
{
"epoch": 4.204693611473273,
"grad_norm": 8.593428611755371,
"learning_rate": 7.184489966842128e-06,
"loss": 0.3147,
"step": 51600
},
{
"epoch": 4.2128422425032594,
"grad_norm": 19.119985580444336,
"learning_rate": 7.1716882611790475e-06,
"loss": 0.2929,
"step": 51700
},
{
"epoch": 4.220990873533246,
"grad_norm": 12.756973266601562,
"learning_rate": 7.1588689851331305e-06,
"loss": 0.2973,
"step": 51800
},
{
"epoch": 4.229139504563233,
"grad_norm": 11.550286293029785,
"learning_rate": 7.146032242420732e-06,
"loss": 0.2996,
"step": 51900
},
{
"epoch": 4.237288135593221,
"grad_norm": 8.533171653747559,
"learning_rate": 7.133178136899522e-06,
"loss": 0.3094,
"step": 52000
},
{
"epoch": 4.245436766623207,
"grad_norm": 11.978692054748535,
"learning_rate": 7.120306772567647e-06,
"loss": 0.3013,
"step": 52100
},
{
"epoch": 4.253585397653194,
"grad_norm": 10.963492393493652,
"learning_rate": 7.107418253562889e-06,
"loss": 0.3081,
"step": 52200
},
{
"epoch": 4.261734028683181,
"grad_norm": 11.645411491394043,
"learning_rate": 7.0945126841618225e-06,
"loss": 0.2867,
"step": 52300
},
{
"epoch": 4.2698826597131685,
"grad_norm": 11.48385238647461,
"learning_rate": 7.081590168778973e-06,
"loss": 0.3088,
"step": 52400
},
{
"epoch": 4.278031290743155,
"grad_norm": 10.083149909973145,
"learning_rate": 7.068650811965967e-06,
"loss": 0.2954,
"step": 52500
},
{
"epoch": 4.286179921773142,
"grad_norm": 10.841811180114746,
"learning_rate": 7.055694718410688e-06,
"loss": 0.2944,
"step": 52600
},
{
"epoch": 4.294328552803129,
"grad_norm": 12.332331657409668,
"learning_rate": 7.042721992936438e-06,
"loss": 0.2857,
"step": 52700
},
{
"epoch": 4.302477183833116,
"grad_norm": 13.689620971679688,
"learning_rate": 7.029732740501073e-06,
"loss": 0.3024,
"step": 52800
},
{
"epoch": 4.310625814863103,
"grad_norm": 13.064624786376953,
"learning_rate": 7.016727066196168e-06,
"loss": 0.2917,
"step": 52900
},
{
"epoch": 4.31877444589309,
"grad_norm": 8.214381217956543,
"learning_rate": 7.003705075246163e-06,
"loss": 0.3173,
"step": 53000
},
{
"epoch": 4.326923076923077,
"grad_norm": 14.797425270080566,
"learning_rate": 6.990666873007506e-06,
"loss": 0.2734,
"step": 53100
},
{
"epoch": 4.335071707953064,
"grad_norm": 10.985969543457031,
"learning_rate": 6.977612564967808e-06,
"loss": 0.2958,
"step": 53200
},
{
"epoch": 4.343220338983051,
"grad_norm": 12.808884620666504,
"learning_rate": 6.964542256744986e-06,
"loss": 0.3169,
"step": 53300
},
{
"epoch": 4.351368970013038,
"grad_norm": 21.643781661987305,
"learning_rate": 6.9514560540864095e-06,
"loss": 0.3154,
"step": 53400
},
{
"epoch": 4.3595176010430245,
"grad_norm": 7.609200477600098,
"learning_rate": 6.938354062868041e-06,
"loss": 0.2985,
"step": 53500
},
{
"epoch": 4.367666232073011,
"grad_norm": 13.469466209411621,
"learning_rate": 6.925236389093588e-06,
"loss": 0.3063,
"step": 53600
},
{
"epoch": 4.375814863102999,
"grad_norm": 12.873883247375488,
"learning_rate": 6.912103138893636e-06,
"loss": 0.2903,
"step": 53700
},
{
"epoch": 4.383963494132986,
"grad_norm": 8.953607559204102,
"learning_rate": 6.898954418524797e-06,
"loss": 0.2897,
"step": 53800
},
{
"epoch": 4.3921121251629724,
"grad_norm": 21.484949111938477,
"learning_rate": 6.885790334368844e-06,
"loss": 0.2989,
"step": 53900
},
{
"epoch": 4.400260756192959,
"grad_norm": 8.624776840209961,
"learning_rate": 6.872610992931857e-06,
"loss": 0.2811,
"step": 54000
},
{
"epoch": 4.408409387222947,
"grad_norm": 13.120560646057129,
"learning_rate": 6.859416500843351e-06,
"loss": 0.3003,
"step": 54100
},
{
"epoch": 4.416558018252934,
"grad_norm": 8.616204261779785,
"learning_rate": 6.846206964855426e-06,
"loss": 0.3191,
"step": 54200
},
{
"epoch": 4.42470664928292,
"grad_norm": 7.0158233642578125,
"learning_rate": 6.832982491841894e-06,
"loss": 0.31,
"step": 54300
},
{
"epoch": 4.432855280312907,
"grad_norm": 9.716617584228516,
"learning_rate": 6.819743188797419e-06,
"loss": 0.2949,
"step": 54400
},
{
"epoch": 4.441003911342895,
"grad_norm": 10.602276802062988,
"learning_rate": 6.806489162836649e-06,
"loss": 0.3037,
"step": 54500
},
{
"epoch": 4.4491525423728815,
"grad_norm": 8.699592590332031,
"learning_rate": 6.793220521193347e-06,
"loss": 0.313,
"step": 54600
},
{
"epoch": 4.457301173402868,
"grad_norm": 8.307058334350586,
"learning_rate": 6.779937371219532e-06,
"loss": 0.2924,
"step": 54700
},
{
"epoch": 4.465449804432855,
"grad_norm": 10.045998573303223,
"learning_rate": 6.766639820384602e-06,
"loss": 0.3124,
"step": 54800
},
{
"epoch": 4.473598435462843,
"grad_norm": 15.478697776794434,
"learning_rate": 6.753327976274467e-06,
"loss": 0.2892,
"step": 54900
},
{
"epoch": 4.481747066492829,
"grad_norm": 12.46609878540039,
"learning_rate": 6.740001946590675e-06,
"loss": 0.2809,
"step": 55000
},
{
"epoch": 4.489895697522816,
"grad_norm": 11.292198181152344,
"learning_rate": 6.726661839149556e-06,
"loss": 0.2915,
"step": 55100
},
{
"epoch": 4.498044328552803,
"grad_norm": 15.23190689086914,
"learning_rate": 6.71330776188133e-06,
"loss": 0.306,
"step": 55200
},
{
"epoch": 4.5061929595827905,
"grad_norm": 11.232503890991211,
"learning_rate": 6.69993982282924e-06,
"loss": 0.2979,
"step": 55300
},
{
"epoch": 4.514341590612777,
"grad_norm": 11.436495780944824,
"learning_rate": 6.686558130148687e-06,
"loss": 0.2976,
"step": 55400
},
{
"epoch": 4.522490221642764,
"grad_norm": 11.90659236907959,
"learning_rate": 6.673162792106341e-06,
"loss": 0.3106,
"step": 55500
},
{
"epoch": 4.530638852672751,
"grad_norm": 9.979248046875,
"learning_rate": 6.6597539170792795e-06,
"loss": 0.2948,
"step": 55600
},
{
"epoch": 4.5387874837027375,
"grad_norm": 19.104442596435547,
"learning_rate": 6.646331613554094e-06,
"loss": 0.3248,
"step": 55700
},
{
"epoch": 4.546936114732725,
"grad_norm": 9.139418601989746,
"learning_rate": 6.632895990126028e-06,
"loss": 0.2996,
"step": 55800
},
{
"epoch": 4.555084745762712,
"grad_norm": 9.373650550842285,
"learning_rate": 6.619447155498091e-06,
"loss": 0.3127,
"step": 55900
},
{
"epoch": 4.563233376792699,
"grad_norm": 12.213810920715332,
"learning_rate": 6.605985218480179e-06,
"loss": 0.3113,
"step": 56000
},
{
"epoch": 4.5713820078226854,
"grad_norm": 9.15962028503418,
"learning_rate": 6.5925102879881915e-06,
"loss": 0.311,
"step": 56100
},
{
"epoch": 4.579530638852673,
"grad_norm": 11.712223052978516,
"learning_rate": 6.579022473043159e-06,
"loss": 0.3074,
"step": 56200
},
{
"epoch": 4.58767926988266,
"grad_norm": 9.559146881103516,
"learning_rate": 6.565521882770355e-06,
"loss": 0.3065,
"step": 56300
},
{
"epoch": 4.595827900912647,
"grad_norm": 8.07590389251709,
"learning_rate": 6.552008626398409e-06,
"loss": 0.3195,
"step": 56400
},
{
"epoch": 4.603976531942633,
"grad_norm": 13.063721656799316,
"learning_rate": 6.5384828132584335e-06,
"loss": 0.2778,
"step": 56500
},
{
"epoch": 4.612125162972621,
"grad_norm": 13.26430892944336,
"learning_rate": 6.524944552783129e-06,
"loss": 0.3081,
"step": 56600
},
{
"epoch": 4.620273794002608,
"grad_norm": 14.221997261047363,
"learning_rate": 6.511393954505906e-06,
"loss": 0.3072,
"step": 56700
},
{
"epoch": 4.6284224250325945,
"grad_norm": 10.34438705444336,
"learning_rate": 6.497831128059993e-06,
"loss": 0.3078,
"step": 56800
},
{
"epoch": 4.636571056062581,
"grad_norm": 15.65034294128418,
"learning_rate": 6.4842561831775575e-06,
"loss": 0.3035,
"step": 56900
},
{
"epoch": 4.644719687092568,
"grad_norm": 10.238895416259766,
"learning_rate": 6.470669229688809e-06,
"loss": 0.2962,
"step": 57000
},
{
"epoch": 4.652868318122556,
"grad_norm": 16.671092987060547,
"learning_rate": 6.457070377521111e-06,
"loss": 0.307,
"step": 57100
},
{
"epoch": 4.661016949152542,
"grad_norm": 11.118473052978516,
"learning_rate": 6.443459736698106e-06,
"loss": 0.3079,
"step": 57200
},
{
"epoch": 4.669165580182529,
"grad_norm": 7.511115550994873,
"learning_rate": 6.429837417338804e-06,
"loss": 0.2959,
"step": 57300
},
{
"epoch": 4.677314211212517,
"grad_norm": 14.2573881149292,
"learning_rate": 6.416203529656707e-06,
"loss": 0.2948,
"step": 57400
},
{
"epoch": 4.6854628422425035,
"grad_norm": 11.03162956237793,
"learning_rate": 6.40255818395891e-06,
"loss": 0.3095,
"step": 57500
},
{
"epoch": 4.69361147327249,
"grad_norm": 11.995973587036133,
"learning_rate": 6.388901490645214e-06,
"loss": 0.3099,
"step": 57600
},
{
"epoch": 4.701760104302477,
"grad_norm": 9.43193244934082,
"learning_rate": 6.375233560207229e-06,
"loss": 0.3276,
"step": 57700
},
{
"epoch": 4.709908735332464,
"grad_norm": 10.617565155029297,
"learning_rate": 6.361554503227475e-06,
"loss": 0.3149,
"step": 57800
},
{
"epoch": 4.718057366362451,
"grad_norm": 16.004545211791992,
"learning_rate": 6.347864430378501e-06,
"loss": 0.2907,
"step": 57900
},
{
"epoch": 4.726205997392438,
"grad_norm": 18.075027465820312,
"learning_rate": 6.334163452421978e-06,
"loss": 0.3168,
"step": 58000
},
{
"epoch": 4.734354628422425,
"grad_norm": 19.736661911010742,
"learning_rate": 6.320451680207805e-06,
"loss": 0.3077,
"step": 58100
},
{
"epoch": 4.742503259452412,
"grad_norm": 6.202484607696533,
"learning_rate": 6.306729224673217e-06,
"loss": 0.3022,
"step": 58200
},
{
"epoch": 4.750651890482399,
"grad_norm": 4.973538398742676,
"learning_rate": 6.29299619684188e-06,
"loss": 0.3032,
"step": 58300
},
{
"epoch": 4.758800521512386,
"grad_norm": 9.67834186553955,
"learning_rate": 6.2792527078230024e-06,
"loss": 0.2937,
"step": 58400
},
{
"epoch": 4.766949152542373,
"grad_norm": 7.5604777336120605,
"learning_rate": 6.265498868810424e-06,
"loss": 0.3132,
"step": 58500
},
{
"epoch": 4.77509778357236,
"grad_norm": 11.391521453857422,
"learning_rate": 6.251734791081728e-06,
"loss": 0.3249,
"step": 58600
},
{
"epoch": 4.783246414602347,
"grad_norm": 16.40961265563965,
"learning_rate": 6.237960585997334e-06,
"loss": 0.2951,
"step": 58700
},
{
"epoch": 4.791395045632334,
"grad_norm": 4.114518165588379,
"learning_rate": 6.224176364999595e-06,
"loss": 0.3091,
"step": 58800
},
{
"epoch": 4.799543676662321,
"grad_norm": 9.569024085998535,
"learning_rate": 6.210382239611906e-06,
"loss": 0.3093,
"step": 58900
},
{
"epoch": 4.8076923076923075,
"grad_norm": 30.753637313842773,
"learning_rate": 6.1965783214377895e-06,
"loss": 0.2982,
"step": 59000
},
{
"epoch": 4.815840938722294,
"grad_norm": 7.500620365142822,
"learning_rate": 6.18276472216e-06,
"loss": 0.2956,
"step": 59100
},
{
"epoch": 4.823989569752282,
"grad_norm": 14.710212707519531,
"learning_rate": 6.16894155353962e-06,
"loss": 0.3078,
"step": 59200
},
{
"epoch": 4.832138200782269,
"grad_norm": 7.550549507141113,
"learning_rate": 6.1551089274151525e-06,
"loss": 0.3093,
"step": 59300
},
{
"epoch": 4.840286831812255,
"grad_norm": 8.313648223876953,
"learning_rate": 6.141266955701616e-06,
"loss": 0.2872,
"step": 59400
},
{
"epoch": 4.848435462842242,
"grad_norm": 3.505223274230957,
"learning_rate": 6.127415750389645e-06,
"loss": 0.2991,
"step": 59500
},
{
"epoch": 4.85658409387223,
"grad_norm": 10.405817985534668,
"learning_rate": 6.113555423544576e-06,
"loss": 0.3083,
"step": 59600
},
{
"epoch": 4.8647327249022165,
"grad_norm": 9.818922996520996,
"learning_rate": 6.0996860873055505e-06,
"loss": 0.3131,
"step": 59700
},
{
"epoch": 4.872881355932203,
"grad_norm": 8.345934867858887,
"learning_rate": 6.085807853884595e-06,
"loss": 0.2963,
"step": 59800
},
{
"epoch": 4.88102998696219,
"grad_norm": 10.804642677307129,
"learning_rate": 6.071920835565724e-06,
"loss": 0.315,
"step": 59900
},
{
"epoch": 4.889178617992178,
"grad_norm": 10.550320625305176,
"learning_rate": 6.058025144704026e-06,
"loss": 0.288,
"step": 60000
},
{
"epoch": 4.897327249022164,
"grad_norm": 7.386425018310547,
"learning_rate": 6.044120893724758e-06,
"loss": 0.3175,
"step": 60100
},
{
"epoch": 4.905475880052151,
"grad_norm": 16.652528762817383,
"learning_rate": 6.030208195122433e-06,
"loss": 0.3218,
"step": 60200
},
{
"epoch": 4.913624511082138,
"grad_norm": 15.053431510925293,
"learning_rate": 6.016287161459907e-06,
"loss": 0.2769,
"step": 60300
},
{
"epoch": 4.921773142112125,
"grad_norm": 7.756086349487305,
"learning_rate": 6.002357905367481e-06,
"loss": 0.289,
"step": 60400
},
{
"epoch": 4.929921773142112,
"grad_norm": 10.426520347595215,
"learning_rate": 5.9884205395419725e-06,
"loss": 0.3169,
"step": 60500
},
{
"epoch": 4.938070404172099,
"grad_norm": 12.334880828857422,
"learning_rate": 5.974475176745813e-06,
"loss": 0.3093,
"step": 60600
},
{
"epoch": 4.946219035202086,
"grad_norm": 14.239689826965332,
"learning_rate": 5.960521929806141e-06,
"loss": 0.3036,
"step": 60700
},
{
"epoch": 4.9543676662320735,
"grad_norm": 12.593892097473145,
"learning_rate": 5.946560911613877e-06,
"loss": 0.2911,
"step": 60800
},
{
"epoch": 4.96251629726206,
"grad_norm": 4.950251579284668,
"learning_rate": 5.9325922351228186e-06,
"loss": 0.2942,
"step": 60900
},
{
"epoch": 4.970664928292047,
"grad_norm": 10.60743522644043,
"learning_rate": 5.918616013348719e-06,
"loss": 0.302,
"step": 61000
},
{
"epoch": 4.978813559322034,
"grad_norm": 18.459735870361328,
"learning_rate": 5.904632359368388e-06,
"loss": 0.2806,
"step": 61100
},
{
"epoch": 4.9869621903520205,
"grad_norm": 10.454113006591797,
"learning_rate": 5.890641386318756e-06,
"loss": 0.3009,
"step": 61200
},
{
"epoch": 4.995110821382008,
"grad_norm": 12.8052396774292,
"learning_rate": 5.876643207395976e-06,
"loss": 0.3122,
"step": 61300
},
{
"epoch": 5.0,
"eval_accuracy": 0.8285140562248996,
"eval_loss": 0.5347269773483276,
"eval_runtime": 6.815,
"eval_samples_per_second": 365.37,
"eval_steps_per_second": 45.781,
"step": 61360
},
{
"epoch": 5.003259452411995,
"grad_norm": 3.523259162902832,
"learning_rate": 5.862637935854502e-06,
"loss": 0.289,
"step": 61400
},
{
"epoch": 5.011408083441982,
"grad_norm": 14.498679161071777,
"learning_rate": 5.848625685006164e-06,
"loss": 0.2673,
"step": 61500
},
{
"epoch": 5.019556714471968,
"grad_norm": 15.165558815002441,
"learning_rate": 5.834606568219269e-06,
"loss": 0.2499,
"step": 61600
},
{
"epoch": 5.027705345501956,
"grad_norm": 12.705721855163574,
"learning_rate": 5.820580698917666e-06,
"loss": 0.2486,
"step": 61700
},
{
"epoch": 5.035853976531943,
"grad_norm": 15.987256050109863,
"learning_rate": 5.806548190579842e-06,
"loss": 0.2417,
"step": 61800
},
{
"epoch": 5.0440026075619295,
"grad_norm": 8.831116676330566,
"learning_rate": 5.792509156737997e-06,
"loss": 0.2265,
"step": 61900
},
{
"epoch": 5.052151238591916,
"grad_norm": 12.182964324951172,
"learning_rate": 5.7784637109771225e-06,
"loss": 0.2538,
"step": 62000
},
{
"epoch": 5.060299869621904,
"grad_norm": 10.809981346130371,
"learning_rate": 5.764411966934092e-06,
"loss": 0.2603,
"step": 62100
},
{
"epoch": 5.068448500651891,
"grad_norm": 5.705296039581299,
"learning_rate": 5.750354038296733e-06,
"loss": 0.2438,
"step": 62200
},
{
"epoch": 5.076597131681877,
"grad_norm": 9.95255184173584,
"learning_rate": 5.736290038802911e-06,
"loss": 0.234,
"step": 62300
},
{
"epoch": 5.084745762711864,
"grad_norm": 7.724064350128174,
"learning_rate": 5.722220082239608e-06,
"loss": 0.2488,
"step": 62400
},
{
"epoch": 5.092894393741851,
"grad_norm": 10.82822036743164,
"learning_rate": 5.708144282442006e-06,
"loss": 0.2591,
"step": 62500
},
{
"epoch": 5.101043024771839,
"grad_norm": 8.642077445983887,
"learning_rate": 5.694062753292559e-06,
"loss": 0.2581,
"step": 62600
},
{
"epoch": 5.109191655801825,
"grad_norm": 10.630475044250488,
"learning_rate": 5.679975608720078e-06,
"loss": 0.2408,
"step": 62700
},
{
"epoch": 5.117340286831812,
"grad_norm": 10.559286117553711,
"learning_rate": 5.665882962698801e-06,
"loss": 0.2417,
"step": 62800
},
{
"epoch": 5.125488917861799,
"grad_norm": 6.505354881286621,
"learning_rate": 5.651784929247486e-06,
"loss": 0.2517,
"step": 62900
},
{
"epoch": 5.1336375488917865,
"grad_norm": 10.710380554199219,
"learning_rate": 5.637681622428468e-06,
"loss": 0.235,
"step": 63000
},
{
"epoch": 5.141786179921773,
"grad_norm": 4.721646785736084,
"learning_rate": 5.6235731563467535e-06,
"loss": 0.2577,
"step": 63100
},
{
"epoch": 5.14993481095176,
"grad_norm": 8.588154792785645,
"learning_rate": 5.609459645149089e-06,
"loss": 0.2728,
"step": 63200
},
{
"epoch": 5.158083441981747,
"grad_norm": 19.248777389526367,
"learning_rate": 5.595341203023044e-06,
"loss": 0.2371,
"step": 63300
},
{
"epoch": 5.166232073011734,
"grad_norm": 8.45293140411377,
"learning_rate": 5.581217944196071e-06,
"loss": 0.264,
"step": 63400
},
{
"epoch": 5.174380704041721,
"grad_norm": 8.333393096923828,
"learning_rate": 5.567089982934605e-06,
"loss": 0.2558,
"step": 63500
},
{
"epoch": 5.182529335071708,
"grad_norm": 14.054290771484375,
"learning_rate": 5.552957433543119e-06,
"loss": 0.2524,
"step": 63600
},
{
"epoch": 5.190677966101695,
"grad_norm": 12.668076515197754,
"learning_rate": 5.538820410363214e-06,
"loss": 0.2408,
"step": 63700
},
{
"epoch": 5.198826597131682,
"grad_norm": 9.344785690307617,
"learning_rate": 5.524679027772676e-06,
"loss": 0.2538,
"step": 63800
},
{
"epoch": 5.206975228161669,
"grad_norm": 9.552376747131348,
"learning_rate": 5.510533400184572e-06,
"loss": 0.2535,
"step": 63900
},
{
"epoch": 5.215123859191656,
"grad_norm": 10.270748138427734,
"learning_rate": 5.496383642046311e-06,
"loss": 0.2672,
"step": 64000
},
{
"epoch": 5.2232724902216425,
"grad_norm": 15.067427635192871,
"learning_rate": 5.4822298678387174e-06,
"loss": 0.2455,
"step": 64100
},
{
"epoch": 5.23142112125163,
"grad_norm": 5.5667948722839355,
"learning_rate": 5.468072192075111e-06,
"loss": 0.2539,
"step": 64200
},
{
"epoch": 5.239569752281617,
"grad_norm": 11.088788032531738,
"learning_rate": 5.453910729300378e-06,
"loss": 0.2523,
"step": 64300
},
{
"epoch": 5.247718383311604,
"grad_norm": 24.676876068115234,
"learning_rate": 5.439745594090042e-06,
"loss": 0.2488,
"step": 64400
},
{
"epoch": 5.25586701434159,
"grad_norm": 9.937374114990234,
"learning_rate": 5.425576901049342e-06,
"loss": 0.2575,
"step": 64500
},
{
"epoch": 5.264015645371577,
"grad_norm": 13.66021728515625,
"learning_rate": 5.411404764812299e-06,
"loss": 0.2396,
"step": 64600
},
{
"epoch": 5.272164276401565,
"grad_norm": 11.568852424621582,
"learning_rate": 5.3972293000407945e-06,
"loss": 0.2398,
"step": 64700
},
{
"epoch": 5.280312907431552,
"grad_norm": 9.292428970336914,
"learning_rate": 5.383050621423639e-06,
"loss": 0.2696,
"step": 64800
},
{
"epoch": 5.288461538461538,
"grad_norm": 21.01643180847168,
"learning_rate": 5.368868843675642e-06,
"loss": 0.2522,
"step": 64900
},
{
"epoch": 5.296610169491525,
"grad_norm": 7.557727813720703,
"learning_rate": 5.354684081536693e-06,
"loss": 0.2709,
"step": 65000
},
{
"epoch": 5.304758800521513,
"grad_norm": 7.703597545623779,
"learning_rate": 5.340496449770824e-06,
"loss": 0.2561,
"step": 65100
},
{
"epoch": 5.3129074315514995,
"grad_norm": 11.133892059326172,
"learning_rate": 5.3263060631652805e-06,
"loss": 0.2595,
"step": 65200
},
{
"epoch": 5.321056062581486,
"grad_norm": 15.144754409790039,
"learning_rate": 5.312113036529604e-06,
"loss": 0.2506,
"step": 65300
},
{
"epoch": 5.329204693611473,
"grad_norm": 7.959693431854248,
"learning_rate": 5.297917484694692e-06,
"loss": 0.2644,
"step": 65400
},
{
"epoch": 5.337353324641461,
"grad_norm": 15.450654029846191,
"learning_rate": 5.28371952251187e-06,
"loss": 0.2533,
"step": 65500
},
{
"epoch": 5.345501955671447,
"grad_norm": 6.035745620727539,
"learning_rate": 5.269519264851967e-06,
"loss": 0.2507,
"step": 65600
},
{
"epoch": 5.353650586701434,
"grad_norm": 8.266439437866211,
"learning_rate": 5.255316826604385e-06,
"loss": 0.2588,
"step": 65700
},
{
"epoch": 5.361799217731421,
"grad_norm": 9.542835235595703,
"learning_rate": 5.24111232267617e-06,
"loss": 0.2584,
"step": 65800
},
{
"epoch": 5.369947848761408,
"grad_norm": 11.433173179626465,
"learning_rate": 5.2269058679910735e-06,
"loss": 0.2451,
"step": 65900
},
{
"epoch": 5.378096479791395,
"grad_norm": 12.72153377532959,
"learning_rate": 5.212697577488635e-06,
"loss": 0.2496,
"step": 66000
},
{
"epoch": 5.386245110821382,
"grad_norm": 9.416111946105957,
"learning_rate": 5.1984875661232495e-06,
"loss": 0.2567,
"step": 66100
},
{
"epoch": 5.394393741851369,
"grad_norm": 15.701902389526367,
"learning_rate": 5.184275948863231e-06,
"loss": 0.2432,
"step": 66200
},
{
"epoch": 5.4025423728813555,
"grad_norm": 7.241784572601318,
"learning_rate": 5.1700628406898835e-06,
"loss": 0.2441,
"step": 66300
},
{
"epoch": 5.410691003911343,
"grad_norm": 21.102312088012695,
"learning_rate": 5.155848356596581e-06,
"loss": 0.2695,
"step": 66400
},
{
"epoch": 5.41883963494133,
"grad_norm": 12.834817886352539,
"learning_rate": 5.1416326115878255e-06,
"loss": 0.2705,
"step": 66500
},
{
"epoch": 5.426988265971317,
"grad_norm": 29.203624725341797,
"learning_rate": 5.127415720678319e-06,
"loss": 0.2354,
"step": 66600
},
{
"epoch": 5.435136897001303,
"grad_norm": 13.500927925109863,
"learning_rate": 5.113197798892038e-06,
"loss": 0.2508,
"step": 66700
},
{
"epoch": 5.443285528031291,
"grad_norm": 7.524002552032471,
"learning_rate": 5.098978961261296e-06,
"loss": 0.2494,
"step": 66800
},
{
"epoch": 5.451434159061278,
"grad_norm": 17.00074577331543,
"learning_rate": 5.084759322825821e-06,
"loss": 0.241,
"step": 66900
},
{
"epoch": 5.459582790091265,
"grad_norm": 11.755769729614258,
"learning_rate": 5.070538998631813e-06,
"loss": 0.2658,
"step": 67000
},
{
"epoch": 5.467731421121251,
"grad_norm": 13.64929485321045,
"learning_rate": 5.056318103731028e-06,
"loss": 0.2515,
"step": 67100
},
{
"epoch": 5.475880052151239,
"grad_norm": 6.673364639282227,
"learning_rate": 5.042096753179835e-06,
"loss": 0.2505,
"step": 67200
},
{
"epoch": 5.484028683181226,
"grad_norm": 6.764876365661621,
"learning_rate": 5.02787506203829e-06,
"loss": 0.2584,
"step": 67300
},
{
"epoch": 5.4921773142112125,
"grad_norm": 11.133795738220215,
"learning_rate": 5.013653145369204e-06,
"loss": 0.2598,
"step": 67400
},
{
"epoch": 5.500325945241199,
"grad_norm": 11.689901351928711,
"learning_rate": 4.9994311182372145e-06,
"loss": 0.2397,
"step": 67500
},
{
"epoch": 5.508474576271187,
"grad_norm": 18.084266662597656,
"learning_rate": 4.985209095707852e-06,
"loss": 0.265,
"step": 67600
},
{
"epoch": 5.516623207301174,
"grad_norm": 12.136353492736816,
"learning_rate": 4.970987192846609e-06,
"loss": 0.2372,
"step": 67700
},
{
"epoch": 5.52477183833116,
"grad_norm": 14.060345649719238,
"learning_rate": 4.95676552471801e-06,
"loss": 0.2657,
"step": 67800
},
{
"epoch": 5.532920469361147,
"grad_norm": 5.493065357208252,
"learning_rate": 4.942544206384682e-06,
"loss": 0.2377,
"step": 67900
},
{
"epoch": 5.541069100391134,
"grad_norm": 13.543553352355957,
"learning_rate": 4.928323352906421e-06,
"loss": 0.2456,
"step": 68000
},
{
"epoch": 5.5492177314211215,
"grad_norm": 12.011448860168457,
"learning_rate": 4.9141030793392595e-06,
"loss": 0.2695,
"step": 68100
},
{
"epoch": 5.557366362451108,
"grad_norm": 7.862688064575195,
"learning_rate": 4.899883500734542e-06,
"loss": 0.2668,
"step": 68200
},
{
"epoch": 5.565514993481095,
"grad_norm": 11.895374298095703,
"learning_rate": 4.885664732137988e-06,
"loss": 0.2581,
"step": 68300
},
{
"epoch": 5.573663624511082,
"grad_norm": 19.049335479736328,
"learning_rate": 4.871446888588762e-06,
"loss": 0.2581,
"step": 68400
},
{
"epoch": 5.581812255541069,
"grad_norm": 15.173524856567383,
"learning_rate": 4.85723008511855e-06,
"loss": 0.2374,
"step": 68500
},
{
"epoch": 5.589960886571056,
"grad_norm": 15.82532024383545,
"learning_rate": 4.84301443675062e-06,
"loss": 0.2548,
"step": 68600
},
{
"epoch": 5.598109517601043,
"grad_norm": 9.289793014526367,
"learning_rate": 4.828800058498889e-06,
"loss": 0.2585,
"step": 68700
},
{
"epoch": 5.60625814863103,
"grad_norm": 13.010422706604004,
"learning_rate": 4.814587065367009e-06,
"loss": 0.264,
"step": 68800
},
{
"epoch": 5.614406779661017,
"grad_norm": 10.556730270385742,
"learning_rate": 4.800375572347414e-06,
"loss": 0.2436,
"step": 68900
},
{
"epoch": 5.622555410691004,
"grad_norm": 13.723767280578613,
"learning_rate": 4.786165694420408e-06,
"loss": 0.2477,
"step": 69000
},
{
"epoch": 5.630704041720991,
"grad_norm": 11.722618103027344,
"learning_rate": 4.771957546553226e-06,
"loss": 0.2581,
"step": 69100
},
{
"epoch": 5.638852672750978,
"grad_norm": 10.373120307922363,
"learning_rate": 4.757751243699109e-06,
"loss": 0.2606,
"step": 69200
},
{
"epoch": 5.647001303780964,
"grad_norm": 15.857172966003418,
"learning_rate": 4.743546900796364e-06,
"loss": 0.2723,
"step": 69300
},
{
"epoch": 5.655149934810952,
"grad_norm": 22.450532913208008,
"learning_rate": 4.729344632767446e-06,
"loss": 0.235,
"step": 69400
},
{
"epoch": 5.663298565840939,
"grad_norm": 15.469109535217285,
"learning_rate": 4.71514455451802e-06,
"loss": 0.2455,
"step": 69500
},
{
"epoch": 5.6714471968709255,
"grad_norm": 21.650880813598633,
"learning_rate": 4.7009467809360375e-06,
"loss": 0.2597,
"step": 69600
},
{
"epoch": 5.679595827900913,
"grad_norm": 16.47661590576172,
"learning_rate": 4.6867514268907995e-06,
"loss": 0.2555,
"step": 69700
},
{
"epoch": 5.6877444589309,
"grad_norm": 16.370121002197266,
"learning_rate": 4.672558607232033e-06,
"loss": 0.2411,
"step": 69800
},
{
"epoch": 5.695893089960887,
"grad_norm": 10.867352485656738,
"learning_rate": 4.658368436788963e-06,
"loss": 0.2638,
"step": 69900
},
{
"epoch": 5.704041720990873,
"grad_norm": 13.257880210876465,
"learning_rate": 4.644181030369378e-06,
"loss": 0.233,
"step": 70000
},
{
"epoch": 5.71219035202086,
"grad_norm": 16.66828155517578,
"learning_rate": 4.629996502758703e-06,
"loss": 0.2549,
"step": 70100
},
{
"epoch": 5.720338983050848,
"grad_norm": 8.0230712890625,
"learning_rate": 4.615814968719071e-06,
"loss": 0.251,
"step": 70200
},
{
"epoch": 5.7284876140808345,
"grad_norm": 20.61688804626465,
"learning_rate": 4.6016365429884e-06,
"loss": 0.2617,
"step": 70300
},
{
"epoch": 5.736636245110821,
"grad_norm": 4.916039943695068,
"learning_rate": 4.587461340279457e-06,
"loss": 0.2772,
"step": 70400
},
{
"epoch": 5.744784876140808,
"grad_norm": 13.59726333618164,
"learning_rate": 4.573289475278927e-06,
"loss": 0.2654,
"step": 70500
},
{
"epoch": 5.752933507170796,
"grad_norm": 21.178253173828125,
"learning_rate": 4.559121062646499e-06,
"loss": 0.237,
"step": 70600
},
{
"epoch": 5.761082138200782,
"grad_norm": 15.958664894104004,
"learning_rate": 4.544956217013927e-06,
"loss": 0.2447,
"step": 70700
},
{
"epoch": 5.769230769230769,
"grad_norm": 7.610626220703125,
"learning_rate": 4.530795052984104e-06,
"loss": 0.239,
"step": 70800
},
{
"epoch": 5.777379400260756,
"grad_norm": 10.934889793395996,
"learning_rate": 4.5166376851301385e-06,
"loss": 0.2562,
"step": 70900
},
{
"epoch": 5.7855280312907436,
"grad_norm": 7.9625244140625,
"learning_rate": 4.502484227994426e-06,
"loss": 0.2606,
"step": 71000
},
{
"epoch": 5.79367666232073,
"grad_norm": 15.313315391540527,
"learning_rate": 4.488334796087719e-06,
"loss": 0.2454,
"step": 71100
},
{
"epoch": 5.801825293350717,
"grad_norm": 16.183135986328125,
"learning_rate": 4.474189503888207e-06,
"loss": 0.2591,
"step": 71200
},
{
"epoch": 5.809973924380704,
"grad_norm": 8.89918041229248,
"learning_rate": 4.4600484658405815e-06,
"loss": 0.2577,
"step": 71300
},
{
"epoch": 5.818122555410691,
"grad_norm": 8.31811237335205,
"learning_rate": 4.445911796355119e-06,
"loss": 0.2382,
"step": 71400
},
{
"epoch": 5.826271186440678,
"grad_norm": 9.141270637512207,
"learning_rate": 4.431779609806751e-06,
"loss": 0.2401,
"step": 71500
},
{
"epoch": 5.834419817470665,
"grad_norm": 8.92165756225586,
"learning_rate": 4.4176520205341365e-06,
"loss": 0.2133,
"step": 71600
},
{
"epoch": 5.842568448500652,
"grad_norm": 14.15666675567627,
"learning_rate": 4.403529142838745e-06,
"loss": 0.2536,
"step": 71700
},
{
"epoch": 5.8507170795306385,
"grad_norm": 8.742586135864258,
"learning_rate": 4.38941109098392e-06,
"loss": 0.261,
"step": 71800
},
{
"epoch": 5.858865710560626,
"grad_norm": 8.7103853225708,
"learning_rate": 4.375297979193965e-06,
"loss": 0.2331,
"step": 71900
},
{
"epoch": 5.867014341590613,
"grad_norm": 13.822142601013184,
"learning_rate": 4.361189921653215e-06,
"loss": 0.2583,
"step": 72000
},
{
"epoch": 5.8751629726206,
"grad_norm": 9.043753623962402,
"learning_rate": 4.3470870325051084e-06,
"loss": 0.2635,
"step": 72100
},
{
"epoch": 5.883311603650586,
"grad_norm": 10.288004875183105,
"learning_rate": 4.332989425851273e-06,
"loss": 0.2644,
"step": 72200
},
{
"epoch": 5.891460234680574,
"grad_norm": 18.826217651367188,
"learning_rate": 4.318897215750593e-06,
"loss": 0.2515,
"step": 72300
},
{
"epoch": 5.899608865710561,
"grad_norm": 11.778913497924805,
"learning_rate": 4.304810516218298e-06,
"loss": 0.2628,
"step": 72400
},
{
"epoch": 5.9077574967405475,
"grad_norm": 16.54121971130371,
"learning_rate": 4.290729441225027e-06,
"loss": 0.2792,
"step": 72500
},
{
"epoch": 5.915906127770534,
"grad_norm": 12.631098747253418,
"learning_rate": 4.276654104695915e-06,
"loss": 0.2503,
"step": 72600
},
{
"epoch": 5.924054758800521,
"grad_norm": 10.706419944763184,
"learning_rate": 4.262584620509669e-06,
"loss": 0.2564,
"step": 72700
},
{
"epoch": 5.932203389830509,
"grad_norm": 8.69650650024414,
"learning_rate": 4.248521102497649e-06,
"loss": 0.2569,
"step": 72800
},
{
"epoch": 5.940352020860495,
"grad_norm": 12.438202857971191,
"learning_rate": 4.23446366444294e-06,
"loss": 0.2531,
"step": 72900
},
{
"epoch": 5.948500651890482,
"grad_norm": 22.02505874633789,
"learning_rate": 4.220412420079438e-06,
"loss": 0.2692,
"step": 73000
},
{
"epoch": 5.95664928292047,
"grad_norm": 13.650114059448242,
"learning_rate": 4.206367483090931e-06,
"loss": 0.2663,
"step": 73100
},
{
"epoch": 5.9647979139504566,
"grad_norm": 13.705251693725586,
"learning_rate": 4.192328967110172e-06,
"loss": 0.2295,
"step": 73200
},
{
"epoch": 5.972946544980443,
"grad_norm": 7.683305263519287,
"learning_rate": 4.178296985717967e-06,
"loss": 0.2622,
"step": 73300
},
{
"epoch": 5.98109517601043,
"grad_norm": 7.798497676849365,
"learning_rate": 4.16427165244225e-06,
"loss": 0.2431,
"step": 73400
},
{
"epoch": 5.989243807040417,
"grad_norm": 8.129569053649902,
"learning_rate": 4.150253080757172e-06,
"loss": 0.2372,
"step": 73500
},
{
"epoch": 5.9973924380704045,
"grad_norm": 14.516979217529297,
"learning_rate": 4.136241384082174e-06,
"loss": 0.2801,
"step": 73600
},
{
"epoch": 6.0,
"eval_accuracy": 0.8168674698795181,
"eval_loss": 0.6053332686424255,
"eval_runtime": 7.0202,
"eval_samples_per_second": 354.691,
"eval_steps_per_second": 44.443,
"step": 73632
},
{
"epoch": 6.005541069100391,
"grad_norm": 11.174201965332031,
"learning_rate": 4.122236675781071e-06,
"loss": 0.224,
"step": 73700
},
{
"epoch": 6.013689700130378,
"grad_norm": 24.070091247558594,
"learning_rate": 4.108239069161147e-06,
"loss": 0.2289,
"step": 73800
},
{
"epoch": 6.021838331160365,
"grad_norm": 14.804594993591309,
"learning_rate": 4.09424867747222e-06,
"loss": 0.2017,
"step": 73900
},
{
"epoch": 6.029986962190352,
"grad_norm": 20.014951705932617,
"learning_rate": 4.0802656139057385e-06,
"loss": 0.2203,
"step": 74000
},
{
"epoch": 6.038135593220339,
"grad_norm": 11.608116149902344,
"learning_rate": 4.066289991593859e-06,
"loss": 0.1983,
"step": 74100
},
{
"epoch": 6.046284224250326,
"grad_norm": 10.88152027130127,
"learning_rate": 4.052321923608539e-06,
"loss": 0.2167,
"step": 74200
},
{
"epoch": 6.054432855280313,
"grad_norm": 9.91988754272461,
"learning_rate": 4.038361522960609e-06,
"loss": 0.2114,
"step": 74300
},
{
"epoch": 6.0625814863103,
"grad_norm": 10.7438383102417,
"learning_rate": 4.024408902598871e-06,
"loss": 0.2126,
"step": 74400
},
{
"epoch": 6.070730117340287,
"grad_norm": 13.341911315917969,
"learning_rate": 4.01046417540918e-06,
"loss": 0.2099,
"step": 74500
},
{
"epoch": 6.078878748370274,
"grad_norm": 14.30612564086914,
"learning_rate": 3.996527454213522e-06,
"loss": 0.2159,
"step": 74600
},
{
"epoch": 6.0870273794002605,
"grad_norm": 14.352286338806152,
"learning_rate": 3.98259885176912e-06,
"loss": 0.2314,
"step": 74700
},
{
"epoch": 6.095176010430248,
"grad_norm": 10.346816062927246,
"learning_rate": 3.968678480767503e-06,
"loss": 0.2111,
"step": 74800
},
{
"epoch": 6.103324641460235,
"grad_norm": 16.672042846679688,
"learning_rate": 3.954766453833608e-06,
"loss": 0.199,
"step": 74900
},
{
"epoch": 6.111473272490222,
"grad_norm": 14.719056129455566,
"learning_rate": 3.94086288352486e-06,
"loss": 0.1996,
"step": 75000
},
{
"epoch": 6.119621903520208,
"grad_norm": 15.159549713134766,
"learning_rate": 3.926967882330262e-06,
"loss": 0.2246,
"step": 75100
},
{
"epoch": 6.127770534550195,
"grad_norm": 8.278336524963379,
"learning_rate": 3.913081562669492e-06,
"loss": 0.229,
"step": 75200
},
{
"epoch": 6.135919165580183,
"grad_norm": 17.559757232666016,
"learning_rate": 3.899204036891989e-06,
"loss": 0.2012,
"step": 75300
},
{
"epoch": 6.1440677966101696,
"grad_norm": 11.502748489379883,
"learning_rate": 3.885335417276037e-06,
"loss": 0.202,
"step": 75400
},
{
"epoch": 6.152216427640156,
"grad_norm": 10.84666633605957,
"learning_rate": 3.871475816027868e-06,
"loss": 0.2142,
"step": 75500
},
{
"epoch": 6.160365058670143,
"grad_norm": 15.855389595031738,
"learning_rate": 3.857625345280751e-06,
"loss": 0.2287,
"step": 75600
},
{
"epoch": 6.168513689700131,
"grad_norm": 12.554780960083008,
"learning_rate": 3.843784117094081e-06,
"loss": 0.1949,
"step": 75700
},
{
"epoch": 6.1766623207301175,
"grad_norm": 7.536383628845215,
"learning_rate": 3.829952243452475e-06,
"loss": 0.2062,
"step": 75800
},
{
"epoch": 6.184810951760104,
"grad_norm": 13.602145195007324,
"learning_rate": 3.816129836264864e-06,
"loss": 0.2211,
"step": 75900
},
{
"epoch": 6.192959582790091,
"grad_norm": 10.88949966430664,
"learning_rate": 3.802317007363593e-06,
"loss": 0.2141,
"step": 76000
},
{
"epoch": 6.201108213820079,
"grad_norm": 3.1079776287078857,
"learning_rate": 3.7885138685035113e-06,
"loss": 0.2121,
"step": 76100
},
{
"epoch": 6.209256844850065,
"grad_norm": 10.546631813049316,
"learning_rate": 3.774720531361063e-06,
"loss": 0.2272,
"step": 76200
},
{
"epoch": 6.217405475880052,
"grad_norm": 22.11454200744629,
"learning_rate": 3.7609371075334e-06,
"loss": 0.2118,
"step": 76300
},
{
"epoch": 6.225554106910039,
"grad_norm": 16.33343505859375,
"learning_rate": 3.7471637085374614e-06,
"loss": 0.227,
"step": 76400
},
{
"epoch": 6.2337027379400265,
"grad_norm": 14.43807315826416,
"learning_rate": 3.7334004458090833e-06,
"loss": 0.2287,
"step": 76500
},
{
"epoch": 6.241851368970013,
"grad_norm": 14.813934326171875,
"learning_rate": 3.719647430702089e-06,
"loss": 0.2064,
"step": 76600
},
{
"epoch": 6.25,
"grad_norm": 5.587681770324707,
"learning_rate": 3.705904774487396e-06,
"loss": 0.2051,
"step": 76700
},
{
"epoch": 6.258148631029987,
"grad_norm": 7.330463409423828,
"learning_rate": 3.6921725883521087e-06,
"loss": 0.2225,
"step": 76800
},
{
"epoch": 6.2662972620599735,
"grad_norm": 19.726444244384766,
"learning_rate": 3.678450983398623e-06,
"loss": 0.2131,
"step": 76900
},
{
"epoch": 6.274445893089961,
"grad_norm": 15.526715278625488,
"learning_rate": 3.664740070643723e-06,
"loss": 0.2257,
"step": 77000
},
{
"epoch": 6.282594524119948,
"grad_norm": 9.113424301147461,
"learning_rate": 3.6510399610176906e-06,
"loss": 0.2075,
"step": 77100
},
{
"epoch": 6.290743155149935,
"grad_norm": 11.527823448181152,
"learning_rate": 3.6373507653634e-06,
"loss": 0.1921,
"step": 77200
},
{
"epoch": 6.298891786179921,
"grad_norm": 5.839615345001221,
"learning_rate": 3.6236725944354245e-06,
"loss": 0.2426,
"step": 77300
},
{
"epoch": 6.307040417209909,
"grad_norm": 16.31635284423828,
"learning_rate": 3.6100055588991435e-06,
"loss": 0.206,
"step": 77400
},
{
"epoch": 6.315189048239896,
"grad_norm": 13.138345718383789,
"learning_rate": 3.5963497693298386e-06,
"loss": 0.2223,
"step": 77500
},
{
"epoch": 6.3233376792698825,
"grad_norm": 3.202862024307251,
"learning_rate": 3.5827053362118085e-06,
"loss": 0.2095,
"step": 77600
},
{
"epoch": 6.331486310299869,
"grad_norm": 11.949639320373535,
"learning_rate": 3.5690723699374697e-06,
"loss": 0.2176,
"step": 77700
},
{
"epoch": 6.339634941329857,
"grad_norm": 17.555377960205078,
"learning_rate": 3.5554509808064602e-06,
"loss": 0.2204,
"step": 77800
},
{
"epoch": 6.347783572359844,
"grad_norm": 6.945880889892578,
"learning_rate": 3.5418412790247575e-06,
"loss": 0.2006,
"step": 77900
},
{
"epoch": 6.3559322033898304,
"grad_norm": 29.10856056213379,
"learning_rate": 3.528243374703776e-06,
"loss": 0.2089,
"step": 78000
},
{
"epoch": 6.364080834419817,
"grad_norm": 21.48233413696289,
"learning_rate": 3.5146573778594855e-06,
"loss": 0.2091,
"step": 78100
},
{
"epoch": 6.372229465449805,
"grad_norm": 10.77776050567627,
"learning_rate": 3.5010833984115135e-06,
"loss": 0.1919,
"step": 78200
},
{
"epoch": 6.380378096479792,
"grad_norm": 22.155200958251953,
"learning_rate": 3.4875215461822574e-06,
"loss": 0.2269,
"step": 78300
},
{
"epoch": 6.388526727509778,
"grad_norm": 12.029594421386719,
"learning_rate": 3.473971930896001e-06,
"loss": 0.2328,
"step": 78400
},
{
"epoch": 6.396675358539765,
"grad_norm": 8.563623428344727,
"learning_rate": 3.460434662178024e-06,
"loss": 0.2202,
"step": 78500
},
{
"epoch": 6.404823989569753,
"grad_norm": 6.394750118255615,
"learning_rate": 3.4469098495537063e-06,
"loss": 0.2324,
"step": 78600
},
{
"epoch": 6.4129726205997395,
"grad_norm": 15.485038757324219,
"learning_rate": 3.433397602447659e-06,
"loss": 0.2152,
"step": 78700
},
{
"epoch": 6.421121251629726,
"grad_norm": 15.386170387268066,
"learning_rate": 3.4198980301828256e-06,
"loss": 0.2065,
"step": 78800
},
{
"epoch": 6.429269882659713,
"grad_norm": 11.893247604370117,
"learning_rate": 3.406411241979603e-06,
"loss": 0.2235,
"step": 78900
},
{
"epoch": 6.4374185136897,
"grad_norm": 12.216060638427734,
"learning_rate": 3.3929373469549554e-06,
"loss": 0.211,
"step": 79000
},
{
"epoch": 6.445567144719687,
"grad_norm": 9.018731117248535,
"learning_rate": 3.379476454121533e-06,
"loss": 0.2253,
"step": 79100
},
{
"epoch": 6.453715775749674,
"grad_norm": 18.289003372192383,
"learning_rate": 3.366028672386792e-06,
"loss": 0.2265,
"step": 79200
},
{
"epoch": 6.461864406779661,
"grad_norm": 6.403520584106445,
"learning_rate": 3.35259411055211e-06,
"loss": 0.2241,
"step": 79300
},
{
"epoch": 6.470013037809648,
"grad_norm": 6.311509609222412,
"learning_rate": 3.3391728773119037e-06,
"loss": 0.2204,
"step": 79400
},
{
"epoch": 6.478161668839635,
"grad_norm": 16.21648597717285,
"learning_rate": 3.3257650812527566e-06,
"loss": 0.2083,
"step": 79500
},
{
"epoch": 6.486310299869622,
"grad_norm": 2.8797686100006104,
"learning_rate": 3.3123708308525354e-06,
"loss": 0.2152,
"step": 79600
},
{
"epoch": 6.494458930899609,
"grad_norm": 12.40995979309082,
"learning_rate": 3.298990234479514e-06,
"loss": 0.2061,
"step": 79700
},
{
"epoch": 6.5026075619295955,
"grad_norm": 13.1309814453125,
"learning_rate": 3.2856234003914945e-06,
"loss": 0.196,
"step": 79800
},
{
"epoch": 6.510756192959583,
"grad_norm": 11.270479202270508,
"learning_rate": 3.2722704367349357e-06,
"loss": 0.1969,
"step": 79900
},
{
"epoch": 6.51890482398957,
"grad_norm": 5.54075813293457,
"learning_rate": 3.258931451544075e-06,
"loss": 0.2345,
"step": 80000
},
{
"epoch": 6.527053455019557,
"grad_norm": 9.90404987335205,
"learning_rate": 3.245606552740053e-06,
"loss": 0.2223,
"step": 80100
},
{
"epoch": 6.5352020860495434,
"grad_norm": 16.18077850341797,
"learning_rate": 3.2322958481300426e-06,
"loss": 0.2163,
"step": 80200
},
{
"epoch": 6.54335071707953,
"grad_norm": 6.288787841796875,
"learning_rate": 3.2189994454063776e-06,
"loss": 0.2093,
"step": 80300
},
{
"epoch": 6.551499348109518,
"grad_norm": 21.265981674194336,
"learning_rate": 3.205717452145679e-06,
"loss": 0.1972,
"step": 80400
},
{
"epoch": 6.559647979139505,
"grad_norm": 14.27213191986084,
"learning_rate": 3.1924499758079863e-06,
"loss": 0.2211,
"step": 80500
},
{
"epoch": 6.567796610169491,
"grad_norm": 6.663931369781494,
"learning_rate": 3.1791971237358893e-06,
"loss": 0.2037,
"step": 80600
},
{
"epoch": 6.575945241199479,
"grad_norm": 20.920997619628906,
"learning_rate": 3.1659590031536546e-06,
"loss": 0.2016,
"step": 80700
},
{
"epoch": 6.584093872229466,
"grad_norm": 5.427749156951904,
"learning_rate": 3.1527357211663647e-06,
"loss": 0.2145,
"step": 80800
},
{
"epoch": 6.5922425032594525,
"grad_norm": 5.944066524505615,
"learning_rate": 3.1395273847590444e-06,
"loss": 0.2243,
"step": 80900
},
{
"epoch": 6.600391134289439,
"grad_norm": 4.4831366539001465,
"learning_rate": 3.1263341007958015e-06,
"loss": 0.2251,
"step": 81000
},
{
"epoch": 6.608539765319426,
"grad_norm": 7.92203950881958,
"learning_rate": 3.113155976018959e-06,
"loss": 0.2202,
"step": 81100
},
{
"epoch": 6.616688396349414,
"grad_norm": 7.755978584289551,
"learning_rate": 3.0999931170481922e-06,
"loss": 0.2099,
"step": 81200
},
{
"epoch": 6.6248370273794,
"grad_norm": 12.123492240905762,
"learning_rate": 3.086845630379668e-06,
"loss": 0.2279,
"step": 81300
},
{
"epoch": 6.632985658409387,
"grad_norm": 8.695425987243652,
"learning_rate": 3.073713622385177e-06,
"loss": 0.2171,
"step": 81400
},
{
"epoch": 6.641134289439374,
"grad_norm": 12.858569145202637,
"learning_rate": 3.0605971993112805e-06,
"loss": 0.21,
"step": 81500
},
{
"epoch": 6.6492829204693615,
"grad_norm": 20.741817474365234,
"learning_rate": 3.0474964672784456e-06,
"loss": 0.2101,
"step": 81600
},
{
"epoch": 6.657431551499348,
"grad_norm": 12.694851875305176,
"learning_rate": 3.034411532280193e-06,
"loss": 0.2119,
"step": 81700
},
{
"epoch": 6.665580182529335,
"grad_norm": 11.025914192199707,
"learning_rate": 3.0213425001822266e-06,
"loss": 0.1936,
"step": 81800
},
{
"epoch": 6.673728813559322,
"grad_norm": 8.600627899169922,
"learning_rate": 3.008289476721594e-06,
"loss": 0.2239,
"step": 81900
},
{
"epoch": 6.681877444589309,
"grad_norm": 5.949343681335449,
"learning_rate": 2.9952525675058175e-06,
"loss": 0.2103,
"step": 82000
},
{
"epoch": 6.690026075619296,
"grad_norm": 9.281770706176758,
"learning_rate": 2.9822318780120463e-06,
"loss": 0.2252,
"step": 82100
},
{
"epoch": 6.698174706649283,
"grad_norm": 8.222912788391113,
"learning_rate": 2.9692275135862002e-06,
"loss": 0.199,
"step": 82200
},
{
"epoch": 6.70632333767927,
"grad_norm": 10.598749160766602,
"learning_rate": 2.9562395794421193e-06,
"loss": 0.2244,
"step": 82300
},
{
"epoch": 6.7144719687092564,
"grad_norm": 11.608291625976562,
"learning_rate": 2.9432681806607145e-06,
"loss": 0.2176,
"step": 82400
},
{
"epoch": 6.722620599739244,
"grad_norm": 9.24106216430664,
"learning_rate": 2.9303134221891106e-06,
"loss": 0.2222,
"step": 82500
},
{
"epoch": 6.730769230769231,
"grad_norm": 2.6706371307373047,
"learning_rate": 2.917375408839803e-06,
"loss": 0.2159,
"step": 82600
},
{
"epoch": 6.738917861799218,
"grad_norm": 11.834959030151367,
"learning_rate": 2.904454245289805e-06,
"loss": 0.216,
"step": 82700
},
{
"epoch": 6.747066492829204,
"grad_norm": 3.9120168685913086,
"learning_rate": 2.8915500360798117e-06,
"loss": 0.2051,
"step": 82800
},
{
"epoch": 6.755215123859192,
"grad_norm": 9.347685813903809,
"learning_rate": 2.8786628856133404e-06,
"loss": 0.238,
"step": 82900
},
{
"epoch": 6.763363754889179,
"grad_norm": 7.142603874206543,
"learning_rate": 2.8657928981558926e-06,
"loss": 0.2076,
"step": 83000
},
{
"epoch": 6.7715123859191655,
"grad_norm": 15.814796447753906,
"learning_rate": 2.852940177834111e-06,
"loss": 0.2018,
"step": 83100
},
{
"epoch": 6.779661016949152,
"grad_norm": 11.722209930419922,
"learning_rate": 2.8401048286349353e-06,
"loss": 0.2275,
"step": 83200
},
{
"epoch": 6.78780964797914,
"grad_norm": 10.187668800354004,
"learning_rate": 2.8272869544047622e-06,
"loss": 0.2093,
"step": 83300
},
{
"epoch": 6.795958279009127,
"grad_norm": 15.927581787109375,
"learning_rate": 2.814486658848603e-06,
"loss": 0.2065,
"step": 83400
},
{
"epoch": 6.804106910039113,
"grad_norm": 12.883095741271973,
"learning_rate": 2.8017040455292465e-06,
"loss": 0.2108,
"step": 83500
},
{
"epoch": 6.8122555410691,
"grad_norm": 7.530974864959717,
"learning_rate": 2.788939217866422e-06,
"loss": 0.2139,
"step": 83600
},
{
"epoch": 6.820404172099087,
"grad_norm": 20.07868766784668,
"learning_rate": 2.7761922791359596e-06,
"loss": 0.2205,
"step": 83700
},
{
"epoch": 6.8285528031290745,
"grad_norm": 7.615067481994629,
"learning_rate": 2.7634633324689563e-06,
"loss": 0.2067,
"step": 83800
},
{
"epoch": 6.836701434159061,
"grad_norm": 10.10435962677002,
"learning_rate": 2.7507524808509416e-06,
"loss": 0.2284,
"step": 83900
},
{
"epoch": 6.844850065189048,
"grad_norm": 12.469111442565918,
"learning_rate": 2.738059827121046e-06,
"loss": 0.2086,
"step": 84000
},
{
"epoch": 6.852998696219036,
"grad_norm": 8.140021324157715,
"learning_rate": 2.7253854739711634e-06,
"loss": 0.2162,
"step": 84100
},
{
"epoch": 6.861147327249022,
"grad_norm": 14.818914413452148,
"learning_rate": 2.7127295239451273e-06,
"loss": 0.2153,
"step": 84200
},
{
"epoch": 6.869295958279009,
"grad_norm": 8.947492599487305,
"learning_rate": 2.700092079437877e-06,
"loss": 0.2073,
"step": 84300
},
{
"epoch": 6.877444589308996,
"grad_norm": 8.173857688903809,
"learning_rate": 2.687473242694629e-06,
"loss": 0.2136,
"step": 84400
},
{
"epoch": 6.885593220338983,
"grad_norm": 4.175146579742432,
"learning_rate": 2.6748731158100528e-06,
"loss": 0.2082,
"step": 84500
},
{
"epoch": 6.89374185136897,
"grad_norm": 8.696370124816895,
"learning_rate": 2.6622918007274406e-06,
"loss": 0.2128,
"step": 84600
},
{
"epoch": 6.901890482398957,
"grad_norm": 8.253527641296387,
"learning_rate": 2.649729399237886e-06,
"loss": 0.1985,
"step": 84700
},
{
"epoch": 6.910039113428944,
"grad_norm": 9.825946807861328,
"learning_rate": 2.6371860129794585e-06,
"loss": 0.2084,
"step": 84800
},
{
"epoch": 6.918187744458931,
"grad_norm": 21.79430389404297,
"learning_rate": 2.624661743436383e-06,
"loss": 0.2154,
"step": 84900
},
{
"epoch": 6.926336375488918,
"grad_norm": 17.554534912109375,
"learning_rate": 2.6121566919382168e-06,
"loss": 0.2073,
"step": 85000
},
{
"epoch": 6.934485006518905,
"grad_norm": 14.525189399719238,
"learning_rate": 2.599670959659032e-06,
"loss": 0.2136,
"step": 85100
},
{
"epoch": 6.942633637548892,
"grad_norm": 17.66045570373535,
"learning_rate": 2.5872046476165926e-06,
"loss": 0.2259,
"step": 85200
},
{
"epoch": 6.9507822685788785,
"grad_norm": 12.12194538116455,
"learning_rate": 2.574757856671542e-06,
"loss": 0.2303,
"step": 85300
},
{
"epoch": 6.958930899608866,
"grad_norm": 16.121667861938477,
"learning_rate": 2.5623306875265865e-06,
"loss": 0.209,
"step": 85400
},
{
"epoch": 6.967079530638853,
"grad_norm": 37.0359001159668,
"learning_rate": 2.5499232407256764e-06,
"loss": 0.2135,
"step": 85500
},
{
"epoch": 6.97522816166884,
"grad_norm": 9.753621101379395,
"learning_rate": 2.5375356166531974e-06,
"loss": 0.2246,
"step": 85600
},
{
"epoch": 6.983376792698826,
"grad_norm": 11.933328628540039,
"learning_rate": 2.525167915533153e-06,
"loss": 0.2083,
"step": 85700
},
{
"epoch": 6.991525423728813,
"grad_norm": 11.32873821258545,
"learning_rate": 2.512820237428366e-06,
"loss": 0.221,
"step": 85800
},
{
"epoch": 6.999674054758801,
"grad_norm": 10.335704803466797,
"learning_rate": 2.5004926822396468e-06,
"loss": 0.218,
"step": 85900
},
{
"epoch": 7.0,
"eval_accuracy": 0.8200803212851405,
"eval_loss": 0.6657418608665466,
"eval_runtime": 6.9032,
"eval_samples_per_second": 360.703,
"eval_steps_per_second": 45.196,
"step": 85904
},
{
"epoch": 7.0078226857887875,
"grad_norm": 13.04452133178711,
"learning_rate": 2.4881853497050074e-06,
"loss": 0.1828,
"step": 86000
},
{
"epoch": 7.015971316818774,
"grad_norm": 11.350065231323242,
"learning_rate": 2.475898339398842e-06,
"loss": 0.1981,
"step": 86100
},
{
"epoch": 7.024119947848761,
"grad_norm": 3.5544838905334473,
"learning_rate": 2.463631750731125e-06,
"loss": 0.1873,
"step": 86200
},
{
"epoch": 7.032268578878749,
"grad_norm": 6.474255084991455,
"learning_rate": 2.451385682946606e-06,
"loss": 0.205,
"step": 86300
},
{
"epoch": 7.040417209908735,
"grad_norm": 10.676136016845703,
"learning_rate": 2.43916023512401e-06,
"loss": 0.1702,
"step": 86400
},
{
"epoch": 7.048565840938722,
"grad_norm": 6.142400741577148,
"learning_rate": 2.4269555061752303e-06,
"loss": 0.2017,
"step": 86500
},
{
"epoch": 7.056714471968709,
"grad_norm": 16.273656845092773,
"learning_rate": 2.4147715948445323e-06,
"loss": 0.1776,
"step": 86600
},
{
"epoch": 7.064863102998697,
"grad_norm": 22.690208435058594,
"learning_rate": 2.4026085997077486e-06,
"loss": 0.1762,
"step": 86700
},
{
"epoch": 7.073011734028683,
"grad_norm": 14.49307632446289,
"learning_rate": 2.390466619171492e-06,
"loss": 0.1664,
"step": 86800
},
{
"epoch": 7.08116036505867,
"grad_norm": 14.948646545410156,
"learning_rate": 2.378345751472351e-06,
"loss": 0.1953,
"step": 86900
},
{
"epoch": 7.089308996088657,
"grad_norm": 12.674484252929688,
"learning_rate": 2.3662460946760962e-06,
"loss": 0.1932,
"step": 87000
},
{
"epoch": 7.0974576271186445,
"grad_norm": 14.729815483093262,
"learning_rate": 2.354167746676892e-06,
"loss": 0.1814,
"step": 87100
},
{
"epoch": 7.105606258148631,
"grad_norm": 16.739356994628906,
"learning_rate": 2.3421108051964974e-06,
"loss": 0.1761,
"step": 87200
},
{
"epoch": 7.113754889178618,
"grad_norm": 16.266368865966797,
"learning_rate": 2.330075367783479e-06,
"loss": 0.1947,
"step": 87300
},
{
"epoch": 7.121903520208605,
"grad_norm": 12.137019157409668,
"learning_rate": 2.318061531812422e-06,
"loss": 0.2017,
"step": 87400
},
{
"epoch": 7.130052151238592,
"grad_norm": 7.073469161987305,
"learning_rate": 2.3060693944831404e-06,
"loss": 0.1746,
"step": 87500
},
{
"epoch": 7.138200782268579,
"grad_norm": 7.888490200042725,
"learning_rate": 2.294099052819893e-06,
"loss": 0.1882,
"step": 87600
},
{
"epoch": 7.146349413298566,
"grad_norm": 18.83835792541504,
"learning_rate": 2.282150603670596e-06,
"loss": 0.182,
"step": 87700
},
{
"epoch": 7.154498044328553,
"grad_norm": 9.491145133972168,
"learning_rate": 2.2702241437060463e-06,
"loss": 0.1817,
"step": 87800
},
{
"epoch": 7.162646675358539,
"grad_norm": 11.629495620727539,
"learning_rate": 2.2583197694191272e-06,
"loss": 0.1737,
"step": 87900
},
{
"epoch": 7.170795306388527,
"grad_norm": 3.3986611366271973,
"learning_rate": 2.246437577124038e-06,
"loss": 0.1839,
"step": 88000
},
{
"epoch": 7.178943937418514,
"grad_norm": 3.2696523666381836,
"learning_rate": 2.2345776629555085e-06,
"loss": 0.1896,
"step": 88100
},
{
"epoch": 7.1870925684485005,
"grad_norm": 9.869660377502441,
"learning_rate": 2.2227401228680275e-06,
"loss": 0.2028,
"step": 88200
},
{
"epoch": 7.195241199478487,
"grad_norm": 8.699070930480957,
"learning_rate": 2.2109250526350584e-06,
"loss": 0.2025,
"step": 88300
},
{
"epoch": 7.203389830508475,
"grad_norm": 3.9306254386901855,
"learning_rate": 2.1991325478482695e-06,
"loss": 0.1827,
"step": 88400
},
{
"epoch": 7.211538461538462,
"grad_norm": 18.14926528930664,
"learning_rate": 2.187362703916766e-06,
"loss": 0.1843,
"step": 88500
},
{
"epoch": 7.219687092568448,
"grad_norm": 15.083455085754395,
"learning_rate": 2.175615616066305e-06,
"loss": 0.1932,
"step": 88600
},
{
"epoch": 7.227835723598435,
"grad_norm": 14.958844184875488,
"learning_rate": 2.163891379338535e-06,
"loss": 0.1839,
"step": 88700
},
{
"epoch": 7.235984354628423,
"grad_norm": 9.219823837280273,
"learning_rate": 2.1521900885902214e-06,
"loss": 0.205,
"step": 88800
},
{
"epoch": 7.24413298565841,
"grad_norm": 10.361544609069824,
"learning_rate": 2.1405118384924858e-06,
"loss": 0.1942,
"step": 88900
},
{
"epoch": 7.252281616688396,
"grad_norm": 7.847745418548584,
"learning_rate": 2.128856723530033e-06,
"loss": 0.2046,
"step": 89000
},
{
"epoch": 7.260430247718383,
"grad_norm": 8.953947067260742,
"learning_rate": 2.1172248380003853e-06,
"loss": 0.1903,
"step": 89100
},
{
"epoch": 7.26857887874837,
"grad_norm": 6.825370788574219,
"learning_rate": 2.105616276013133e-06,
"loss": 0.178,
"step": 89200
},
{
"epoch": 7.2767275097783575,
"grad_norm": 10.48969554901123,
"learning_rate": 2.0940311314891574e-06,
"loss": 0.1778,
"step": 89300
},
{
"epoch": 7.284876140808344,
"grad_norm": 13.994695663452148,
"learning_rate": 2.082469498159879e-06,
"loss": 0.1673,
"step": 89400
},
{
"epoch": 7.293024771838331,
"grad_norm": 17.321313858032227,
"learning_rate": 2.0709314695664957e-06,
"loss": 0.2043,
"step": 89500
},
{
"epoch": 7.301173402868318,
"grad_norm": 10.52856731414795,
"learning_rate": 2.0594171390592294e-06,
"loss": 0.1942,
"step": 89600
},
{
"epoch": 7.309322033898305,
"grad_norm": 23.261329650878906,
"learning_rate": 2.047926599796568e-06,
"loss": 0.1816,
"step": 89700
},
{
"epoch": 7.317470664928292,
"grad_norm": 6.534886360168457,
"learning_rate": 2.0364599447445126e-06,
"loss": 0.1808,
"step": 89800
},
{
"epoch": 7.325619295958279,
"grad_norm": 12.067914962768555,
"learning_rate": 2.0250172666758267e-06,
"loss": 0.187,
"step": 89900
},
{
"epoch": 7.333767926988266,
"grad_norm": 11.018478393554688,
"learning_rate": 2.0135986581692817e-06,
"loss": 0.1865,
"step": 90000
},
{
"epoch": 7.341916558018253,
"grad_norm": 9.79710865020752,
"learning_rate": 2.002204211608913e-06,
"loss": 0.1987,
"step": 90100
},
{
"epoch": 7.35006518904824,
"grad_norm": 15.164643287658691,
"learning_rate": 1.990834019183268e-06,
"loss": 0.1973,
"step": 90200
},
{
"epoch": 7.358213820078227,
"grad_norm": 22.170740127563477,
"learning_rate": 1.9794881728846642e-06,
"loss": 0.1702,
"step": 90300
},
{
"epoch": 7.3663624511082135,
"grad_norm": 8.200043678283691,
"learning_rate": 1.968166764508442e-06,
"loss": 0.183,
"step": 90400
},
{
"epoch": 7.374511082138201,
"grad_norm": 6.145725250244141,
"learning_rate": 1.9568698856522215e-06,
"loss": 0.1906,
"step": 90500
},
{
"epoch": 7.382659713168188,
"grad_norm": 22.14548683166504,
"learning_rate": 1.945597627715166e-06,
"loss": 0.1947,
"step": 90600
},
{
"epoch": 7.390808344198175,
"grad_norm": 10.075164794921875,
"learning_rate": 1.934350081897237e-06,
"loss": 0.171,
"step": 90700
},
{
"epoch": 7.398956975228161,
"grad_norm": 6.933922290802002,
"learning_rate": 1.923127339198459e-06,
"loss": 0.1845,
"step": 90800
},
{
"epoch": 7.407105606258149,
"grad_norm": 26.223041534423828,
"learning_rate": 1.9119294904181847e-06,
"loss": 0.1852,
"step": 90900
},
{
"epoch": 7.415254237288136,
"grad_norm": 4.778967380523682,
"learning_rate": 1.900756626154356e-06,
"loss": 0.1958,
"step": 91000
},
{
"epoch": 7.423402868318123,
"grad_norm": 29.773698806762695,
"learning_rate": 1.889608836802776e-06,
"loss": 0.1809,
"step": 91100
},
{
"epoch": 7.431551499348109,
"grad_norm": 8.9940767288208,
"learning_rate": 1.8784862125563734e-06,
"loss": 0.1869,
"step": 91200
},
{
"epoch": 7.439700130378096,
"grad_norm": 15.34753704071045,
"learning_rate": 1.8673888434044756e-06,
"loss": 0.1863,
"step": 91300
},
{
"epoch": 7.447848761408084,
"grad_norm": 19.44320297241211,
"learning_rate": 1.8563168191320823e-06,
"loss": 0.1798,
"step": 91400
},
{
"epoch": 7.4559973924380705,
"grad_norm": 12.468984603881836,
"learning_rate": 1.8452702293191339e-06,
"loss": 0.1808,
"step": 91500
},
{
"epoch": 7.464146023468057,
"grad_norm": 8.79600715637207,
"learning_rate": 1.8342491633397863e-06,
"loss": 0.1823,
"step": 91600
},
{
"epoch": 7.472294654498044,
"grad_norm": 15.76307487487793,
"learning_rate": 1.8232537103616953e-06,
"loss": 0.1959,
"step": 91700
},
{
"epoch": 7.480443285528032,
"grad_norm": 9.05780029296875,
"learning_rate": 1.8122839593452902e-06,
"loss": 0.1797,
"step": 91800
},
{
"epoch": 7.488591916558018,
"grad_norm": 11.826004981994629,
"learning_rate": 1.8013399990430525e-06,
"loss": 0.1639,
"step": 91900
},
{
"epoch": 7.496740547588005,
"grad_norm": 20.31383514404297,
"learning_rate": 1.7904219179988007e-06,
"loss": 0.1916,
"step": 92000
},
{
"epoch": 7.504889178617992,
"grad_norm": 18.240629196166992,
"learning_rate": 1.7795298045469766e-06,
"loss": 0.1791,
"step": 92100
},
{
"epoch": 7.5130378096479795,
"grad_norm": 20.392873764038086,
"learning_rate": 1.7686637468119223e-06,
"loss": 0.2021,
"step": 92200
},
{
"epoch": 7.521186440677966,
"grad_norm": 9.732405662536621,
"learning_rate": 1.757823832707175e-06,
"loss": 0.1818,
"step": 92300
},
{
"epoch": 7.529335071707953,
"grad_norm": 21.23190689086914,
"learning_rate": 1.7470101499347498e-06,
"loss": 0.1692,
"step": 92400
},
{
"epoch": 7.53748370273794,
"grad_norm": 7.4514641761779785,
"learning_rate": 1.736222785984435e-06,
"loss": 0.2084,
"step": 92500
},
{
"epoch": 7.5456323337679265,
"grad_norm": 13.29001522064209,
"learning_rate": 1.7254618281330838e-06,
"loss": 0.1897,
"step": 92600
},
{
"epoch": 7.553780964797914,
"grad_norm": 9.683525085449219,
"learning_rate": 1.7147273634439021e-06,
"loss": 0.156,
"step": 92700
},
{
"epoch": 7.561929595827901,
"grad_norm": 12.022348403930664,
"learning_rate": 1.7040194787657566e-06,
"loss": 0.2136,
"step": 92800
},
{
"epoch": 7.570078226857888,
"grad_norm": 11.087843894958496,
"learning_rate": 1.6933382607324572e-06,
"loss": 0.171,
"step": 92900
},
{
"epoch": 7.578226857887875,
"grad_norm": 20.101045608520508,
"learning_rate": 1.6826837957620662e-06,
"loss": 0.2131,
"step": 93000
},
{
"epoch": 7.586375488917862,
"grad_norm": 13.087589263916016,
"learning_rate": 1.672056170056196e-06,
"loss": 0.1791,
"step": 93100
},
{
"epoch": 7.594524119947849,
"grad_norm": 9.458551406860352,
"learning_rate": 1.6614554695993085e-06,
"loss": 0.1746,
"step": 93200
},
{
"epoch": 7.602672750977836,
"grad_norm": 12.884553909301758,
"learning_rate": 1.6508817801580268e-06,
"loss": 0.1673,
"step": 93300
},
{
"epoch": 7.610821382007822,
"grad_norm": 10.40186595916748,
"learning_rate": 1.6403351872804347e-06,
"loss": 0.1659,
"step": 93400
},
{
"epoch": 7.61897001303781,
"grad_norm": 12.832286834716797,
"learning_rate": 1.6298157762953897e-06,
"loss": 0.1693,
"step": 93500
},
{
"epoch": 7.627118644067797,
"grad_norm": 13.989652633666992,
"learning_rate": 1.6193236323118283e-06,
"loss": 0.203,
"step": 93600
},
{
"epoch": 7.6352672750977835,
"grad_norm": 13.184144020080566,
"learning_rate": 1.6088588402180783e-06,
"loss": 0.1983,
"step": 93700
},
{
"epoch": 7.64341590612777,
"grad_norm": 50.71080017089844,
"learning_rate": 1.5984214846811735e-06,
"loss": 0.1837,
"step": 93800
},
{
"epoch": 7.651564537157758,
"grad_norm": 8.608222007751465,
"learning_rate": 1.588011650146169e-06,
"loss": 0.1786,
"step": 93900
},
{
"epoch": 7.659713168187745,
"grad_norm": 9.973206520080566,
"learning_rate": 1.5776294208354537e-06,
"loss": 0.1873,
"step": 94000
},
{
"epoch": 7.667861799217731,
"grad_norm": 3.6279351711273193,
"learning_rate": 1.5672748807480736e-06,
"loss": 0.1754,
"step": 94100
},
{
"epoch": 7.676010430247718,
"grad_norm": 13.710479736328125,
"learning_rate": 1.5569481136590554e-06,
"loss": 0.1973,
"step": 94200
},
{
"epoch": 7.684159061277706,
"grad_norm": 20.849790573120117,
"learning_rate": 1.5466492031187174e-06,
"loss": 0.1953,
"step": 94300
},
{
"epoch": 7.6923076923076925,
"grad_norm": 16.05866241455078,
"learning_rate": 1.5363782324520033e-06,
"loss": 0.1834,
"step": 94400
},
{
"epoch": 7.700456323337679,
"grad_norm": 10.594083786010742,
"learning_rate": 1.5261352847578044e-06,
"loss": 0.196,
"step": 94500
},
{
"epoch": 7.708604954367666,
"grad_norm": 14.200790405273438,
"learning_rate": 1.5159204429082874e-06,
"loss": 0.1793,
"step": 94600
},
{
"epoch": 7.716753585397653,
"grad_norm": 3.8873071670532227,
"learning_rate": 1.5057337895482255e-06,
"loss": 0.1865,
"step": 94700
},
{
"epoch": 7.72490221642764,
"grad_norm": 13.96704387664795,
"learning_rate": 1.4955754070943268e-06,
"loss": 0.1653,
"step": 94800
},
{
"epoch": 7.733050847457627,
"grad_norm": 23.539247512817383,
"learning_rate": 1.48544537773457e-06,
"loss": 0.1713,
"step": 94900
},
{
"epoch": 7.741199478487614,
"grad_norm": 14.154293060302734,
"learning_rate": 1.4753437834275397e-06,
"loss": 0.1894,
"step": 95000
},
{
"epoch": 7.749348109517601,
"grad_norm": 8.608110427856445,
"learning_rate": 1.4652707059017607e-06,
"loss": 0.1887,
"step": 95100
},
{
"epoch": 7.757496740547588,
"grad_norm": 9.453892707824707,
"learning_rate": 1.4552262266550382e-06,
"loss": 0.1769,
"step": 95200
},
{
"epoch": 7.765645371577575,
"grad_norm": 12.239083290100098,
"learning_rate": 1.4452104269538009e-06,
"loss": 0.1699,
"step": 95300
},
{
"epoch": 7.773794002607562,
"grad_norm": 10.937909126281738,
"learning_rate": 1.4352233878324384e-06,
"loss": 0.1667,
"step": 95400
},
{
"epoch": 7.781942633637549,
"grad_norm": 21.223346710205078,
"learning_rate": 1.4252651900926496e-06,
"loss": 0.182,
"step": 95500
},
{
"epoch": 7.790091264667536,
"grad_norm": 7.070313453674316,
"learning_rate": 1.4153359143027879e-06,
"loss": 0.1896,
"step": 95600
},
{
"epoch": 7.798239895697523,
"grad_norm": 14.346339225769043,
"learning_rate": 1.4054356407972086e-06,
"loss": 0.1743,
"step": 95700
},
{
"epoch": 7.80638852672751,
"grad_norm": 15.966556549072266,
"learning_rate": 1.3955644496756199e-06,
"loss": 0.1902,
"step": 95800
},
{
"epoch": 7.8145371577574965,
"grad_norm": 16.198644638061523,
"learning_rate": 1.3857224208024345e-06,
"loss": 0.1945,
"step": 95900
},
{
"epoch": 7.822685788787483,
"grad_norm": 8.803377151489258,
"learning_rate": 1.3759096338061222e-06,
"loss": 0.1793,
"step": 96000
},
{
"epoch": 7.830834419817471,
"grad_norm": 19.771717071533203,
"learning_rate": 1.3661261680785693e-06,
"loss": 0.1809,
"step": 96100
},
{
"epoch": 7.838983050847458,
"grad_norm": 11.52552318572998,
"learning_rate": 1.3563721027744309e-06,
"loss": 0.1887,
"step": 96200
},
{
"epoch": 7.847131681877444,
"grad_norm": 17.998104095458984,
"learning_rate": 1.3466475168104953e-06,
"loss": 0.2107,
"step": 96300
},
{
"epoch": 7.855280312907432,
"grad_norm": 6.081639289855957,
"learning_rate": 1.3369524888650437e-06,
"loss": 0.1849,
"step": 96400
},
{
"epoch": 7.863428943937419,
"grad_norm": 6.099484443664551,
"learning_rate": 1.3272870973772118e-06,
"loss": 0.1847,
"step": 96500
},
{
"epoch": 7.8715775749674055,
"grad_norm": 19.433902740478516,
"learning_rate": 1.3176514205463586e-06,
"loss": 0.2,
"step": 96600
},
{
"epoch": 7.879726205997392,
"grad_norm": 6.365217208862305,
"learning_rate": 1.3080455363314309e-06,
"loss": 0.2062,
"step": 96700
},
{
"epoch": 7.887874837027379,
"grad_norm": 9.893994331359863,
"learning_rate": 1.2984695224503351e-06,
"loss": 0.1721,
"step": 96800
},
{
"epoch": 7.896023468057367,
"grad_norm": 22.75550079345703,
"learning_rate": 1.2889234563793058e-06,
"loss": 0.204,
"step": 96900
},
{
"epoch": 7.904172099087353,
"grad_norm": 2.8168067932128906,
"learning_rate": 1.279407415352279e-06,
"loss": 0.1963,
"step": 97000
},
{
"epoch": 7.91232073011734,
"grad_norm": 19.346757888793945,
"learning_rate": 1.2699214763602741e-06,
"loss": 0.1845,
"step": 97100
},
{
"epoch": 7.920469361147327,
"grad_norm": 13.861513137817383,
"learning_rate": 1.2604657161507566e-06,
"loss": 0.1934,
"step": 97200
},
{
"epoch": 7.9286179921773146,
"grad_norm": 12.996659278869629,
"learning_rate": 1.2510402112270326e-06,
"loss": 0.1808,
"step": 97300
},
{
"epoch": 7.936766623207301,
"grad_norm": 16.255569458007812,
"learning_rate": 1.2416450378476196e-06,
"loss": 0.1919,
"step": 97400
},
{
"epoch": 7.944915254237288,
"grad_norm": 9.47265625,
"learning_rate": 1.2322802720256355e-06,
"loss": 0.1887,
"step": 97500
},
{
"epoch": 7.953063885267275,
"grad_norm": 13.006512641906738,
"learning_rate": 1.2229459895281787e-06,
"loss": 0.1927,
"step": 97600
},
{
"epoch": 7.9612125162972625,
"grad_norm": 13.849684715270996,
"learning_rate": 1.213642265875718e-06,
"loss": 0.1906,
"step": 97700
},
{
"epoch": 7.969361147327249,
"grad_norm": 25.117225646972656,
"learning_rate": 1.2043691763414844e-06,
"loss": 0.1659,
"step": 97800
},
{
"epoch": 7.977509778357236,
"grad_norm": 9.633444786071777,
"learning_rate": 1.1951267959508562e-06,
"loss": 0.1923,
"step": 97900
},
{
"epoch": 7.985658409387223,
"grad_norm": 9.853534698486328,
"learning_rate": 1.185915199480751e-06,
"loss": 0.1969,
"step": 98000
},
{
"epoch": 7.9938070404172095,
"grad_norm": 12.424792289733887,
"learning_rate": 1.1767344614590303e-06,
"loss": 0.1772,
"step": 98100
},
{
"epoch": 8.0,
"eval_accuracy": 0.8200803212851405,
"eval_loss": 0.7215536236763,
"eval_runtime": 7.0555,
"eval_samples_per_second": 352.917,
"eval_steps_per_second": 44.221,
"step": 98176
},
{
"epoch": 8.001955671447197,
"grad_norm": 13.437636375427246,
"learning_rate": 1.167584656163887e-06,
"loss": 0.1774,
"step": 98200
},
{
"epoch": 8.010104302477183,
"grad_norm": 14.577449798583984,
"learning_rate": 1.1584658576232482e-06,
"loss": 0.1693,
"step": 98300
},
{
"epoch": 8.01825293350717,
"grad_norm": 18.45952606201172,
"learning_rate": 1.1493781396141795e-06,
"loss": 0.17,
"step": 98400
},
{
"epoch": 8.026401564537158,
"grad_norm": 18.29120635986328,
"learning_rate": 1.1403215756622804e-06,
"loss": 0.178,
"step": 98500
},
{
"epoch": 8.034550195567144,
"grad_norm": 11.486896514892578,
"learning_rate": 1.1312962390410954e-06,
"loss": 0.1815,
"step": 98600
},
{
"epoch": 8.042698826597132,
"grad_norm": 19.90141487121582,
"learning_rate": 1.1223022027715197e-06,
"loss": 0.1682,
"step": 98700
},
{
"epoch": 8.05084745762712,
"grad_norm": 11.248079299926758,
"learning_rate": 1.1133395396212048e-06,
"loss": 0.169,
"step": 98800
},
{
"epoch": 8.058996088657105,
"grad_norm": 7.839399814605713,
"learning_rate": 1.104408322103978e-06,
"loss": 0.1684,
"step": 98900
},
{
"epoch": 8.067144719687093,
"grad_norm": 8.082372665405273,
"learning_rate": 1.095508622479247e-06,
"loss": 0.1769,
"step": 99000
},
{
"epoch": 8.075293350717079,
"grad_norm": 9.952238082885742,
"learning_rate": 1.0866405127514234e-06,
"loss": 0.1866,
"step": 99100
},
{
"epoch": 8.083441981747066,
"grad_norm": 5.250309467315674,
"learning_rate": 1.0778040646693316e-06,
"loss": 0.162,
"step": 99200
},
{
"epoch": 8.091590612777054,
"grad_norm": 9.988779067993164,
"learning_rate": 1.0689993497256336e-06,
"loss": 0.177,
"step": 99300
},
{
"epoch": 8.09973924380704,
"grad_norm": 8.978513717651367,
"learning_rate": 1.0602264391562506e-06,
"loss": 0.151,
"step": 99400
},
{
"epoch": 8.107887874837028,
"grad_norm": 23.60556983947754,
"learning_rate": 1.051485403939786e-06,
"loss": 0.1734,
"step": 99500
},
{
"epoch": 8.116036505867015,
"grad_norm": 10.938061714172363,
"learning_rate": 1.0427763147969467e-06,
"loss": 0.1733,
"step": 99600
},
{
"epoch": 8.124185136897001,
"grad_norm": 5.527510643005371,
"learning_rate": 1.0340992421899776e-06,
"loss": 0.1565,
"step": 99700
},
{
"epoch": 8.132333767926989,
"grad_norm": 9.493518829345703,
"learning_rate": 1.0254542563220922e-06,
"loss": 0.181,
"step": 99800
},
{
"epoch": 8.140482398956975,
"grad_norm": 7.9793548583984375,
"learning_rate": 1.0168414271368953e-06,
"loss": 0.1837,
"step": 99900
},
{
"epoch": 8.148631029986962,
"grad_norm": 11.252303123474121,
"learning_rate": 1.0082608243178276e-06,
"loss": 0.1708,
"step": 100000
},
{
"epoch": 8.15677966101695,
"grad_norm": 14.102470397949219,
"learning_rate": 9.997125172875943e-07,
"loss": 0.1884,
"step": 100100
},
{
"epoch": 8.164928292046936,
"grad_norm": 38.51998519897461,
"learning_rate": 9.91196575207608e-07,
"loss": 0.184,
"step": 100200
},
{
"epoch": 8.173076923076923,
"grad_norm": 7.0270466804504395,
"learning_rate": 9.82713066977427e-07,
"loss": 0.1489,
"step": 100300
},
{
"epoch": 8.18122555410691,
"grad_norm": 14.944999694824219,
"learning_rate": 9.742620612341992e-07,
"loss": 0.1835,
"step": 100400
},
{
"epoch": 8.189374185136897,
"grad_norm": 7.147238731384277,
"learning_rate": 9.658436263521048e-07,
"loss": 0.1512,
"step": 100500
},
{
"epoch": 8.197522816166884,
"grad_norm": 5.465837001800537,
"learning_rate": 9.574578304418063e-07,
"loss": 0.1702,
"step": 100600
},
{
"epoch": 8.20567144719687,
"grad_norm": 4.3965630531311035,
"learning_rate": 9.491047413498933e-07,
"loss": 0.1619,
"step": 100700
},
{
"epoch": 8.213820078226858,
"grad_norm": 21.602157592773438,
"learning_rate": 9.407844266583377e-07,
"loss": 0.1726,
"step": 100800
},
{
"epoch": 8.221968709256846,
"grad_norm": 16.533201217651367,
"learning_rate": 9.324969536839435e-07,
"loss": 0.1564,
"step": 100900
},
{
"epoch": 8.230117340286832,
"grad_norm": 17.454898834228516,
"learning_rate": 9.242423894778046e-07,
"loss": 0.1847,
"step": 101000
},
{
"epoch": 8.23826597131682,
"grad_norm": 17.726686477661133,
"learning_rate": 9.160208008247618e-07,
"loss": 0.1695,
"step": 101100
},
{
"epoch": 8.246414602346805,
"grad_norm": 31.844257354736328,
"learning_rate": 9.078322542428597e-07,
"loss": 0.1698,
"step": 101200
},
{
"epoch": 8.254563233376793,
"grad_norm": 9.689949989318848,
"learning_rate": 8.99676815982814e-07,
"loss": 0.153,
"step": 101300
},
{
"epoch": 8.26271186440678,
"grad_norm": 13.61907958984375,
"learning_rate": 8.915545520274699e-07,
"loss": 0.177,
"step": 101400
},
{
"epoch": 8.270860495436766,
"grad_norm": 11.14121150970459,
"learning_rate": 8.834655280912718e-07,
"loss": 0.1674,
"step": 101500
},
{
"epoch": 8.279009126466754,
"grad_norm": 12.197967529296875,
"learning_rate": 8.754098096197312e-07,
"loss": 0.1787,
"step": 101600
},
{
"epoch": 8.28715775749674,
"grad_norm": 12.565035820007324,
"learning_rate": 8.67387461788895e-07,
"loss": 0.1679,
"step": 101700
},
{
"epoch": 8.295306388526727,
"grad_norm": 21.256549835205078,
"learning_rate": 8.593985495048201e-07,
"loss": 0.1695,
"step": 101800
},
{
"epoch": 8.303455019556715,
"grad_norm": 4.485990524291992,
"learning_rate": 8.514431374030496e-07,
"loss": 0.1654,
"step": 101900
},
{
"epoch": 8.3116036505867,
"grad_norm": 13.213761329650879,
"learning_rate": 8.435212898480855e-07,
"loss": 0.1626,
"step": 102000
},
{
"epoch": 8.319752281616688,
"grad_norm": 19.035646438598633,
"learning_rate": 8.356330709328725e-07,
"loss": 0.1611,
"step": 102100
},
{
"epoch": 8.327900912646676,
"grad_norm": 21.1912841796875,
"learning_rate": 8.277785444782765e-07,
"loss": 0.1607,
"step": 102200
},
{
"epoch": 8.336049543676662,
"grad_norm": 19.324132919311523,
"learning_rate": 8.199577740325703e-07,
"loss": 0.1741,
"step": 102300
},
{
"epoch": 8.34419817470665,
"grad_norm": 8.325228691101074,
"learning_rate": 8.121708228709174e-07,
"loss": 0.1808,
"step": 102400
},
{
"epoch": 8.352346805736635,
"grad_norm": 11.028812408447266,
"learning_rate": 8.044177539948617e-07,
"loss": 0.169,
"step": 102500
},
{
"epoch": 8.360495436766623,
"grad_norm": 20.587303161621094,
"learning_rate": 7.966986301318158e-07,
"loss": 0.1569,
"step": 102600
},
{
"epoch": 8.36864406779661,
"grad_norm": 8.49282455444336,
"learning_rate": 7.890135137345589e-07,
"loss": 0.1584,
"step": 102700
},
{
"epoch": 8.376792698826597,
"grad_norm": 14.866241455078125,
"learning_rate": 7.813624669807246e-07,
"loss": 0.1608,
"step": 102800
},
{
"epoch": 8.384941329856584,
"grad_norm": 3.761150598526001,
"learning_rate": 7.73745551772298e-07,
"loss": 0.1533,
"step": 102900
},
{
"epoch": 8.393089960886572,
"grad_norm": 17.36056900024414,
"learning_rate": 7.66162829735122e-07,
"loss": 0.1723,
"step": 103000
},
{
"epoch": 8.401238591916558,
"grad_norm": 14.63774585723877,
"learning_rate": 7.586143622183922e-07,
"loss": 0.1769,
"step": 103100
},
{
"epoch": 8.409387222946545,
"grad_norm": 15.453008651733398,
"learning_rate": 7.511002102941639e-07,
"loss": 0.1845,
"step": 103200
},
{
"epoch": 8.417535853976531,
"grad_norm": 23.958969116210938,
"learning_rate": 7.436204347568548e-07,
"loss": 0.1829,
"step": 103300
},
{
"epoch": 8.425684485006519,
"grad_norm": 22.29449462890625,
"learning_rate": 7.361750961227587e-07,
"loss": 0.1722,
"step": 103400
},
{
"epoch": 8.433833116036507,
"grad_norm": 12.636420249938965,
"learning_rate": 7.287642546295487e-07,
"loss": 0.1614,
"step": 103500
},
{
"epoch": 8.441981747066492,
"grad_norm": 12.580671310424805,
"learning_rate": 7.213879702357951e-07,
"loss": 0.1713,
"step": 103600
},
{
"epoch": 8.45013037809648,
"grad_norm": 9.213543891906738,
"learning_rate": 7.140463026204764e-07,
"loss": 0.1619,
"step": 103700
},
{
"epoch": 8.458279009126466,
"grad_norm": 15.926830291748047,
"learning_rate": 7.067393111825016e-07,
"loss": 0.1748,
"step": 103800
},
{
"epoch": 8.466427640156454,
"grad_norm": 22.008920669555664,
"learning_rate": 6.994670550402249e-07,
"loss": 0.1926,
"step": 103900
},
{
"epoch": 8.474576271186441,
"grad_norm": 4.002703666687012,
"learning_rate": 6.922295930309691e-07,
"loss": 0.1613,
"step": 104000
},
{
"epoch": 8.482724902216427,
"grad_norm": 10.932751655578613,
"learning_rate": 6.850269837105522e-07,
"loss": 0.1635,
"step": 104100
},
{
"epoch": 8.490873533246415,
"grad_norm": 20.70867347717285,
"learning_rate": 6.778592853528077e-07,
"loss": 0.1708,
"step": 104200
},
{
"epoch": 8.499022164276402,
"grad_norm": 9.567403793334961,
"learning_rate": 6.707265559491188e-07,
"loss": 0.1814,
"step": 104300
},
{
"epoch": 8.507170795306388,
"grad_norm": 24.9285888671875,
"learning_rate": 6.63628853207946e-07,
"loss": 0.1746,
"step": 104400
},
{
"epoch": 8.515319426336376,
"grad_norm": 12.97628402709961,
"learning_rate": 6.565662345543595e-07,
"loss": 0.17,
"step": 104500
},
{
"epoch": 8.523468057366362,
"grad_norm": 5.221209526062012,
"learning_rate": 6.495387571295785e-07,
"loss": 0.1726,
"step": 104600
},
{
"epoch": 8.53161668839635,
"grad_norm": 12.438835144042969,
"learning_rate": 6.42546477790506e-07,
"loss": 0.1703,
"step": 104700
},
{
"epoch": 8.539765319426337,
"grad_norm": 9.98957633972168,
"learning_rate": 6.355894531092705e-07,
"loss": 0.1883,
"step": 104800
},
{
"epoch": 8.547913950456323,
"grad_norm": 8.844900131225586,
"learning_rate": 6.286677393727653e-07,
"loss": 0.1623,
"step": 104900
},
{
"epoch": 8.55606258148631,
"grad_norm": 5.921658039093018,
"learning_rate": 6.217813925821958e-07,
"loss": 0.16,
"step": 105000
},
{
"epoch": 8.564211212516298,
"grad_norm": 12.132319450378418,
"learning_rate": 6.149304684526253e-07,
"loss": 0.1843,
"step": 105100
},
{
"epoch": 8.572359843546284,
"grad_norm": 13.31769847869873,
"learning_rate": 6.081150224125254e-07,
"loss": 0.1586,
"step": 105200
},
{
"epoch": 8.580508474576272,
"grad_norm": 21.240800857543945,
"learning_rate": 6.013351096033254e-07,
"loss": 0.1783,
"step": 105300
},
{
"epoch": 8.588657105606258,
"grad_norm": 9.178833961486816,
"learning_rate": 5.945907848789667e-07,
"loss": 0.1847,
"step": 105400
},
{
"epoch": 8.596805736636245,
"grad_norm": 7.893414497375488,
"learning_rate": 5.878821028054637e-07,
"loss": 0.1474,
"step": 105500
},
{
"epoch": 8.604954367666233,
"grad_norm": 17.363147735595703,
"learning_rate": 5.812091176604551e-07,
"loss": 0.1567,
"step": 105600
},
{
"epoch": 8.613102998696219,
"grad_norm": 7.612610340118408,
"learning_rate": 5.745718834327679e-07,
"loss": 0.158,
"step": 105700
},
{
"epoch": 8.621251629726206,
"grad_norm": 12.395828247070312,
"learning_rate": 5.679704538219827e-07,
"loss": 0.1817,
"step": 105800
},
{
"epoch": 8.629400260756192,
"grad_norm": 2.951467514038086,
"learning_rate": 5.614048822379947e-07,
"loss": 0.1731,
"step": 105900
},
{
"epoch": 8.63754889178618,
"grad_norm": 14.023295402526855,
"learning_rate": 5.548752218005882e-07,
"loss": 0.1638,
"step": 106000
},
{
"epoch": 8.645697522816167,
"grad_norm": 21.505937576293945,
"learning_rate": 5.483815253389957e-07,
"loss": 0.1529,
"step": 106100
},
{
"epoch": 8.653846153846153,
"grad_norm": 8.31225299835205,
"learning_rate": 5.41923845391486e-07,
"loss": 0.1563,
"step": 106200
},
{
"epoch": 8.661994784876141,
"grad_norm": 9.446884155273438,
"learning_rate": 5.355022342049249e-07,
"loss": 0.1622,
"step": 106300
},
{
"epoch": 8.670143415906129,
"grad_norm": 21.06761360168457,
"learning_rate": 5.291167437343608e-07,
"loss": 0.1602,
"step": 106400
},
{
"epoch": 8.678292046936114,
"grad_norm": 13.025223731994629,
"learning_rate": 5.227674256426002e-07,
"loss": 0.1611,
"step": 106500
},
{
"epoch": 8.686440677966102,
"grad_norm": 6.65778923034668,
"learning_rate": 5.164543312997922e-07,
"loss": 0.1677,
"step": 106600
},
{
"epoch": 8.694589308996088,
"grad_norm": 25.8751220703125,
"learning_rate": 5.101775117830121e-07,
"loss": 0.1639,
"step": 106700
},
{
"epoch": 8.702737940026076,
"grad_norm": 18.437524795532227,
"learning_rate": 5.039370178758485e-07,
"loss": 0.1651,
"step": 106800
},
{
"epoch": 8.710886571056063,
"grad_norm": 31.746627807617188,
"learning_rate": 4.977329000679903e-07,
"loss": 0.1758,
"step": 106900
},
{
"epoch": 8.719035202086049,
"grad_norm": 12.55679988861084,
"learning_rate": 4.915652085548217e-07,
"loss": 0.1571,
"step": 107000
},
{
"epoch": 8.727183833116037,
"grad_norm": 1.4074722528457642,
"learning_rate": 4.854339932370134e-07,
"loss": 0.1526,
"step": 107100
},
{
"epoch": 8.735332464146023,
"grad_norm": 5.811018466949463,
"learning_rate": 4.793393037201194e-07,
"loss": 0.1745,
"step": 107200
},
{
"epoch": 8.74348109517601,
"grad_norm": 2.8639020919799805,
"learning_rate": 4.7328118931417753e-07,
"loss": 0.1695,
"step": 107300
},
{
"epoch": 8.751629726205998,
"grad_norm": 20.180130004882812,
"learning_rate": 4.672596990333073e-07,
"loss": 0.1758,
"step": 107400
},
{
"epoch": 8.759778357235984,
"grad_norm": 19.003700256347656,
"learning_rate": 4.6127488159531495e-07,
"loss": 0.1669,
"step": 107500
},
{
"epoch": 8.767926988265971,
"grad_norm": 12.393278121948242,
"learning_rate": 4.553267854213017e-07,
"loss": 0.1827,
"step": 107600
},
{
"epoch": 8.776075619295959,
"grad_norm": 23.79950714111328,
"learning_rate": 4.494154586352667e-07,
"loss": 0.1571,
"step": 107700
},
{
"epoch": 8.784224250325945,
"grad_norm": 21.107633590698242,
"learning_rate": 4.435409490637227e-07,
"loss": 0.1744,
"step": 107800
},
{
"epoch": 8.792372881355933,
"grad_norm": 15.573356628417969,
"learning_rate": 4.3770330423530626e-07,
"loss": 0.1675,
"step": 107900
},
{
"epoch": 8.800521512385918,
"grad_norm": 14.63633918762207,
"learning_rate": 4.3190257138039313e-07,
"loss": 0.1667,
"step": 108000
},
{
"epoch": 8.808670143415906,
"grad_norm": 15.823701858520508,
"learning_rate": 4.2613879743071907e-07,
"loss": 0.164,
"step": 108100
},
{
"epoch": 8.816818774445894,
"grad_norm": 7.163984775543213,
"learning_rate": 4.204120290189956e-07,
"loss": 0.1648,
"step": 108200
},
{
"epoch": 8.82496740547588,
"grad_norm": 10.87267780303955,
"learning_rate": 4.147223124785366e-07,
"loss": 0.1767,
"step": 108300
},
{
"epoch": 8.833116036505867,
"grad_norm": 13.024577140808105,
"learning_rate": 4.0906969384288396e-07,
"loss": 0.1561,
"step": 108400
},
{
"epoch": 8.841264667535853,
"grad_norm": 15.831514358520508,
"learning_rate": 4.034542188454282e-07,
"loss": 0.2002,
"step": 108500
},
{
"epoch": 8.84941329856584,
"grad_norm": 8.199058532714844,
"learning_rate": 3.9787593291904793e-07,
"loss": 0.1823,
"step": 108600
},
{
"epoch": 8.857561929595828,
"grad_norm": 14.69583511352539,
"learning_rate": 3.9233488119573506e-07,
"loss": 0.1779,
"step": 108700
},
{
"epoch": 8.865710560625814,
"grad_norm": 12.765257835388184,
"learning_rate": 3.868311085062337e-07,
"loss": 0.1626,
"step": 108800
},
{
"epoch": 8.873859191655802,
"grad_norm": 31.990026473999023,
"learning_rate": 3.8136465937967657e-07,
"loss": 0.1856,
"step": 108900
},
{
"epoch": 8.88200782268579,
"grad_norm": 24.627126693725586,
"learning_rate": 3.7593557804322167e-07,
"loss": 0.1518,
"step": 109000
},
{
"epoch": 8.890156453715775,
"grad_norm": 32.763092041015625,
"learning_rate": 3.705439084217016e-07,
"loss": 0.1526,
"step": 109100
},
{
"epoch": 8.898305084745763,
"grad_norm": 14.418821334838867,
"learning_rate": 3.6518969413725905e-07,
"loss": 0.1602,
"step": 109200
},
{
"epoch": 8.906453715775749,
"grad_norm": 9.382340431213379,
"learning_rate": 3.5987297850900217e-07,
"loss": 0.1742,
"step": 109300
},
{
"epoch": 8.914602346805736,
"grad_norm": 22.482595443725586,
"learning_rate": 3.5459380455264594e-07,
"loss": 0.1737,
"step": 109400
},
{
"epoch": 8.922750977835724,
"grad_norm": 18.5339412689209,
"learning_rate": 3.4935221498017316e-07,
"loss": 0.1581,
"step": 109500
},
{
"epoch": 8.93089960886571,
"grad_norm": 21.965267181396484,
"learning_rate": 3.4414825219948153e-07,
"loss": 0.1597,
"step": 109600
},
{
"epoch": 8.939048239895698,
"grad_norm": 13.353527069091797,
"learning_rate": 3.3898195831404354e-07,
"loss": 0.1747,
"step": 109700
},
{
"epoch": 8.947196870925685,
"grad_norm": 7.977973461151123,
"learning_rate": 3.3385337512256863e-07,
"loss": 0.1562,
"step": 109800
},
{
"epoch": 8.955345501955671,
"grad_norm": 9.263310432434082,
"learning_rate": 3.287625441186576e-07,
"loss": 0.1772,
"step": 109900
},
{
"epoch": 8.963494132985659,
"grad_norm": 13.787714958190918,
"learning_rate": 3.2370950649047383e-07,
"loss": 0.1976,
"step": 110000
},
{
"epoch": 8.971642764015645,
"grad_norm": 20.066761016845703,
"learning_rate": 3.1869430312040816e-07,
"loss": 0.1596,
"step": 110100
},
{
"epoch": 8.979791395045632,
"grad_norm": 20.64689826965332,
"learning_rate": 3.137169745847435e-07,
"loss": 0.1704,
"step": 110200
},
{
"epoch": 8.98794002607562,
"grad_norm": 46.617713928222656,
"learning_rate": 3.08777561153335e-07,
"loss": 0.1889,
"step": 110300
},
{
"epoch": 8.996088657105606,
"grad_norm": 14.401327133178711,
"learning_rate": 3.0387610278927725e-07,
"loss": 0.1702,
"step": 110400
},
{
"epoch": 9.0,
"eval_accuracy": 0.8196787148594378,
"eval_loss": 0.7465346455574036,
"eval_runtime": 7.168,
"eval_samples_per_second": 347.378,
"eval_steps_per_second": 43.527,
"step": 110448
},
{
"epoch": 9.004237288135593,
"grad_norm": 15.593995094299316,
"learning_rate": 2.990126391485848e-07,
"loss": 0.1722,
"step": 110500
},
{
"epoch": 9.01238591916558,
"grad_norm": 4.0746636390686035,
"learning_rate": 2.941872095798698e-07,
"loss": 0.1346,
"step": 110600
},
{
"epoch": 9.020534550195567,
"grad_norm": 6.78621768951416,
"learning_rate": 2.893998531240222e-07,
"loss": 0.1819,
"step": 110700
},
{
"epoch": 9.028683181225555,
"grad_norm": 16.810945510864258,
"learning_rate": 2.8465060851389725e-07,
"loss": 0.152,
"step": 110800
},
{
"epoch": 9.03683181225554,
"grad_norm": 2.5170655250549316,
"learning_rate": 2.7993951417400025e-07,
"loss": 0.1737,
"step": 110900
},
{
"epoch": 9.044980443285528,
"grad_norm": 5.630674362182617,
"learning_rate": 2.752666082201727e-07,
"loss": 0.1703,
"step": 111000
},
{
"epoch": 9.053129074315516,
"grad_norm": 29.249120712280273,
"learning_rate": 2.7063192845929286e-07,
"loss": 0.1648,
"step": 111100
},
{
"epoch": 9.061277705345502,
"grad_norm": 7.27542781829834,
"learning_rate": 2.660355123889585e-07,
"loss": 0.1483,
"step": 111200
},
{
"epoch": 9.06942633637549,
"grad_norm": 27.242809295654297,
"learning_rate": 2.614773971971929e-07,
"loss": 0.1693,
"step": 111300
},
{
"epoch": 9.077574967405475,
"grad_norm": 15.899724006652832,
"learning_rate": 2.5695761976213704e-07,
"loss": 0.1562,
"step": 111400
},
{
"epoch": 9.085723598435463,
"grad_norm": 20.975248336791992,
"learning_rate": 2.5247621665175636e-07,
"loss": 0.1558,
"step": 111500
},
{
"epoch": 9.09387222946545,
"grad_norm": 17.303001403808594,
"learning_rate": 2.4803322412354227e-07,
"loss": 0.1594,
"step": 111600
},
{
"epoch": 9.102020860495436,
"grad_norm": 14.3364839553833,
"learning_rate": 2.436286781242192e-07,
"loss": 0.1558,
"step": 111700
},
{
"epoch": 9.110169491525424,
"grad_norm": 18.47357940673828,
"learning_rate": 2.3926261428945386e-07,
"loss": 0.1713,
"step": 111800
},
{
"epoch": 9.118318122555412,
"grad_norm": 2.021436929702759,
"learning_rate": 2.3493506794356745e-07,
"loss": 0.1577,
"step": 111900
},
{
"epoch": 9.126466753585397,
"grad_norm": 4.512004852294922,
"learning_rate": 2.3064607409924888e-07,
"loss": 0.1552,
"step": 112000
},
{
"epoch": 9.134615384615385,
"grad_norm": 21.13969612121582,
"learning_rate": 2.2639566745727203e-07,
"loss": 0.1504,
"step": 112100
},
{
"epoch": 9.142764015645371,
"grad_norm": 17.030675888061523,
"learning_rate": 2.2218388240621558e-07,
"loss": 0.1785,
"step": 112200
},
{
"epoch": 9.150912646675359,
"grad_norm": 11.586610794067383,
"learning_rate": 2.1801075302218423e-07,
"loss": 0.174,
"step": 112300
},
{
"epoch": 9.159061277705346,
"grad_norm": 19.795167922973633,
"learning_rate": 2.1387631306853174e-07,
"loss": 0.1672,
"step": 112400
},
{
"epoch": 9.167209908735332,
"grad_norm": 23.909713745117188,
"learning_rate": 2.0978059599559065e-07,
"loss": 0.1684,
"step": 112500
},
{
"epoch": 9.17535853976532,
"grad_norm": 5.545074939727783,
"learning_rate": 2.057236349403985e-07,
"loss": 0.165,
"step": 112600
},
{
"epoch": 9.183507170795306,
"grad_norm": 12.588091850280762,
"learning_rate": 2.0170546272643256e-07,
"loss": 0.167,
"step": 112700
},
{
"epoch": 9.191655801825293,
"grad_norm": 12.73204517364502,
"learning_rate": 1.9772611186334168e-07,
"loss": 0.1535,
"step": 112800
},
{
"epoch": 9.19980443285528,
"grad_norm": 11.712594985961914,
"learning_rate": 1.9378561454668598e-07,
"loss": 0.1629,
"step": 112900
},
{
"epoch": 9.207953063885267,
"grad_norm": 6.922073841094971,
"learning_rate": 1.8988400265767316e-07,
"loss": 0.1544,
"step": 113000
},
{
"epoch": 9.216101694915254,
"grad_norm": 14.258295059204102,
"learning_rate": 1.8602130776290362e-07,
"loss": 0.1575,
"step": 113100
},
{
"epoch": 9.224250325945242,
"grad_norm": 20.113460540771484,
"learning_rate": 1.8219756111411357e-07,
"loss": 0.151,
"step": 113200
},
{
"epoch": 9.232398956975228,
"grad_norm": 9.496116638183594,
"learning_rate": 1.784127936479213e-07,
"loss": 0.1791,
"step": 113300
},
{
"epoch": 9.240547588005215,
"grad_norm": 7.643208026885986,
"learning_rate": 1.7466703598557898e-07,
"loss": 0.1752,
"step": 113400
},
{
"epoch": 9.248696219035201,
"grad_norm": 21.511184692382812,
"learning_rate": 1.709603184327241e-07,
"loss": 0.1538,
"step": 113500
},
{
"epoch": 9.256844850065189,
"grad_norm": 18.147607803344727,
"learning_rate": 1.6729267097913338e-07,
"loss": 0.1606,
"step": 113600
},
{
"epoch": 9.264993481095177,
"grad_norm": 13.48155689239502,
"learning_rate": 1.6366412329848035e-07,
"loss": 0.1661,
"step": 113700
},
{
"epoch": 9.273142112125162,
"grad_norm": 21.713895797729492,
"learning_rate": 1.6007470474809772e-07,
"loss": 0.157,
"step": 113800
},
{
"epoch": 9.28129074315515,
"grad_norm": 11.30298137664795,
"learning_rate": 1.565244443687347e-07,
"loss": 0.1802,
"step": 113900
},
{
"epoch": 9.289439374185136,
"grad_norm": 15.809433937072754,
"learning_rate": 1.5301337088432787e-07,
"loss": 0.1723,
"step": 114000
},
{
"epoch": 9.297588005215124,
"grad_norm": 8.747072219848633,
"learning_rate": 1.4954151270176686e-07,
"loss": 0.1616,
"step": 114100
},
{
"epoch": 9.305736636245111,
"grad_norm": 1.6549293994903564,
"learning_rate": 1.4610889791066008e-07,
"loss": 0.1732,
"step": 114200
},
{
"epoch": 9.313885267275097,
"grad_norm": 13.10067367553711,
"learning_rate": 1.4271555428311323e-07,
"loss": 0.1618,
"step": 114300
},
{
"epoch": 9.322033898305085,
"grad_norm": 13.006690979003906,
"learning_rate": 1.39361509273504e-07,
"loss": 0.1806,
"step": 114400
},
{
"epoch": 9.330182529335072,
"grad_norm": 23.973905563354492,
"learning_rate": 1.3604679001825605e-07,
"loss": 0.1678,
"step": 114500
},
{
"epoch": 9.338331160365058,
"grad_norm": 10.249641418457031,
"learning_rate": 1.3277142333562253e-07,
"loss": 0.1646,
"step": 114600
},
{
"epoch": 9.346479791395046,
"grad_norm": 30.132413864135742,
"learning_rate": 1.2953543572546968e-07,
"loss": 0.1635,
"step": 114700
},
{
"epoch": 9.354628422425032,
"grad_norm": 13.259139060974121,
"learning_rate": 1.2633885336906014e-07,
"loss": 0.172,
"step": 114800
},
{
"epoch": 9.36277705345502,
"grad_norm": 19.1724853515625,
"learning_rate": 1.2318170212884285e-07,
"loss": 0.1633,
"step": 114900
},
{
"epoch": 9.370925684485007,
"grad_norm": 14.311450004577637,
"learning_rate": 1.2006400754824177e-07,
"loss": 0.1747,
"step": 115000
},
{
"epoch": 9.379074315514993,
"grad_norm": 8.39560317993164,
"learning_rate": 1.1698579485145134e-07,
"loss": 0.1441,
"step": 115100
},
{
"epoch": 9.38722294654498,
"grad_norm": 10.600957870483398,
"learning_rate": 1.1394708894323314e-07,
"loss": 0.1923,
"step": 115200
},
{
"epoch": 9.395371577574968,
"grad_norm": 9.45894718170166,
"learning_rate": 1.1094791440871e-07,
"loss": 0.1476,
"step": 115300
},
{
"epoch": 9.403520208604954,
"grad_norm": 6.497547149658203,
"learning_rate": 1.079882955131728e-07,
"loss": 0.1621,
"step": 115400
},
{
"epoch": 9.411668839634942,
"grad_norm": 5.700404644012451,
"learning_rate": 1.0506825620187954e-07,
"loss": 0.1569,
"step": 115500
},
{
"epoch": 9.419817470664928,
"grad_norm": 5.055960655212402,
"learning_rate": 1.0218782009986494e-07,
"loss": 0.1439,
"step": 115600
},
{
"epoch": 9.427966101694915,
"grad_norm": 0.8036000728607178,
"learning_rate": 9.93470105117461e-08,
"loss": 0.163,
"step": 115700
},
{
"epoch": 9.436114732724903,
"grad_norm": 21.1984920501709,
"learning_rate": 9.654585042153663e-08,
"loss": 0.153,
"step": 115800
},
{
"epoch": 9.444263363754889,
"grad_norm": 3.3010308742523193,
"learning_rate": 9.378436249245892e-08,
"loss": 0.1584,
"step": 115900
},
{
"epoch": 9.452411994784876,
"grad_norm": 9.636171340942383,
"learning_rate": 9.106256906676159e-08,
"loss": 0.1765,
"step": 116000
},
{
"epoch": 9.460560625814864,
"grad_norm": 1.7043323516845703,
"learning_rate": 8.838049216554123e-08,
"loss": 0.1604,
"step": 116100
},
{
"epoch": 9.46870925684485,
"grad_norm": 9.73293399810791,
"learning_rate": 8.573815348855818e-08,
"loss": 0.1703,
"step": 116200
},
{
"epoch": 9.476857887874838,
"grad_norm": 7.777896404266357,
"learning_rate": 8.313557441406606e-08,
"loss": 0.1632,
"step": 116300
},
{
"epoch": 9.485006518904823,
"grad_norm": 17.46415901184082,
"learning_rate": 8.057277599863744e-08,
"loss": 0.1536,
"step": 116400
},
{
"epoch": 9.493155149934811,
"grad_norm": 10.912395477294922,
"learning_rate": 7.804977897699295e-08,
"loss": 0.1611,
"step": 116500
},
{
"epoch": 9.501303780964799,
"grad_norm": 12.858296394348145,
"learning_rate": 7.556660376183301e-08,
"loss": 0.1458,
"step": 116600
},
{
"epoch": 9.509452411994785,
"grad_norm": 7.577301025390625,
"learning_rate": 7.312327044367463e-08,
"loss": 0.1408,
"step": 116700
},
{
"epoch": 9.517601043024772,
"grad_norm": 13.470318794250488,
"learning_rate": 7.071979879068769e-08,
"loss": 0.1568,
"step": 116800
},
{
"epoch": 9.525749674054758,
"grad_norm": 16.199295043945312,
"learning_rate": 6.835620824853451e-08,
"loss": 0.161,
"step": 116900
},
{
"epoch": 9.533898305084746,
"grad_norm": 15.154216766357422,
"learning_rate": 6.603251794021381e-08,
"loss": 0.1783,
"step": 117000
},
{
"epoch": 9.542046936114733,
"grad_norm": 9.926989555358887,
"learning_rate": 6.374874666590369e-08,
"loss": 0.149,
"step": 117100
},
{
"epoch": 9.55019556714472,
"grad_norm": 14.719680786132812,
"learning_rate": 6.15049129028128e-08,
"loss": 0.1459,
"step": 117200
},
{
"epoch": 9.558344198174707,
"grad_norm": 23.45909881591797,
"learning_rate": 5.93010348050288e-08,
"loss": 0.1624,
"step": 117300
},
{
"epoch": 9.566492829204694,
"grad_norm": 22.256080627441406,
"learning_rate": 5.7137130203370194e-08,
"loss": 0.1536,
"step": 117400
},
{
"epoch": 9.57464146023468,
"grad_norm": 5.540316581726074,
"learning_rate": 5.501321660524583e-08,
"loss": 0.1541,
"step": 117500
},
{
"epoch": 9.582790091264668,
"grad_norm": 3.839772939682007,
"learning_rate": 5.292931119451006e-08,
"loss": 0.1577,
"step": 117600
},
{
"epoch": 9.590938722294654,
"grad_norm": 4.665050029754639,
"learning_rate": 5.088543083132502e-08,
"loss": 0.1547,
"step": 117700
},
{
"epoch": 9.599087353324641,
"grad_norm": 18.975759506225586,
"learning_rate": 4.888159205202303e-08,
"loss": 0.1652,
"step": 117800
},
{
"epoch": 9.607235984354629,
"grad_norm": 13.844809532165527,
"learning_rate": 4.691781106897497e-08,
"loss": 0.1528,
"step": 117900
},
{
"epoch": 9.615384615384615,
"grad_norm": 5.203334331512451,
"learning_rate": 4.499410377045765e-08,
"loss": 0.1484,
"step": 118000
},
{
"epoch": 9.623533246414603,
"grad_norm": 17.595108032226562,
"learning_rate": 4.311048572052501e-08,
"loss": 0.1547,
"step": 118100
},
{
"epoch": 9.631681877444588,
"grad_norm": 10.652242660522461,
"learning_rate": 4.1266972158883204e-08,
"loss": 0.1658,
"step": 118200
},
{
"epoch": 9.639830508474576,
"grad_norm": 15.711381912231445,
"learning_rate": 3.9463578000765724e-08,
"loss": 0.1493,
"step": 118300
},
{
"epoch": 9.647979139504564,
"grad_norm": 18.064918518066406,
"learning_rate": 3.7700317836814605e-08,
"loss": 0.1558,
"step": 118400
},
{
"epoch": 9.65612777053455,
"grad_norm": 11.699357986450195,
"learning_rate": 3.5977205932962164e-08,
"loss": 0.1465,
"step": 118500
},
{
"epoch": 9.664276401564537,
"grad_norm": 9.775052070617676,
"learning_rate": 3.429425623031335e-08,
"loss": 0.1456,
"step": 118600
},
{
"epoch": 9.672425032594525,
"grad_norm": 19.886598587036133,
"learning_rate": 3.265148234503579e-08,
"loss": 0.165,
"step": 118700
},
{
"epoch": 9.68057366362451,
"grad_norm": 13.31386661529541,
"learning_rate": 3.104889756824825e-08,
"loss": 0.1682,
"step": 118800
},
{
"epoch": 9.688722294654498,
"grad_norm": 16.752405166625977,
"learning_rate": 2.9486514865912364e-08,
"loss": 0.1498,
"step": 118900
},
{
"epoch": 9.696870925684484,
"grad_norm": 12.920425415039062,
"learning_rate": 2.7964346878729952e-08,
"loss": 0.1573,
"step": 119000
},
{
"epoch": 9.705019556714472,
"grad_norm": 5.0780110359191895,
"learning_rate": 2.64824059220381e-08,
"loss": 0.159,
"step": 119100
},
{
"epoch": 9.71316818774446,
"grad_norm": 13.475509643554688,
"learning_rate": 2.504070398571201e-08,
"loss": 0.1997,
"step": 119200
},
{
"epoch": 9.721316818774445,
"grad_norm": 20.931211471557617,
"learning_rate": 2.3639252734065644e-08,
"loss": 0.1957,
"step": 119300
},
{
"epoch": 9.729465449804433,
"grad_norm": 20.29063606262207,
"learning_rate": 2.227806350575956e-08,
"loss": 0.1388,
"step": 119400
},
{
"epoch": 9.737614080834419,
"grad_norm": 0.7664732336997986,
"learning_rate": 2.0957147313707127e-08,
"loss": 0.166,
"step": 119500
},
{
"epoch": 9.745762711864407,
"grad_norm": 18.868257522583008,
"learning_rate": 1.9676514844987338e-08,
"loss": 0.1618,
"step": 119600
},
{
"epoch": 9.753911342894394,
"grad_norm": 15.741533279418945,
"learning_rate": 1.8436176460756572e-08,
"loss": 0.1589,
"step": 119700
},
{
"epoch": 9.76205997392438,
"grad_norm": 11.955362319946289,
"learning_rate": 1.723614219616754e-08,
"loss": 0.168,
"step": 119800
},
{
"epoch": 9.770208604954368,
"grad_norm": 26.171483993530273,
"learning_rate": 1.6076421760283234e-08,
"loss": 0.157,
"step": 119900
},
{
"epoch": 9.778357235984355,
"grad_norm": 14.887884140014648,
"learning_rate": 1.4957024536003674e-08,
"loss": 0.1383,
"step": 120000
},
{
"epoch": 9.786505867014341,
"grad_norm": 9.518312454223633,
"learning_rate": 1.3877959579985944e-08,
"loss": 0.1385,
"step": 120100
},
{
"epoch": 9.794654498044329,
"grad_norm": 18.155826568603516,
"learning_rate": 1.283923562257483e-08,
"loss": 0.1623,
"step": 120200
},
{
"epoch": 9.802803129074315,
"grad_norm": 17.2945613861084,
"learning_rate": 1.1840861067727306e-08,
"loss": 0.1551,
"step": 120300
},
{
"epoch": 9.810951760104302,
"grad_norm": 24.658214569091797,
"learning_rate": 1.0882843992949255e-08,
"loss": 0.1499,
"step": 120400
},
{
"epoch": 9.81910039113429,
"grad_norm": 6.880736351013184,
"learning_rate": 9.9651921492272e-09,
"loss": 0.1501,
"step": 120500
},
{
"epoch": 9.827249022164276,
"grad_norm": 25.12505531311035,
"learning_rate": 9.087912960967227e-09,
"loss": 0.1571,
"step": 120600
},
{
"epoch": 9.835397653194264,
"grad_norm": 28.05438995361328,
"learning_rate": 8.251013525932273e-09,
"loss": 0.1637,
"step": 120700
},
{
"epoch": 9.843546284224251,
"grad_norm": 10.58689022064209,
"learning_rate": 7.454500615188264e-09,
"loss": 0.1509,
"step": 120800
},
{
"epoch": 9.851694915254237,
"grad_norm": 24.10919761657715,
"learning_rate": 6.698380673048066e-09,
"loss": 0.1691,
"step": 120900
},
{
"epoch": 9.859843546284225,
"grad_norm": 0.43672606348991394,
"learning_rate": 5.982659817017067e-09,
"loss": 0.1746,
"step": 121000
},
{
"epoch": 9.86799217731421,
"grad_norm": 12.899723052978516,
"learning_rate": 5.307343837747115e-09,
"loss": 0.1497,
"step": 121100
},
{
"epoch": 9.876140808344198,
"grad_norm": 18.292190551757812,
"learning_rate": 4.672438198987661e-09,
"loss": 0.1594,
"step": 121200
},
{
"epoch": 9.884289439374186,
"grad_norm": 18.396923065185547,
"learning_rate": 4.077948037541357e-09,
"loss": 0.1574,
"step": 121300
},
{
"epoch": 9.892438070404172,
"grad_norm": 22.605993270874023,
"learning_rate": 3.5238781632240813e-09,
"loss": 0.1642,
"step": 121400
},
{
"epoch": 9.90058670143416,
"grad_norm": 23.427574157714844,
"learning_rate": 3.010233058824419e-09,
"loss": 0.1765,
"step": 121500
},
{
"epoch": 9.908735332464147,
"grad_norm": 3.2891268730163574,
"learning_rate": 2.5370168800681325e-09,
"loss": 0.1743,
"step": 121600
},
{
"epoch": 9.916883963494133,
"grad_norm": 19.58220100402832,
"learning_rate": 2.1042334555848585e-09,
"loss": 0.1596,
"step": 121700
},
{
"epoch": 9.92503259452412,
"grad_norm": 15.260977745056152,
"learning_rate": 1.711886286876463e-09,
"loss": 0.1486,
"step": 121800
},
{
"epoch": 9.933181225554106,
"grad_norm": 5.988215446472168,
"learning_rate": 1.3599785482881767e-09,
"loss": 0.1518,
"step": 121900
},
{
"epoch": 9.941329856584094,
"grad_norm": 5.850574970245361,
"learning_rate": 1.0485130869858362e-09,
"loss": 0.1588,
"step": 122000
},
{
"epoch": 9.949478487614082,
"grad_norm": 11.288055419921875,
"learning_rate": 7.774924229281278e-10,
"loss": 0.1734,
"step": 122100
},
{
"epoch": 9.957627118644067,
"grad_norm": 10.142143249511719,
"learning_rate": 5.469187488510441e-10,
"loss": 0.173,
"step": 122200
},
{
"epoch": 9.965775749674055,
"grad_norm": 14.460721015930176,
"learning_rate": 3.5679393024623533e-10,
"loss": 0.1603,
"step": 122300
},
{
"epoch": 9.973924380704041,
"grad_norm": 23.698572158813477,
"learning_rate": 2.071195053482411e-10,
"loss": 0.1616,
"step": 122400
},
{
"epoch": 9.982073011734029,
"grad_norm": 9.158120155334473,
"learning_rate": 9.789668512116823e-11,
"loss": 0.1702,
"step": 122500
},
{
"epoch": 9.990221642764016,
"grad_norm": 32.79683303833008,
"learning_rate": 2.912635325036384e-11,
"loss": 0.1718,
"step": 122600
},
{
"epoch": 9.998370273794002,
"grad_norm": 17.385313034057617,
"learning_rate": 8.090661318682636e-13,
"loss": 0.1656,
"step": 122700
},
{
"epoch": 10.0,
"eval_accuracy": 0.8172690763052208,
"eval_loss": 0.7754501700401306,
"eval_runtime": 7.1115,
"eval_samples_per_second": 350.138,
"eval_steps_per_second": 43.873,
"step": 122720
},
{
"epoch": 10.0,
"step": 122720,
"total_flos": 1.617427903829713e+17,
"train_loss": 0.3309724763735177,
"train_runtime": 41426.0671,
"train_samples_per_second": 94.796,
"train_steps_per_second": 2.962
}
],
"logging_steps": 100,
"max_steps": 122720,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.617427903829713e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}