arco-mini-run-75k / checkpoint-4218 /trainer_state.json
appvoid's picture
Upload folder using huggingface_hub
11861e3 verified
raw
history blame
73.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998222222222222,
"eval_steps": 5000,
"global_step": 4218,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0023703703703703703,
"grad_norm": 0.542885959148407,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.4997,
"step": 10
},
{
"epoch": 0.004740740740740741,
"grad_norm": 0.44538265466690063,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.4925,
"step": 20
},
{
"epoch": 0.0071111111111111115,
"grad_norm": 0.4945567548274994,
"learning_rate": 3e-06,
"loss": 1.4985,
"step": 30
},
{
"epoch": 0.009481481481481481,
"grad_norm": 0.4560663402080536,
"learning_rate": 4.000000000000001e-06,
"loss": 1.4031,
"step": 40
},
{
"epoch": 0.011851851851851851,
"grad_norm": 0.4691298007965088,
"learning_rate": 5e-06,
"loss": 1.4175,
"step": 50
},
{
"epoch": 0.014222222222222223,
"grad_norm": 0.44202300906181335,
"learning_rate": 6e-06,
"loss": 1.4337,
"step": 60
},
{
"epoch": 0.016592592592592593,
"grad_norm": 0.5069476366043091,
"learning_rate": 7.000000000000001e-06,
"loss": 1.4629,
"step": 70
},
{
"epoch": 0.018962962962962963,
"grad_norm": 0.4806945025920868,
"learning_rate": 8.000000000000001e-06,
"loss": 1.5706,
"step": 80
},
{
"epoch": 0.021333333333333333,
"grad_norm": 0.5269841551780701,
"learning_rate": 9e-06,
"loss": 1.5625,
"step": 90
},
{
"epoch": 0.023703703703703703,
"grad_norm": 0.37831586599349976,
"learning_rate": 1e-05,
"loss": 1.4083,
"step": 100
},
{
"epoch": 0.026074074074074072,
"grad_norm": 0.442981094121933,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.3799,
"step": 110
},
{
"epoch": 0.028444444444444446,
"grad_norm": 0.47675761580467224,
"learning_rate": 1.2e-05,
"loss": 1.5356,
"step": 120
},
{
"epoch": 0.030814814814814816,
"grad_norm": 0.5033993721008301,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.504,
"step": 130
},
{
"epoch": 0.033185185185185186,
"grad_norm": 0.4628155827522278,
"learning_rate": 1.4000000000000001e-05,
"loss": 1.3474,
"step": 140
},
{
"epoch": 0.035555555555555556,
"grad_norm": 0.41637757420539856,
"learning_rate": 1.5e-05,
"loss": 1.4352,
"step": 150
},
{
"epoch": 0.037925925925925925,
"grad_norm": 0.5029244422912598,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.3224,
"step": 160
},
{
"epoch": 0.040296296296296295,
"grad_norm": 0.6434731483459473,
"learning_rate": 1.7000000000000003e-05,
"loss": 1.5611,
"step": 170
},
{
"epoch": 0.042666666666666665,
"grad_norm": 0.42424359917640686,
"learning_rate": 1.8e-05,
"loss": 1.4191,
"step": 180
},
{
"epoch": 0.045037037037037035,
"grad_norm": 0.4729703962802887,
"learning_rate": 1.9e-05,
"loss": 1.3284,
"step": 190
},
{
"epoch": 0.047407407407407405,
"grad_norm": 0.48806190490722656,
"learning_rate": 2e-05,
"loss": 1.48,
"step": 200
},
{
"epoch": 0.049777777777777775,
"grad_norm": 0.4987320303916931,
"learning_rate": 2.1e-05,
"loss": 1.4535,
"step": 210
},
{
"epoch": 0.052148148148148145,
"grad_norm": 0.46912866830825806,
"learning_rate": 2.2000000000000003e-05,
"loss": 1.4846,
"step": 220
},
{
"epoch": 0.05451851851851852,
"grad_norm": 0.4369196593761444,
"learning_rate": 2.3000000000000003e-05,
"loss": 1.5564,
"step": 230
},
{
"epoch": 0.05688888888888889,
"grad_norm": 0.48074963688850403,
"learning_rate": 2.4e-05,
"loss": 1.3142,
"step": 240
},
{
"epoch": 0.05925925925925926,
"grad_norm": 0.450253427028656,
"learning_rate": 2.5e-05,
"loss": 1.3877,
"step": 250
},
{
"epoch": 0.06162962962962963,
"grad_norm": 0.4517356753349304,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.3969,
"step": 260
},
{
"epoch": 0.064,
"grad_norm": 0.47781577706336975,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.5352,
"step": 270
},
{
"epoch": 0.06637037037037037,
"grad_norm": 0.5579633712768555,
"learning_rate": 2.8000000000000003e-05,
"loss": 1.5436,
"step": 280
},
{
"epoch": 0.06874074074074074,
"grad_norm": 0.4838034510612488,
"learning_rate": 2.9e-05,
"loss": 1.3564,
"step": 290
},
{
"epoch": 0.07111111111111111,
"grad_norm": 0.5685828328132629,
"learning_rate": 3e-05,
"loss": 1.4429,
"step": 300
},
{
"epoch": 0.07348148148148148,
"grad_norm": 0.5230541229248047,
"learning_rate": 3.1e-05,
"loss": 1.3933,
"step": 310
},
{
"epoch": 0.07585185185185185,
"grad_norm": 0.45525529980659485,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.4224,
"step": 320
},
{
"epoch": 0.07822222222222222,
"grad_norm": 0.47926583886146545,
"learning_rate": 3.3e-05,
"loss": 1.4466,
"step": 330
},
{
"epoch": 0.08059259259259259,
"grad_norm": 0.38689500093460083,
"learning_rate": 3.4000000000000007e-05,
"loss": 1.5105,
"step": 340
},
{
"epoch": 0.08296296296296296,
"grad_norm": 0.4488411843776703,
"learning_rate": 3.5e-05,
"loss": 1.3392,
"step": 350
},
{
"epoch": 0.08533333333333333,
"grad_norm": 0.5381152033805847,
"learning_rate": 3.6e-05,
"loss": 1.375,
"step": 360
},
{
"epoch": 0.0877037037037037,
"grad_norm": 0.5788478255271912,
"learning_rate": 3.7e-05,
"loss": 1.3522,
"step": 370
},
{
"epoch": 0.09007407407407407,
"grad_norm": 0.501133143901825,
"learning_rate": 3.8e-05,
"loss": 1.3294,
"step": 380
},
{
"epoch": 0.09244444444444444,
"grad_norm": 0.5300689935684204,
"learning_rate": 3.9000000000000006e-05,
"loss": 1.5623,
"step": 390
},
{
"epoch": 0.09481481481481481,
"grad_norm": 0.5409078001976013,
"learning_rate": 4e-05,
"loss": 1.4098,
"step": 400
},
{
"epoch": 0.09718518518518518,
"grad_norm": 0.5598166584968567,
"learning_rate": 4.1e-05,
"loss": 1.4104,
"step": 410
},
{
"epoch": 0.09955555555555555,
"grad_norm": 0.5656659603118896,
"learning_rate": 4.2e-05,
"loss": 1.3782,
"step": 420
},
{
"epoch": 0.10192592592592592,
"grad_norm": 0.5094364881515503,
"learning_rate": 4.3e-05,
"loss": 1.6233,
"step": 430
},
{
"epoch": 0.10429629629629629,
"grad_norm": 0.5540050268173218,
"learning_rate": 4.4000000000000006e-05,
"loss": 1.512,
"step": 440
},
{
"epoch": 0.10666666666666667,
"grad_norm": 0.5693063139915466,
"learning_rate": 4.5e-05,
"loss": 1.641,
"step": 450
},
{
"epoch": 0.10903703703703704,
"grad_norm": 0.5556958913803101,
"learning_rate": 4.600000000000001e-05,
"loss": 1.4532,
"step": 460
},
{
"epoch": 0.11140740740740741,
"grad_norm": 0.5052928924560547,
"learning_rate": 4.7e-05,
"loss": 1.4863,
"step": 470
},
{
"epoch": 0.11377777777777778,
"grad_norm": 0.5321051478385925,
"learning_rate": 4.8e-05,
"loss": 1.353,
"step": 480
},
{
"epoch": 0.11614814814814815,
"grad_norm": 0.658074140548706,
"learning_rate": 4.9e-05,
"loss": 1.563,
"step": 490
},
{
"epoch": 0.11851851851851852,
"grad_norm": 0.4817732870578766,
"learning_rate": 5e-05,
"loss": 1.5347,
"step": 500
},
{
"epoch": 0.12088888888888889,
"grad_norm": 0.6876205801963806,
"learning_rate": 4.9865519096288324e-05,
"loss": 1.6524,
"step": 510
},
{
"epoch": 0.12325925925925926,
"grad_norm": 0.5238626003265381,
"learning_rate": 4.973103819257665e-05,
"loss": 1.5766,
"step": 520
},
{
"epoch": 0.12562962962962962,
"grad_norm": 0.4588116705417633,
"learning_rate": 4.959655728886498e-05,
"loss": 1.4258,
"step": 530
},
{
"epoch": 0.128,
"grad_norm": 0.529692530632019,
"learning_rate": 4.946207638515331e-05,
"loss": 1.5574,
"step": 540
},
{
"epoch": 0.13037037037037036,
"grad_norm": 0.475524365901947,
"learning_rate": 4.932759548144163e-05,
"loss": 1.377,
"step": 550
},
{
"epoch": 0.13274074074074074,
"grad_norm": 0.48722413182258606,
"learning_rate": 4.919311457772996e-05,
"loss": 1.3156,
"step": 560
},
{
"epoch": 0.1351111111111111,
"grad_norm": 0.6309683322906494,
"learning_rate": 4.905863367401829e-05,
"loss": 1.5581,
"step": 570
},
{
"epoch": 0.13748148148148148,
"grad_norm": 0.5029247999191284,
"learning_rate": 4.892415277030662e-05,
"loss": 1.5968,
"step": 580
},
{
"epoch": 0.13985185185185184,
"grad_norm": 0.421310156583786,
"learning_rate": 4.878967186659494e-05,
"loss": 1.4881,
"step": 590
},
{
"epoch": 0.14222222222222222,
"grad_norm": 0.5082572102546692,
"learning_rate": 4.865519096288327e-05,
"loss": 1.6574,
"step": 600
},
{
"epoch": 0.1445925925925926,
"grad_norm": 0.5082793235778809,
"learning_rate": 4.85207100591716e-05,
"loss": 1.3606,
"step": 610
},
{
"epoch": 0.14696296296296296,
"grad_norm": 0.5169036388397217,
"learning_rate": 4.838622915545993e-05,
"loss": 1.4457,
"step": 620
},
{
"epoch": 0.14933333333333335,
"grad_norm": 0.5509771704673767,
"learning_rate": 4.825174825174825e-05,
"loss": 1.4754,
"step": 630
},
{
"epoch": 0.1517037037037037,
"grad_norm": 0.630851149559021,
"learning_rate": 4.811726734803658e-05,
"loss": 1.5453,
"step": 640
},
{
"epoch": 0.15407407407407409,
"grad_norm": 0.5343595147132874,
"learning_rate": 4.798278644432491e-05,
"loss": 1.4822,
"step": 650
},
{
"epoch": 0.15644444444444444,
"grad_norm": 0.5070016980171204,
"learning_rate": 4.7848305540613237e-05,
"loss": 1.3835,
"step": 660
},
{
"epoch": 0.15881481481481483,
"grad_norm": 0.6097332835197449,
"learning_rate": 4.771382463690156e-05,
"loss": 1.546,
"step": 670
},
{
"epoch": 0.16118518518518518,
"grad_norm": 0.5894319415092468,
"learning_rate": 4.757934373318989e-05,
"loss": 1.3605,
"step": 680
},
{
"epoch": 0.16355555555555557,
"grad_norm": 0.4879942238330841,
"learning_rate": 4.7444862829478216e-05,
"loss": 1.4391,
"step": 690
},
{
"epoch": 0.16592592592592592,
"grad_norm": 0.49390801787376404,
"learning_rate": 4.7310381925766545e-05,
"loss": 1.2228,
"step": 700
},
{
"epoch": 0.1682962962962963,
"grad_norm": 0.6193021535873413,
"learning_rate": 4.717590102205487e-05,
"loss": 1.7474,
"step": 710
},
{
"epoch": 0.17066666666666666,
"grad_norm": 0.4410654604434967,
"learning_rate": 4.7041420118343196e-05,
"loss": 1.4255,
"step": 720
},
{
"epoch": 0.17303703703703704,
"grad_norm": 0.5690642595291138,
"learning_rate": 4.6906939214631525e-05,
"loss": 1.4252,
"step": 730
},
{
"epoch": 0.1754074074074074,
"grad_norm": 0.4561966359615326,
"learning_rate": 4.6772458310919854e-05,
"loss": 1.4214,
"step": 740
},
{
"epoch": 0.17777777777777778,
"grad_norm": 0.5871363282203674,
"learning_rate": 4.6637977407208176e-05,
"loss": 1.5272,
"step": 750
},
{
"epoch": 0.18014814814814814,
"grad_norm": 0.5293774604797363,
"learning_rate": 4.6503496503496505e-05,
"loss": 1.5507,
"step": 760
},
{
"epoch": 0.18251851851851852,
"grad_norm": 0.483826220035553,
"learning_rate": 4.636901559978483e-05,
"loss": 1.4058,
"step": 770
},
{
"epoch": 0.18488888888888888,
"grad_norm": 0.551902174949646,
"learning_rate": 4.623453469607316e-05,
"loss": 1.3707,
"step": 780
},
{
"epoch": 0.18725925925925926,
"grad_norm": 0.5492023825645447,
"learning_rate": 4.6100053792361484e-05,
"loss": 1.4901,
"step": 790
},
{
"epoch": 0.18962962962962962,
"grad_norm": 0.5409772396087646,
"learning_rate": 4.596557288864981e-05,
"loss": 1.6356,
"step": 800
},
{
"epoch": 0.192,
"grad_norm": 0.5469648838043213,
"learning_rate": 4.583109198493814e-05,
"loss": 1.4574,
"step": 810
},
{
"epoch": 0.19437037037037036,
"grad_norm": 0.5523713827133179,
"learning_rate": 4.569661108122647e-05,
"loss": 1.5304,
"step": 820
},
{
"epoch": 0.19674074074074074,
"grad_norm": 0.4884456992149353,
"learning_rate": 4.556213017751479e-05,
"loss": 1.4173,
"step": 830
},
{
"epoch": 0.1991111111111111,
"grad_norm": 0.5865374803543091,
"learning_rate": 4.542764927380312e-05,
"loss": 1.4178,
"step": 840
},
{
"epoch": 0.20148148148148148,
"grad_norm": 0.5571750402450562,
"learning_rate": 4.529316837009145e-05,
"loss": 1.5356,
"step": 850
},
{
"epoch": 0.20385185185185184,
"grad_norm": 0.567616879940033,
"learning_rate": 4.515868746637978e-05,
"loss": 1.4496,
"step": 860
},
{
"epoch": 0.20622222222222222,
"grad_norm": 0.5077497959136963,
"learning_rate": 4.50242065626681e-05,
"loss": 1.4757,
"step": 870
},
{
"epoch": 0.20859259259259258,
"grad_norm": 0.5118802189826965,
"learning_rate": 4.488972565895643e-05,
"loss": 1.3845,
"step": 880
},
{
"epoch": 0.21096296296296296,
"grad_norm": 0.43292248249053955,
"learning_rate": 4.475524475524476e-05,
"loss": 1.421,
"step": 890
},
{
"epoch": 0.21333333333333335,
"grad_norm": 0.5365243554115295,
"learning_rate": 4.462076385153308e-05,
"loss": 1.4586,
"step": 900
},
{
"epoch": 0.2157037037037037,
"grad_norm": 0.4912022054195404,
"learning_rate": 4.448628294782141e-05,
"loss": 1.5385,
"step": 910
},
{
"epoch": 0.2180740740740741,
"grad_norm": 0.5855193734169006,
"learning_rate": 4.435180204410974e-05,
"loss": 1.5718,
"step": 920
},
{
"epoch": 0.22044444444444444,
"grad_norm": 0.5224360227584839,
"learning_rate": 4.421732114039807e-05,
"loss": 1.3853,
"step": 930
},
{
"epoch": 0.22281481481481483,
"grad_norm": 0.4283509850502014,
"learning_rate": 4.408284023668639e-05,
"loss": 1.3758,
"step": 940
},
{
"epoch": 0.22518518518518518,
"grad_norm": 0.44806018471717834,
"learning_rate": 4.394835933297472e-05,
"loss": 1.4089,
"step": 950
},
{
"epoch": 0.22755555555555557,
"grad_norm": 0.4234403967857361,
"learning_rate": 4.381387842926305e-05,
"loss": 1.321,
"step": 960
},
{
"epoch": 0.22992592592592592,
"grad_norm": 0.45570847392082214,
"learning_rate": 4.3679397525551376e-05,
"loss": 1.384,
"step": 970
},
{
"epoch": 0.2322962962962963,
"grad_norm": 0.6098482608795166,
"learning_rate": 4.35449166218397e-05,
"loss": 1.5165,
"step": 980
},
{
"epoch": 0.23466666666666666,
"grad_norm": 0.47981974482536316,
"learning_rate": 4.341043571812803e-05,
"loss": 1.4827,
"step": 990
},
{
"epoch": 0.23703703703703705,
"grad_norm": 0.567845344543457,
"learning_rate": 4.3275954814416356e-05,
"loss": 1.4494,
"step": 1000
},
{
"epoch": 0.2394074074074074,
"grad_norm": 0.5508958697319031,
"learning_rate": 4.3141473910704685e-05,
"loss": 1.5681,
"step": 1010
},
{
"epoch": 0.24177777777777779,
"grad_norm": 0.6119508743286133,
"learning_rate": 4.300699300699301e-05,
"loss": 1.4522,
"step": 1020
},
{
"epoch": 0.24414814814814814,
"grad_norm": 0.654909074306488,
"learning_rate": 4.2872512103281336e-05,
"loss": 1.6317,
"step": 1030
},
{
"epoch": 0.24651851851851853,
"grad_norm": 0.5818801522254944,
"learning_rate": 4.2738031199569664e-05,
"loss": 1.4917,
"step": 1040
},
{
"epoch": 0.24888888888888888,
"grad_norm": 0.5295186638832092,
"learning_rate": 4.260355029585799e-05,
"loss": 1.6393,
"step": 1050
},
{
"epoch": 0.25125925925925924,
"grad_norm": 0.5558478832244873,
"learning_rate": 4.2469069392146315e-05,
"loss": 1.5607,
"step": 1060
},
{
"epoch": 0.25362962962962965,
"grad_norm": 0.5266067385673523,
"learning_rate": 4.2334588488434644e-05,
"loss": 1.4366,
"step": 1070
},
{
"epoch": 0.256,
"grad_norm": 0.4949641823768616,
"learning_rate": 4.220010758472297e-05,
"loss": 1.4392,
"step": 1080
},
{
"epoch": 0.25837037037037036,
"grad_norm": 0.48148399591445923,
"learning_rate": 4.20656266810113e-05,
"loss": 1.5301,
"step": 1090
},
{
"epoch": 0.2607407407407407,
"grad_norm": 0.5564059615135193,
"learning_rate": 4.1931145777299624e-05,
"loss": 1.3645,
"step": 1100
},
{
"epoch": 0.26311111111111113,
"grad_norm": 0.6419994235038757,
"learning_rate": 4.179666487358795e-05,
"loss": 1.4543,
"step": 1110
},
{
"epoch": 0.2654814814814815,
"grad_norm": 0.5205827355384827,
"learning_rate": 4.166218396987628e-05,
"loss": 1.5358,
"step": 1120
},
{
"epoch": 0.26785185185185184,
"grad_norm": 0.45430988073349,
"learning_rate": 4.152770306616461e-05,
"loss": 1.5483,
"step": 1130
},
{
"epoch": 0.2702222222222222,
"grad_norm": 0.5467645525932312,
"learning_rate": 4.139322216245293e-05,
"loss": 1.4702,
"step": 1140
},
{
"epoch": 0.2725925925925926,
"grad_norm": 0.47259363532066345,
"learning_rate": 4.125874125874126e-05,
"loss": 1.451,
"step": 1150
},
{
"epoch": 0.27496296296296296,
"grad_norm": 0.48951438069343567,
"learning_rate": 4.112426035502959e-05,
"loss": 1.4095,
"step": 1160
},
{
"epoch": 0.2773333333333333,
"grad_norm": 0.601701021194458,
"learning_rate": 4.098977945131792e-05,
"loss": 1.6848,
"step": 1170
},
{
"epoch": 0.2797037037037037,
"grad_norm": 0.5379857420921326,
"learning_rate": 4.085529854760624e-05,
"loss": 1.3598,
"step": 1180
},
{
"epoch": 0.2820740740740741,
"grad_norm": 0.6498066186904907,
"learning_rate": 4.072081764389457e-05,
"loss": 1.4655,
"step": 1190
},
{
"epoch": 0.28444444444444444,
"grad_norm": 0.5294344425201416,
"learning_rate": 4.05863367401829e-05,
"loss": 1.3851,
"step": 1200
},
{
"epoch": 0.2868148148148148,
"grad_norm": 0.5410310626029968,
"learning_rate": 4.045185583647123e-05,
"loss": 1.5091,
"step": 1210
},
{
"epoch": 0.2891851851851852,
"grad_norm": 0.5395278334617615,
"learning_rate": 4.031737493275955e-05,
"loss": 1.3487,
"step": 1220
},
{
"epoch": 0.29155555555555557,
"grad_norm": 0.637909471988678,
"learning_rate": 4.018289402904788e-05,
"loss": 1.5848,
"step": 1230
},
{
"epoch": 0.2939259259259259,
"grad_norm": 0.4254130721092224,
"learning_rate": 4.004841312533621e-05,
"loss": 1.4186,
"step": 1240
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.5799821019172668,
"learning_rate": 3.9913932221624536e-05,
"loss": 1.4289,
"step": 1250
},
{
"epoch": 0.2986666666666667,
"grad_norm": 0.43250229954719543,
"learning_rate": 3.977945131791286e-05,
"loss": 1.5458,
"step": 1260
},
{
"epoch": 0.30103703703703705,
"grad_norm": 0.4722803831100464,
"learning_rate": 3.964497041420119e-05,
"loss": 1.4716,
"step": 1270
},
{
"epoch": 0.3034074074074074,
"grad_norm": 0.5686700940132141,
"learning_rate": 3.9510489510489516e-05,
"loss": 1.6452,
"step": 1280
},
{
"epoch": 0.30577777777777776,
"grad_norm": 0.493028461933136,
"learning_rate": 3.9376008606777844e-05,
"loss": 1.4516,
"step": 1290
},
{
"epoch": 0.30814814814814817,
"grad_norm": 0.5100602507591248,
"learning_rate": 3.9241527703066166e-05,
"loss": 1.3366,
"step": 1300
},
{
"epoch": 0.3105185185185185,
"grad_norm": 0.6535771489143372,
"learning_rate": 3.910704679935449e-05,
"loss": 1.4312,
"step": 1310
},
{
"epoch": 0.3128888888888889,
"grad_norm": 0.48823079466819763,
"learning_rate": 3.8972565895642824e-05,
"loss": 1.4888,
"step": 1320
},
{
"epoch": 0.31525925925925924,
"grad_norm": 0.4459994435310364,
"learning_rate": 3.8838084991931146e-05,
"loss": 1.4637,
"step": 1330
},
{
"epoch": 0.31762962962962965,
"grad_norm": 0.5344628691673279,
"learning_rate": 3.8703604088219475e-05,
"loss": 1.6118,
"step": 1340
},
{
"epoch": 0.32,
"grad_norm": 0.44893643260002136,
"learning_rate": 3.85691231845078e-05,
"loss": 1.4521,
"step": 1350
},
{
"epoch": 0.32237037037037036,
"grad_norm": 0.4381811022758484,
"learning_rate": 3.8434642280796126e-05,
"loss": 1.5751,
"step": 1360
},
{
"epoch": 0.3247407407407407,
"grad_norm": 0.5791207551956177,
"learning_rate": 3.8300161377084455e-05,
"loss": 1.5563,
"step": 1370
},
{
"epoch": 0.32711111111111113,
"grad_norm": 0.5584151148796082,
"learning_rate": 3.8165680473372784e-05,
"loss": 1.3673,
"step": 1380
},
{
"epoch": 0.3294814814814815,
"grad_norm": 0.6110686659812927,
"learning_rate": 3.8031199569661106e-05,
"loss": 1.5949,
"step": 1390
},
{
"epoch": 0.33185185185185184,
"grad_norm": 0.5519852638244629,
"learning_rate": 3.7896718665949434e-05,
"loss": 1.5133,
"step": 1400
},
{
"epoch": 0.3342222222222222,
"grad_norm": 0.5732788443565369,
"learning_rate": 3.776223776223776e-05,
"loss": 1.4601,
"step": 1410
},
{
"epoch": 0.3365925925925926,
"grad_norm": 0.5789920091629028,
"learning_rate": 3.762775685852609e-05,
"loss": 1.6461,
"step": 1420
},
{
"epoch": 0.33896296296296297,
"grad_norm": 0.592776358127594,
"learning_rate": 3.7493275954814414e-05,
"loss": 1.5558,
"step": 1430
},
{
"epoch": 0.3413333333333333,
"grad_norm": 0.5435842871665955,
"learning_rate": 3.735879505110274e-05,
"loss": 1.5095,
"step": 1440
},
{
"epoch": 0.3437037037037037,
"grad_norm": 0.6474444270133972,
"learning_rate": 3.722431414739107e-05,
"loss": 1.429,
"step": 1450
},
{
"epoch": 0.3460740740740741,
"grad_norm": 0.4926964342594147,
"learning_rate": 3.70898332436794e-05,
"loss": 1.4543,
"step": 1460
},
{
"epoch": 0.34844444444444445,
"grad_norm": 0.5748719573020935,
"learning_rate": 3.695535233996772e-05,
"loss": 1.5844,
"step": 1470
},
{
"epoch": 0.3508148148148148,
"grad_norm": 0.5535377264022827,
"learning_rate": 3.682087143625605e-05,
"loss": 1.5348,
"step": 1480
},
{
"epoch": 0.35318518518518516,
"grad_norm": 0.4644632339477539,
"learning_rate": 3.668639053254438e-05,
"loss": 1.3157,
"step": 1490
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.5858569145202637,
"learning_rate": 3.655190962883271e-05,
"loss": 1.4678,
"step": 1500
},
{
"epoch": 0.3579259259259259,
"grad_norm": 0.5428529381752014,
"learning_rate": 3.641742872512103e-05,
"loss": 1.5561,
"step": 1510
},
{
"epoch": 0.3602962962962963,
"grad_norm": 0.5255948901176453,
"learning_rate": 3.628294782140936e-05,
"loss": 1.5802,
"step": 1520
},
{
"epoch": 0.3626666666666667,
"grad_norm": 0.534137487411499,
"learning_rate": 3.614846691769769e-05,
"loss": 1.5142,
"step": 1530
},
{
"epoch": 0.36503703703703705,
"grad_norm": 0.5558648705482483,
"learning_rate": 3.601398601398602e-05,
"loss": 1.5613,
"step": 1540
},
{
"epoch": 0.3674074074074074,
"grad_norm": 0.5890410542488098,
"learning_rate": 3.587950511027434e-05,
"loss": 1.4679,
"step": 1550
},
{
"epoch": 0.36977777777777776,
"grad_norm": 0.4830753803253174,
"learning_rate": 3.574502420656267e-05,
"loss": 1.4698,
"step": 1560
},
{
"epoch": 0.3721481481481482,
"grad_norm": 0.5191047787666321,
"learning_rate": 3.5610543302851e-05,
"loss": 1.528,
"step": 1570
},
{
"epoch": 0.37451851851851853,
"grad_norm": 0.5316727161407471,
"learning_rate": 3.5476062399139326e-05,
"loss": 1.4427,
"step": 1580
},
{
"epoch": 0.3768888888888889,
"grad_norm": 0.553815484046936,
"learning_rate": 3.534158149542765e-05,
"loss": 1.548,
"step": 1590
},
{
"epoch": 0.37925925925925924,
"grad_norm": 0.47779569029808044,
"learning_rate": 3.520710059171598e-05,
"loss": 1.4533,
"step": 1600
},
{
"epoch": 0.38162962962962965,
"grad_norm": 0.5595371127128601,
"learning_rate": 3.5072619688004306e-05,
"loss": 1.4503,
"step": 1610
},
{
"epoch": 0.384,
"grad_norm": 0.5166143774986267,
"learning_rate": 3.4938138784292635e-05,
"loss": 1.3783,
"step": 1620
},
{
"epoch": 0.38637037037037036,
"grad_norm": 0.6249716877937317,
"learning_rate": 3.480365788058096e-05,
"loss": 1.4494,
"step": 1630
},
{
"epoch": 0.3887407407407407,
"grad_norm": 0.484937846660614,
"learning_rate": 3.4669176976869286e-05,
"loss": 1.421,
"step": 1640
},
{
"epoch": 0.39111111111111113,
"grad_norm": 0.5464750528335571,
"learning_rate": 3.4534696073157615e-05,
"loss": 1.266,
"step": 1650
},
{
"epoch": 0.3934814814814815,
"grad_norm": 0.48874956369400024,
"learning_rate": 3.440021516944594e-05,
"loss": 1.5355,
"step": 1660
},
{
"epoch": 0.39585185185185184,
"grad_norm": 0.47555652260780334,
"learning_rate": 3.4265734265734265e-05,
"loss": 1.5799,
"step": 1670
},
{
"epoch": 0.3982222222222222,
"grad_norm": 0.49769505858421326,
"learning_rate": 3.4131253362022594e-05,
"loss": 1.3748,
"step": 1680
},
{
"epoch": 0.4005925925925926,
"grad_norm": 0.4664982259273529,
"learning_rate": 3.399677245831092e-05,
"loss": 1.4894,
"step": 1690
},
{
"epoch": 0.40296296296296297,
"grad_norm": 0.5216518044471741,
"learning_rate": 3.3862291554599245e-05,
"loss": 1.4645,
"step": 1700
},
{
"epoch": 0.4053333333333333,
"grad_norm": 0.6157680749893188,
"learning_rate": 3.3727810650887574e-05,
"loss": 1.4002,
"step": 1710
},
{
"epoch": 0.4077037037037037,
"grad_norm": 0.5828937888145447,
"learning_rate": 3.35933297471759e-05,
"loss": 1.4816,
"step": 1720
},
{
"epoch": 0.4100740740740741,
"grad_norm": 0.5792407989501953,
"learning_rate": 3.345884884346423e-05,
"loss": 1.3557,
"step": 1730
},
{
"epoch": 0.41244444444444445,
"grad_norm": 0.4985092580318451,
"learning_rate": 3.3324367939752554e-05,
"loss": 1.4572,
"step": 1740
},
{
"epoch": 0.4148148148148148,
"grad_norm": 0.5901199579238892,
"learning_rate": 3.318988703604088e-05,
"loss": 1.5292,
"step": 1750
},
{
"epoch": 0.41718518518518516,
"grad_norm": 0.5087295174598694,
"learning_rate": 3.305540613232921e-05,
"loss": 1.3405,
"step": 1760
},
{
"epoch": 0.41955555555555557,
"grad_norm": 0.5455463528633118,
"learning_rate": 3.292092522861754e-05,
"loss": 1.4262,
"step": 1770
},
{
"epoch": 0.4219259259259259,
"grad_norm": 0.46563345193862915,
"learning_rate": 3.278644432490586e-05,
"loss": 1.4328,
"step": 1780
},
{
"epoch": 0.4242962962962963,
"grad_norm": 0.545524537563324,
"learning_rate": 3.265196342119419e-05,
"loss": 1.4826,
"step": 1790
},
{
"epoch": 0.4266666666666667,
"grad_norm": 0.4182009994983673,
"learning_rate": 3.251748251748252e-05,
"loss": 1.4909,
"step": 1800
},
{
"epoch": 0.42903703703703705,
"grad_norm": 0.39127054810523987,
"learning_rate": 3.238300161377085e-05,
"loss": 1.4389,
"step": 1810
},
{
"epoch": 0.4314074074074074,
"grad_norm": 0.46866652369499207,
"learning_rate": 3.224852071005917e-05,
"loss": 1.3992,
"step": 1820
},
{
"epoch": 0.43377777777777776,
"grad_norm": 0.5216823816299438,
"learning_rate": 3.21140398063475e-05,
"loss": 1.3525,
"step": 1830
},
{
"epoch": 0.4361481481481482,
"grad_norm": 0.49909713864326477,
"learning_rate": 3.197955890263583e-05,
"loss": 1.5491,
"step": 1840
},
{
"epoch": 0.43851851851851853,
"grad_norm": 0.4957892596721649,
"learning_rate": 3.184507799892416e-05,
"loss": 1.4723,
"step": 1850
},
{
"epoch": 0.4408888888888889,
"grad_norm": 0.518822431564331,
"learning_rate": 3.171059709521248e-05,
"loss": 1.535,
"step": 1860
},
{
"epoch": 0.44325925925925924,
"grad_norm": 0.6380564570426941,
"learning_rate": 3.157611619150081e-05,
"loss": 1.5652,
"step": 1870
},
{
"epoch": 0.44562962962962965,
"grad_norm": 0.49906617403030396,
"learning_rate": 3.144163528778914e-05,
"loss": 1.3624,
"step": 1880
},
{
"epoch": 0.448,
"grad_norm": 0.5234742760658264,
"learning_rate": 3.1307154384077466e-05,
"loss": 1.4192,
"step": 1890
},
{
"epoch": 0.45037037037037037,
"grad_norm": 0.5430870056152344,
"learning_rate": 3.117267348036579e-05,
"loss": 1.4674,
"step": 1900
},
{
"epoch": 0.4527407407407407,
"grad_norm": 0.5488291382789612,
"learning_rate": 3.103819257665412e-05,
"loss": 1.4759,
"step": 1910
},
{
"epoch": 0.45511111111111113,
"grad_norm": 0.4655541181564331,
"learning_rate": 3.0903711672942446e-05,
"loss": 1.4559,
"step": 1920
},
{
"epoch": 0.4574814814814815,
"grad_norm": 0.442128449678421,
"learning_rate": 3.0769230769230774e-05,
"loss": 1.3129,
"step": 1930
},
{
"epoch": 0.45985185185185184,
"grad_norm": 0.5909174084663391,
"learning_rate": 3.0634749865519096e-05,
"loss": 1.5915,
"step": 1940
},
{
"epoch": 0.4622222222222222,
"grad_norm": 0.41102078557014465,
"learning_rate": 3.0500268961807425e-05,
"loss": 1.3717,
"step": 1950
},
{
"epoch": 0.4645925925925926,
"grad_norm": 0.5467662811279297,
"learning_rate": 3.036578805809575e-05,
"loss": 1.4838,
"step": 1960
},
{
"epoch": 0.46696296296296297,
"grad_norm": 0.6555057764053345,
"learning_rate": 3.023130715438408e-05,
"loss": 1.4289,
"step": 1970
},
{
"epoch": 0.4693333333333333,
"grad_norm": 0.4430755078792572,
"learning_rate": 3.0096826250672405e-05,
"loss": 1.5556,
"step": 1980
},
{
"epoch": 0.4717037037037037,
"grad_norm": 0.48016276955604553,
"learning_rate": 2.9962345346960734e-05,
"loss": 1.5461,
"step": 1990
},
{
"epoch": 0.4740740740740741,
"grad_norm": 0.5283887982368469,
"learning_rate": 2.982786444324906e-05,
"loss": 1.4598,
"step": 2000
},
{
"epoch": 0.47644444444444445,
"grad_norm": 0.5336430668830872,
"learning_rate": 2.9693383539537388e-05,
"loss": 1.5837,
"step": 2010
},
{
"epoch": 0.4788148148148148,
"grad_norm": 0.39814135432243347,
"learning_rate": 2.9558902635825713e-05,
"loss": 1.3496,
"step": 2020
},
{
"epoch": 0.48118518518518516,
"grad_norm": 0.6095125079154968,
"learning_rate": 2.9424421732114042e-05,
"loss": 1.5925,
"step": 2030
},
{
"epoch": 0.48355555555555557,
"grad_norm": 0.5880560874938965,
"learning_rate": 2.9289940828402368e-05,
"loss": 1.3603,
"step": 2040
},
{
"epoch": 0.48592592592592593,
"grad_norm": 0.5470516085624695,
"learning_rate": 2.9155459924690697e-05,
"loss": 1.5022,
"step": 2050
},
{
"epoch": 0.4882962962962963,
"grad_norm": 0.45742228627204895,
"learning_rate": 2.9020979020979022e-05,
"loss": 1.4699,
"step": 2060
},
{
"epoch": 0.49066666666666664,
"grad_norm": 0.5314275622367859,
"learning_rate": 2.888649811726735e-05,
"loss": 1.5043,
"step": 2070
},
{
"epoch": 0.49303703703703705,
"grad_norm": 0.5969755053520203,
"learning_rate": 2.8752017213555676e-05,
"loss": 1.4709,
"step": 2080
},
{
"epoch": 0.4954074074074074,
"grad_norm": 0.5115885138511658,
"learning_rate": 2.8617536309844002e-05,
"loss": 1.4031,
"step": 2090
},
{
"epoch": 0.49777777777777776,
"grad_norm": 0.5907914042472839,
"learning_rate": 2.848305540613233e-05,
"loss": 1.3509,
"step": 2100
},
{
"epoch": 0.5001481481481481,
"grad_norm": 0.48430949449539185,
"learning_rate": 2.8348574502420656e-05,
"loss": 1.4393,
"step": 2110
},
{
"epoch": 0.5025185185185185,
"grad_norm": 0.5502893328666687,
"learning_rate": 2.8214093598708985e-05,
"loss": 1.5571,
"step": 2120
},
{
"epoch": 0.5048888888888889,
"grad_norm": 0.48268720507621765,
"learning_rate": 2.807961269499731e-05,
"loss": 1.3895,
"step": 2130
},
{
"epoch": 0.5072592592592593,
"grad_norm": 0.6141895651817322,
"learning_rate": 2.794513179128564e-05,
"loss": 1.5278,
"step": 2140
},
{
"epoch": 0.5096296296296297,
"grad_norm": 0.48447638750076294,
"learning_rate": 2.7810650887573965e-05,
"loss": 1.456,
"step": 2150
},
{
"epoch": 0.512,
"grad_norm": 0.4536721408367157,
"learning_rate": 2.7676169983862293e-05,
"loss": 1.4259,
"step": 2160
},
{
"epoch": 0.5143703703703704,
"grad_norm": 0.5519189238548279,
"learning_rate": 2.754168908015062e-05,
"loss": 1.508,
"step": 2170
},
{
"epoch": 0.5167407407407407,
"grad_norm": 0.4641801416873932,
"learning_rate": 2.7407208176438948e-05,
"loss": 1.5087,
"step": 2180
},
{
"epoch": 0.5191111111111111,
"grad_norm": 0.5566359162330627,
"learning_rate": 2.7272727272727273e-05,
"loss": 1.4994,
"step": 2190
},
{
"epoch": 0.5214814814814814,
"grad_norm": 0.5316601991653442,
"learning_rate": 2.7138246369015602e-05,
"loss": 1.375,
"step": 2200
},
{
"epoch": 0.5238518518518519,
"grad_norm": 0.545514702796936,
"learning_rate": 2.7003765465303927e-05,
"loss": 1.4449,
"step": 2210
},
{
"epoch": 0.5262222222222223,
"grad_norm": 0.5452851057052612,
"learning_rate": 2.6869284561592256e-05,
"loss": 1.639,
"step": 2220
},
{
"epoch": 0.5285925925925926,
"grad_norm": 0.5291896462440491,
"learning_rate": 2.673480365788058e-05,
"loss": 1.3638,
"step": 2230
},
{
"epoch": 0.530962962962963,
"grad_norm": 0.4708302319049835,
"learning_rate": 2.660032275416891e-05,
"loss": 1.1973,
"step": 2240
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.4936722218990326,
"learning_rate": 2.6465841850457236e-05,
"loss": 1.543,
"step": 2250
},
{
"epoch": 0.5357037037037037,
"grad_norm": 0.5722488760948181,
"learning_rate": 2.6331360946745565e-05,
"loss": 1.531,
"step": 2260
},
{
"epoch": 0.538074074074074,
"grad_norm": 0.5386027097702026,
"learning_rate": 2.619688004303389e-05,
"loss": 1.4335,
"step": 2270
},
{
"epoch": 0.5404444444444444,
"grad_norm": 0.5803340673446655,
"learning_rate": 2.606239913932222e-05,
"loss": 1.4049,
"step": 2280
},
{
"epoch": 0.5428148148148149,
"grad_norm": 0.3970150053501129,
"learning_rate": 2.5927918235610544e-05,
"loss": 1.3923,
"step": 2290
},
{
"epoch": 0.5451851851851852,
"grad_norm": 0.45682525634765625,
"learning_rate": 2.5793437331898873e-05,
"loss": 1.4838,
"step": 2300
},
{
"epoch": 0.5475555555555556,
"grad_norm": 0.5088069438934326,
"learning_rate": 2.56589564281872e-05,
"loss": 1.4416,
"step": 2310
},
{
"epoch": 0.5499259259259259,
"grad_norm": 0.5557109713554382,
"learning_rate": 2.5524475524475528e-05,
"loss": 1.4099,
"step": 2320
},
{
"epoch": 0.5522962962962963,
"grad_norm": 0.4954288601875305,
"learning_rate": 2.5389994620763853e-05,
"loss": 1.3828,
"step": 2330
},
{
"epoch": 0.5546666666666666,
"grad_norm": 0.5320334434509277,
"learning_rate": 2.5255513717052182e-05,
"loss": 1.3686,
"step": 2340
},
{
"epoch": 0.557037037037037,
"grad_norm": 0.511646032333374,
"learning_rate": 2.5121032813340507e-05,
"loss": 1.4752,
"step": 2350
},
{
"epoch": 0.5594074074074074,
"grad_norm": 0.4852311909198761,
"learning_rate": 2.4986551909628833e-05,
"loss": 1.5352,
"step": 2360
},
{
"epoch": 0.5617777777777778,
"grad_norm": 0.5558280944824219,
"learning_rate": 2.485207100591716e-05,
"loss": 1.3955,
"step": 2370
},
{
"epoch": 0.5641481481481482,
"grad_norm": 0.5369210243225098,
"learning_rate": 2.4717590102205487e-05,
"loss": 1.4265,
"step": 2380
},
{
"epoch": 0.5665185185185185,
"grad_norm": 0.5134137868881226,
"learning_rate": 2.4583109198493816e-05,
"loss": 1.3688,
"step": 2390
},
{
"epoch": 0.5688888888888889,
"grad_norm": 0.47109952569007874,
"learning_rate": 2.444862829478214e-05,
"loss": 1.4207,
"step": 2400
},
{
"epoch": 0.5712592592592592,
"grad_norm": 0.4982026517391205,
"learning_rate": 2.431414739107047e-05,
"loss": 1.4249,
"step": 2410
},
{
"epoch": 0.5736296296296296,
"grad_norm": 0.5209967494010925,
"learning_rate": 2.4179666487358796e-05,
"loss": 1.4708,
"step": 2420
},
{
"epoch": 0.576,
"grad_norm": 0.5762905478477478,
"learning_rate": 2.4045185583647124e-05,
"loss": 1.4194,
"step": 2430
},
{
"epoch": 0.5783703703703704,
"grad_norm": 0.4918428659439087,
"learning_rate": 2.391070467993545e-05,
"loss": 1.6279,
"step": 2440
},
{
"epoch": 0.5807407407407408,
"grad_norm": 0.5050658583641052,
"learning_rate": 2.377622377622378e-05,
"loss": 1.5029,
"step": 2450
},
{
"epoch": 0.5831111111111111,
"grad_norm": 0.49715667963027954,
"learning_rate": 2.3641742872512104e-05,
"loss": 1.4228,
"step": 2460
},
{
"epoch": 0.5854814814814815,
"grad_norm": 0.4800516664981842,
"learning_rate": 2.3507261968800433e-05,
"loss": 1.3595,
"step": 2470
},
{
"epoch": 0.5878518518518518,
"grad_norm": 0.5617285966873169,
"learning_rate": 2.337278106508876e-05,
"loss": 1.5974,
"step": 2480
},
{
"epoch": 0.5902222222222222,
"grad_norm": 0.5133258700370789,
"learning_rate": 2.3238300161377087e-05,
"loss": 1.4437,
"step": 2490
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.5644205212593079,
"learning_rate": 2.3103819257665413e-05,
"loss": 1.4633,
"step": 2500
},
{
"epoch": 0.5949629629629629,
"grad_norm": 0.5411229133605957,
"learning_rate": 2.296933835395374e-05,
"loss": 1.3376,
"step": 2510
},
{
"epoch": 0.5973333333333334,
"grad_norm": 0.5555963516235352,
"learning_rate": 2.2834857450242067e-05,
"loss": 1.5361,
"step": 2520
},
{
"epoch": 0.5997037037037037,
"grad_norm": 0.48606014251708984,
"learning_rate": 2.2700376546530396e-05,
"loss": 1.4386,
"step": 2530
},
{
"epoch": 0.6020740740740741,
"grad_norm": 0.4902474582195282,
"learning_rate": 2.256589564281872e-05,
"loss": 1.4536,
"step": 2540
},
{
"epoch": 0.6044444444444445,
"grad_norm": 0.5565341114997864,
"learning_rate": 2.243141473910705e-05,
"loss": 1.4801,
"step": 2550
},
{
"epoch": 0.6068148148148148,
"grad_norm": 0.5383167862892151,
"learning_rate": 2.2296933835395375e-05,
"loss": 1.3528,
"step": 2560
},
{
"epoch": 0.6091851851851852,
"grad_norm": 0.4610041379928589,
"learning_rate": 2.2162452931683704e-05,
"loss": 1.5063,
"step": 2570
},
{
"epoch": 0.6115555555555555,
"grad_norm": 0.5119171142578125,
"learning_rate": 2.202797202797203e-05,
"loss": 1.4559,
"step": 2580
},
{
"epoch": 0.6139259259259259,
"grad_norm": 0.46622559428215027,
"learning_rate": 2.189349112426036e-05,
"loss": 1.485,
"step": 2590
},
{
"epoch": 0.6162962962962963,
"grad_norm": 0.5610603094100952,
"learning_rate": 2.1759010220548684e-05,
"loss": 1.3863,
"step": 2600
},
{
"epoch": 0.6186666666666667,
"grad_norm": 0.5185586214065552,
"learning_rate": 2.162452931683701e-05,
"loss": 1.4574,
"step": 2610
},
{
"epoch": 0.621037037037037,
"grad_norm": 0.5091121196746826,
"learning_rate": 2.1490048413125338e-05,
"loss": 1.3745,
"step": 2620
},
{
"epoch": 0.6234074074074074,
"grad_norm": 0.39684295654296875,
"learning_rate": 2.1355567509413664e-05,
"loss": 1.5785,
"step": 2630
},
{
"epoch": 0.6257777777777778,
"grad_norm": 0.5499323606491089,
"learning_rate": 2.1221086605701993e-05,
"loss": 1.4926,
"step": 2640
},
{
"epoch": 0.6281481481481481,
"grad_norm": 0.448824942111969,
"learning_rate": 2.1086605701990318e-05,
"loss": 1.5199,
"step": 2650
},
{
"epoch": 0.6305185185185185,
"grad_norm": 0.5647756457328796,
"learning_rate": 2.0952124798278647e-05,
"loss": 1.493,
"step": 2660
},
{
"epoch": 0.6328888888888888,
"grad_norm": 0.5426878929138184,
"learning_rate": 2.0817643894566972e-05,
"loss": 1.4232,
"step": 2670
},
{
"epoch": 0.6352592592592593,
"grad_norm": 0.5105384588241577,
"learning_rate": 2.0683162990855298e-05,
"loss": 1.5136,
"step": 2680
},
{
"epoch": 0.6376296296296297,
"grad_norm": 0.5488259792327881,
"learning_rate": 2.0548682087143627e-05,
"loss": 1.4877,
"step": 2690
},
{
"epoch": 0.64,
"grad_norm": 0.47102248668670654,
"learning_rate": 2.0414201183431952e-05,
"loss": 1.2976,
"step": 2700
},
{
"epoch": 0.6423703703703704,
"grad_norm": 0.4708435535430908,
"learning_rate": 2.027972027972028e-05,
"loss": 1.4773,
"step": 2710
},
{
"epoch": 0.6447407407407407,
"grad_norm": 0.5073569416999817,
"learning_rate": 2.0145239376008606e-05,
"loss": 1.3551,
"step": 2720
},
{
"epoch": 0.6471111111111111,
"grad_norm": 0.4934346377849579,
"learning_rate": 2.0010758472296935e-05,
"loss": 1.2423,
"step": 2730
},
{
"epoch": 0.6494814814814814,
"grad_norm": 0.5030198097229004,
"learning_rate": 1.987627756858526e-05,
"loss": 1.4272,
"step": 2740
},
{
"epoch": 0.6518518518518519,
"grad_norm": 0.4713825583457947,
"learning_rate": 1.974179666487359e-05,
"loss": 1.4898,
"step": 2750
},
{
"epoch": 0.6542222222222223,
"grad_norm": 0.430649995803833,
"learning_rate": 1.9607315761161915e-05,
"loss": 1.3365,
"step": 2760
},
{
"epoch": 0.6565925925925926,
"grad_norm": 0.58051598072052,
"learning_rate": 1.9472834857450244e-05,
"loss": 1.4567,
"step": 2770
},
{
"epoch": 0.658962962962963,
"grad_norm": 0.46255138516426086,
"learning_rate": 1.933835395373857e-05,
"loss": 1.5205,
"step": 2780
},
{
"epoch": 0.6613333333333333,
"grad_norm": 0.5674681663513184,
"learning_rate": 1.9203873050026898e-05,
"loss": 1.3594,
"step": 2790
},
{
"epoch": 0.6637037037037037,
"grad_norm": 0.5063351392745972,
"learning_rate": 1.9069392146315223e-05,
"loss": 1.2901,
"step": 2800
},
{
"epoch": 0.666074074074074,
"grad_norm": 0.4963226914405823,
"learning_rate": 1.8934911242603552e-05,
"loss": 1.4437,
"step": 2810
},
{
"epoch": 0.6684444444444444,
"grad_norm": 0.5070900917053223,
"learning_rate": 1.8800430338891878e-05,
"loss": 1.376,
"step": 2820
},
{
"epoch": 0.6708148148148149,
"grad_norm": 0.5724377036094666,
"learning_rate": 1.8665949435180206e-05,
"loss": 1.5226,
"step": 2830
},
{
"epoch": 0.6731851851851852,
"grad_norm": 0.5261855125427246,
"learning_rate": 1.8531468531468532e-05,
"loss": 1.438,
"step": 2840
},
{
"epoch": 0.6755555555555556,
"grad_norm": 0.5292350053787231,
"learning_rate": 1.839698762775686e-05,
"loss": 1.4071,
"step": 2850
},
{
"epoch": 0.6779259259259259,
"grad_norm": 0.4596816301345825,
"learning_rate": 1.8262506724045186e-05,
"loss": 1.337,
"step": 2860
},
{
"epoch": 0.6802962962962963,
"grad_norm": 0.5225928425788879,
"learning_rate": 1.8128025820333515e-05,
"loss": 1.4363,
"step": 2870
},
{
"epoch": 0.6826666666666666,
"grad_norm": 0.49359938502311707,
"learning_rate": 1.799354491662184e-05,
"loss": 1.3388,
"step": 2880
},
{
"epoch": 0.685037037037037,
"grad_norm": 0.5156022906303406,
"learning_rate": 1.785906401291017e-05,
"loss": 1.4515,
"step": 2890
},
{
"epoch": 0.6874074074074074,
"grad_norm": 0.5047289133071899,
"learning_rate": 1.7724583109198495e-05,
"loss": 1.4584,
"step": 2900
},
{
"epoch": 0.6897777777777778,
"grad_norm": 0.4975475072860718,
"learning_rate": 1.7590102205486824e-05,
"loss": 1.5042,
"step": 2910
},
{
"epoch": 0.6921481481481482,
"grad_norm": 0.5997641086578369,
"learning_rate": 1.745562130177515e-05,
"loss": 1.5658,
"step": 2920
},
{
"epoch": 0.6945185185185185,
"grad_norm": 0.5376483201980591,
"learning_rate": 1.7321140398063478e-05,
"loss": 1.3429,
"step": 2930
},
{
"epoch": 0.6968888888888889,
"grad_norm": 0.4973870813846588,
"learning_rate": 1.7186659494351803e-05,
"loss": 1.4287,
"step": 2940
},
{
"epoch": 0.6992592592592592,
"grad_norm": 0.5504077076911926,
"learning_rate": 1.7052178590640132e-05,
"loss": 1.5161,
"step": 2950
},
{
"epoch": 0.7016296296296296,
"grad_norm": 0.4603710174560547,
"learning_rate": 1.6917697686928457e-05,
"loss": 1.463,
"step": 2960
},
{
"epoch": 0.704,
"grad_norm": 0.5116856694221497,
"learning_rate": 1.6783216783216786e-05,
"loss": 1.3862,
"step": 2970
},
{
"epoch": 0.7063703703703703,
"grad_norm": 0.49981990456581116,
"learning_rate": 1.6648735879505112e-05,
"loss": 1.4209,
"step": 2980
},
{
"epoch": 0.7087407407407408,
"grad_norm": 0.5085658431053162,
"learning_rate": 1.651425497579344e-05,
"loss": 1.4031,
"step": 2990
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.5499709844589233,
"learning_rate": 1.6379774072081766e-05,
"loss": 1.5093,
"step": 3000
},
{
"epoch": 0.7134814814814815,
"grad_norm": 0.5997831225395203,
"learning_rate": 1.6245293168370095e-05,
"loss": 1.517,
"step": 3010
},
{
"epoch": 0.7158518518518519,
"grad_norm": 0.5795171856880188,
"learning_rate": 1.611081226465842e-05,
"loss": 1.4279,
"step": 3020
},
{
"epoch": 0.7182222222222222,
"grad_norm": 0.5227158069610596,
"learning_rate": 1.5976331360946746e-05,
"loss": 1.5101,
"step": 3030
},
{
"epoch": 0.7205925925925926,
"grad_norm": 0.6352266669273376,
"learning_rate": 1.5841850457235075e-05,
"loss": 1.4849,
"step": 3040
},
{
"epoch": 0.7229629629629629,
"grad_norm": 0.6044921278953552,
"learning_rate": 1.57073695535234e-05,
"loss": 1.5239,
"step": 3050
},
{
"epoch": 0.7253333333333334,
"grad_norm": 0.4992562532424927,
"learning_rate": 1.557288864981173e-05,
"loss": 1.37,
"step": 3060
},
{
"epoch": 0.7277037037037037,
"grad_norm": 0.5439409017562866,
"learning_rate": 1.5438407746100054e-05,
"loss": 1.3724,
"step": 3070
},
{
"epoch": 0.7300740740740741,
"grad_norm": 0.5415698289871216,
"learning_rate": 1.530392684238838e-05,
"loss": 1.4562,
"step": 3080
},
{
"epoch": 0.7324444444444445,
"grad_norm": 0.4034167230129242,
"learning_rate": 1.5169445938676707e-05,
"loss": 1.3212,
"step": 3090
},
{
"epoch": 0.7348148148148148,
"grad_norm": 0.5307872891426086,
"learning_rate": 1.5034965034965034e-05,
"loss": 1.4165,
"step": 3100
},
{
"epoch": 0.7371851851851852,
"grad_norm": 0.5271874666213989,
"learning_rate": 1.4900484131253361e-05,
"loss": 1.4077,
"step": 3110
},
{
"epoch": 0.7395555555555555,
"grad_norm": 0.5995745658874512,
"learning_rate": 1.4766003227541688e-05,
"loss": 1.4732,
"step": 3120
},
{
"epoch": 0.7419259259259259,
"grad_norm": 0.45358097553253174,
"learning_rate": 1.4631522323830015e-05,
"loss": 1.4083,
"step": 3130
},
{
"epoch": 0.7442962962962963,
"grad_norm": 0.47864630818367004,
"learning_rate": 1.4497041420118343e-05,
"loss": 1.4363,
"step": 3140
},
{
"epoch": 0.7466666666666667,
"grad_norm": 0.46107572317123413,
"learning_rate": 1.436256051640667e-05,
"loss": 1.5163,
"step": 3150
},
{
"epoch": 0.7490370370370371,
"grad_norm": 0.5413241386413574,
"learning_rate": 1.4228079612694997e-05,
"loss": 1.3225,
"step": 3160
},
{
"epoch": 0.7514074074074074,
"grad_norm": 0.4649742841720581,
"learning_rate": 1.4093598708983324e-05,
"loss": 1.4101,
"step": 3170
},
{
"epoch": 0.7537777777777778,
"grad_norm": 0.5219136476516724,
"learning_rate": 1.3959117805271651e-05,
"loss": 1.4523,
"step": 3180
},
{
"epoch": 0.7561481481481481,
"grad_norm": 0.5591155886650085,
"learning_rate": 1.3824636901559978e-05,
"loss": 1.5966,
"step": 3190
},
{
"epoch": 0.7585185185185185,
"grad_norm": 0.5293004512786865,
"learning_rate": 1.3690155997848305e-05,
"loss": 1.4613,
"step": 3200
},
{
"epoch": 0.7608888888888888,
"grad_norm": 0.46828821301460266,
"learning_rate": 1.3555675094136632e-05,
"loss": 1.5187,
"step": 3210
},
{
"epoch": 0.7632592592592593,
"grad_norm": 0.4590572416782379,
"learning_rate": 1.342119419042496e-05,
"loss": 1.4418,
"step": 3220
},
{
"epoch": 0.7656296296296297,
"grad_norm": 0.6020212769508362,
"learning_rate": 1.3286713286713287e-05,
"loss": 1.5557,
"step": 3230
},
{
"epoch": 0.768,
"grad_norm": 0.542536199092865,
"learning_rate": 1.3152232383001614e-05,
"loss": 1.3689,
"step": 3240
},
{
"epoch": 0.7703703703703704,
"grad_norm": 0.5394562482833862,
"learning_rate": 1.3017751479289941e-05,
"loss": 1.4272,
"step": 3250
},
{
"epoch": 0.7727407407407407,
"grad_norm": 0.38109496235847473,
"learning_rate": 1.2883270575578268e-05,
"loss": 1.419,
"step": 3260
},
{
"epoch": 0.7751111111111111,
"grad_norm": 0.4617583155632019,
"learning_rate": 1.2748789671866595e-05,
"loss": 1.3547,
"step": 3270
},
{
"epoch": 0.7774814814814814,
"grad_norm": 0.5739762783050537,
"learning_rate": 1.2614308768154922e-05,
"loss": 1.5191,
"step": 3280
},
{
"epoch": 0.7798518518518519,
"grad_norm": 0.42539921402931213,
"learning_rate": 1.247982786444325e-05,
"loss": 1.3347,
"step": 3290
},
{
"epoch": 0.7822222222222223,
"grad_norm": 0.5273600816726685,
"learning_rate": 1.2345346960731577e-05,
"loss": 1.3758,
"step": 3300
},
{
"epoch": 0.7845925925925926,
"grad_norm": 0.4796091616153717,
"learning_rate": 1.2210866057019904e-05,
"loss": 1.3903,
"step": 3310
},
{
"epoch": 0.786962962962963,
"grad_norm": 0.46542009711265564,
"learning_rate": 1.2076385153308231e-05,
"loss": 1.406,
"step": 3320
},
{
"epoch": 0.7893333333333333,
"grad_norm": 0.47980108857154846,
"learning_rate": 1.1941904249596558e-05,
"loss": 1.3991,
"step": 3330
},
{
"epoch": 0.7917037037037037,
"grad_norm": 0.507736086845398,
"learning_rate": 1.1807423345884885e-05,
"loss": 1.4583,
"step": 3340
},
{
"epoch": 0.794074074074074,
"grad_norm": 0.5380430817604065,
"learning_rate": 1.1672942442173212e-05,
"loss": 1.2621,
"step": 3350
},
{
"epoch": 0.7964444444444444,
"grad_norm": 0.6689913272857666,
"learning_rate": 1.153846153846154e-05,
"loss": 1.5355,
"step": 3360
},
{
"epoch": 0.7988148148148149,
"grad_norm": 0.5129537582397461,
"learning_rate": 1.1403980634749865e-05,
"loss": 1.5895,
"step": 3370
},
{
"epoch": 0.8011851851851852,
"grad_norm": 0.5240408182144165,
"learning_rate": 1.1269499731038192e-05,
"loss": 1.4896,
"step": 3380
},
{
"epoch": 0.8035555555555556,
"grad_norm": 0.5004174709320068,
"learning_rate": 1.113501882732652e-05,
"loss": 1.4303,
"step": 3390
},
{
"epoch": 0.8059259259259259,
"grad_norm": 0.46890896558761597,
"learning_rate": 1.1000537923614846e-05,
"loss": 1.3536,
"step": 3400
},
{
"epoch": 0.8082962962962963,
"grad_norm": 0.5150523781776428,
"learning_rate": 1.0866057019903174e-05,
"loss": 1.3952,
"step": 3410
},
{
"epoch": 0.8106666666666666,
"grad_norm": 0.4322206676006317,
"learning_rate": 1.07315761161915e-05,
"loss": 1.2252,
"step": 3420
},
{
"epoch": 0.813037037037037,
"grad_norm": 0.5782944560050964,
"learning_rate": 1.0597095212479828e-05,
"loss": 1.4982,
"step": 3430
},
{
"epoch": 0.8154074074074074,
"grad_norm": 0.6032952666282654,
"learning_rate": 1.0462614308768155e-05,
"loss": 1.4614,
"step": 3440
},
{
"epoch": 0.8177777777777778,
"grad_norm": 0.453756183385849,
"learning_rate": 1.0328133405056482e-05,
"loss": 1.5017,
"step": 3450
},
{
"epoch": 0.8201481481481482,
"grad_norm": 0.5315883159637451,
"learning_rate": 1.019365250134481e-05,
"loss": 1.5354,
"step": 3460
},
{
"epoch": 0.8225185185185185,
"grad_norm": 0.5065041184425354,
"learning_rate": 1.0059171597633136e-05,
"loss": 1.4245,
"step": 3470
},
{
"epoch": 0.8248888888888889,
"grad_norm": 0.542103111743927,
"learning_rate": 9.924690693921463e-06,
"loss": 1.3599,
"step": 3480
},
{
"epoch": 0.8272592592592592,
"grad_norm": 0.5330160856246948,
"learning_rate": 9.79020979020979e-06,
"loss": 1.3502,
"step": 3490
},
{
"epoch": 0.8296296296296296,
"grad_norm": 0.4731038212776184,
"learning_rate": 9.655728886498118e-06,
"loss": 1.2931,
"step": 3500
},
{
"epoch": 0.832,
"grad_norm": 0.4719734787940979,
"learning_rate": 9.521247982786445e-06,
"loss": 1.3185,
"step": 3510
},
{
"epoch": 0.8343703703703703,
"grad_norm": 0.5551607012748718,
"learning_rate": 9.386767079074772e-06,
"loss": 1.4045,
"step": 3520
},
{
"epoch": 0.8367407407407408,
"grad_norm": 0.5661736130714417,
"learning_rate": 9.252286175363099e-06,
"loss": 1.6005,
"step": 3530
},
{
"epoch": 0.8391111111111111,
"grad_norm": 0.5772873759269714,
"learning_rate": 9.117805271651426e-06,
"loss": 1.3975,
"step": 3540
},
{
"epoch": 0.8414814814814815,
"grad_norm": 0.5180752873420715,
"learning_rate": 8.983324367939753e-06,
"loss": 1.5147,
"step": 3550
},
{
"epoch": 0.8438518518518519,
"grad_norm": 0.5256723165512085,
"learning_rate": 8.84884346422808e-06,
"loss": 1.4673,
"step": 3560
},
{
"epoch": 0.8462222222222222,
"grad_norm": 0.4829583168029785,
"learning_rate": 8.714362560516406e-06,
"loss": 1.5102,
"step": 3570
},
{
"epoch": 0.8485925925925926,
"grad_norm": 0.5027347207069397,
"learning_rate": 8.579881656804733e-06,
"loss": 1.6514,
"step": 3580
},
{
"epoch": 0.8509629629629629,
"grad_norm": 0.5117186903953552,
"learning_rate": 8.44540075309306e-06,
"loss": 1.4451,
"step": 3590
},
{
"epoch": 0.8533333333333334,
"grad_norm": 0.4994155466556549,
"learning_rate": 8.310919849381387e-06,
"loss": 1.3408,
"step": 3600
},
{
"epoch": 0.8557037037037037,
"grad_norm": 0.4399481415748596,
"learning_rate": 8.176438945669715e-06,
"loss": 1.4044,
"step": 3610
},
{
"epoch": 0.8580740740740741,
"grad_norm": 0.49395203590393066,
"learning_rate": 8.041958041958042e-06,
"loss": 1.3895,
"step": 3620
},
{
"epoch": 0.8604444444444445,
"grad_norm": 0.4922611713409424,
"learning_rate": 7.907477138246369e-06,
"loss": 1.5503,
"step": 3630
},
{
"epoch": 0.8628148148148148,
"grad_norm": 0.5255241990089417,
"learning_rate": 7.772996234534696e-06,
"loss": 1.4736,
"step": 3640
},
{
"epoch": 0.8651851851851852,
"grad_norm": 0.4713379442691803,
"learning_rate": 7.638515330823023e-06,
"loss": 1.4395,
"step": 3650
},
{
"epoch": 0.8675555555555555,
"grad_norm": 0.4290190637111664,
"learning_rate": 7.50403442711135e-06,
"loss": 1.4265,
"step": 3660
},
{
"epoch": 0.8699259259259259,
"grad_norm": 0.5157113075256348,
"learning_rate": 7.369553523399677e-06,
"loss": 1.4363,
"step": 3670
},
{
"epoch": 0.8722962962962963,
"grad_norm": 0.5694654583930969,
"learning_rate": 7.2350726196880045e-06,
"loss": 1.4602,
"step": 3680
},
{
"epoch": 0.8746666666666667,
"grad_norm": 0.47969937324523926,
"learning_rate": 7.100591715976332e-06,
"loss": 1.2716,
"step": 3690
},
{
"epoch": 0.8770370370370371,
"grad_norm": 0.4939590096473694,
"learning_rate": 6.966110812264659e-06,
"loss": 1.4356,
"step": 3700
},
{
"epoch": 0.8794074074074074,
"grad_norm": 0.4829910695552826,
"learning_rate": 6.831629908552986e-06,
"loss": 1.4482,
"step": 3710
},
{
"epoch": 0.8817777777777778,
"grad_norm": 0.473178505897522,
"learning_rate": 6.697149004841313e-06,
"loss": 1.3772,
"step": 3720
},
{
"epoch": 0.8841481481481481,
"grad_norm": 0.5327422618865967,
"learning_rate": 6.56266810112964e-06,
"loss": 1.5707,
"step": 3730
},
{
"epoch": 0.8865185185185185,
"grad_norm": 0.4069652259349823,
"learning_rate": 6.428187197417967e-06,
"loss": 1.3501,
"step": 3740
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.5368005037307739,
"learning_rate": 6.2937062937062944e-06,
"loss": 1.6791,
"step": 3750
},
{
"epoch": 0.8912592592592593,
"grad_norm": 0.5622044801712036,
"learning_rate": 6.159225389994621e-06,
"loss": 1.2727,
"step": 3760
},
{
"epoch": 0.8936296296296297,
"grad_norm": 0.5526837110519409,
"learning_rate": 6.024744486282948e-06,
"loss": 1.478,
"step": 3770
},
{
"epoch": 0.896,
"grad_norm": 0.5442282557487488,
"learning_rate": 5.890263582571275e-06,
"loss": 1.3917,
"step": 3780
},
{
"epoch": 0.8983703703703704,
"grad_norm": 0.5711065530776978,
"learning_rate": 5.755782678859602e-06,
"loss": 1.4748,
"step": 3790
},
{
"epoch": 0.9007407407407407,
"grad_norm": 0.5068963766098022,
"learning_rate": 5.621301775147929e-06,
"loss": 1.3375,
"step": 3800
},
{
"epoch": 0.9031111111111111,
"grad_norm": 0.6032804846763611,
"learning_rate": 5.486820871436256e-06,
"loss": 1.553,
"step": 3810
},
{
"epoch": 0.9054814814814814,
"grad_norm": 0.4718996286392212,
"learning_rate": 5.352339967724583e-06,
"loss": 1.4586,
"step": 3820
},
{
"epoch": 0.9078518518518518,
"grad_norm": 0.583185076713562,
"learning_rate": 5.21785906401291e-06,
"loss": 1.4294,
"step": 3830
},
{
"epoch": 0.9102222222222223,
"grad_norm": 0.5688157677650452,
"learning_rate": 5.083378160301237e-06,
"loss": 1.3434,
"step": 3840
},
{
"epoch": 0.9125925925925926,
"grad_norm": 0.43667495250701904,
"learning_rate": 4.948897256589564e-06,
"loss": 1.5107,
"step": 3850
},
{
"epoch": 0.914962962962963,
"grad_norm": 0.5924187302589417,
"learning_rate": 4.814416352877891e-06,
"loss": 1.3558,
"step": 3860
},
{
"epoch": 0.9173333333333333,
"grad_norm": 0.538318932056427,
"learning_rate": 4.679935449166218e-06,
"loss": 1.379,
"step": 3870
},
{
"epoch": 0.9197037037037037,
"grad_norm": 0.4840611219406128,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.2776,
"step": 3880
},
{
"epoch": 0.922074074074074,
"grad_norm": 0.5927128195762634,
"learning_rate": 4.410973641742873e-06,
"loss": 1.4598,
"step": 3890
},
{
"epoch": 0.9244444444444444,
"grad_norm": 0.5672274827957153,
"learning_rate": 4.2764927380312e-06,
"loss": 1.467,
"step": 3900
},
{
"epoch": 0.9268148148148149,
"grad_norm": 0.5424569249153137,
"learning_rate": 4.142011834319527e-06,
"loss": 1.532,
"step": 3910
},
{
"epoch": 0.9291851851851852,
"grad_norm": 0.5921524167060852,
"learning_rate": 4.007530930607853e-06,
"loss": 1.5244,
"step": 3920
},
{
"epoch": 0.9315555555555556,
"grad_norm": 0.4295691251754761,
"learning_rate": 3.87305002689618e-06,
"loss": 1.4358,
"step": 3930
},
{
"epoch": 0.9339259259259259,
"grad_norm": 0.5117975473403931,
"learning_rate": 3.738569123184508e-06,
"loss": 1.4241,
"step": 3940
},
{
"epoch": 0.9362962962962963,
"grad_norm": 0.5273323655128479,
"learning_rate": 3.604088219472835e-06,
"loss": 1.3188,
"step": 3950
},
{
"epoch": 0.9386666666666666,
"grad_norm": 0.4947817325592041,
"learning_rate": 3.469607315761162e-06,
"loss": 1.444,
"step": 3960
},
{
"epoch": 0.941037037037037,
"grad_norm": 0.5852746367454529,
"learning_rate": 3.3351264120494893e-06,
"loss": 1.4334,
"step": 3970
},
{
"epoch": 0.9434074074074074,
"grad_norm": 0.5183681845664978,
"learning_rate": 3.2006455083378165e-06,
"loss": 1.4422,
"step": 3980
},
{
"epoch": 0.9457777777777778,
"grad_norm": 0.5193647146224976,
"learning_rate": 3.066164604626143e-06,
"loss": 1.3769,
"step": 3990
},
{
"epoch": 0.9481481481481482,
"grad_norm": 0.5597278475761414,
"learning_rate": 2.9316837009144703e-06,
"loss": 1.5406,
"step": 4000
},
{
"epoch": 0.9505185185185185,
"grad_norm": 0.5173184275627136,
"learning_rate": 2.7972027972027974e-06,
"loss": 1.5082,
"step": 4010
},
{
"epoch": 0.9528888888888889,
"grad_norm": 0.5205141305923462,
"learning_rate": 2.6627218934911246e-06,
"loss": 1.4971,
"step": 4020
},
{
"epoch": 0.9552592592592593,
"grad_norm": 0.48843199014663696,
"learning_rate": 2.5282409897794517e-06,
"loss": 1.4113,
"step": 4030
},
{
"epoch": 0.9576296296296296,
"grad_norm": 0.5854997634887695,
"learning_rate": 2.3937600860677784e-06,
"loss": 1.5666,
"step": 4040
},
{
"epoch": 0.96,
"grad_norm": 0.5264609456062317,
"learning_rate": 2.2592791823561056e-06,
"loss": 1.3334,
"step": 4050
},
{
"epoch": 0.9623703703703703,
"grad_norm": 0.5044777989387512,
"learning_rate": 2.1247982786444327e-06,
"loss": 1.4251,
"step": 4060
},
{
"epoch": 0.9647407407407408,
"grad_norm": 0.5678290128707886,
"learning_rate": 1.99031737493276e-06,
"loss": 1.4359,
"step": 4070
},
{
"epoch": 0.9671111111111111,
"grad_norm": 0.5562720894813538,
"learning_rate": 1.8558364712210868e-06,
"loss": 1.4333,
"step": 4080
},
{
"epoch": 0.9694814814814815,
"grad_norm": 0.5118197798728943,
"learning_rate": 1.7213555675094137e-06,
"loss": 1.4257,
"step": 4090
},
{
"epoch": 0.9718518518518519,
"grad_norm": 0.4556100070476532,
"learning_rate": 1.5868746637977408e-06,
"loss": 1.4414,
"step": 4100
},
{
"epoch": 0.9742222222222222,
"grad_norm": 0.5370482802391052,
"learning_rate": 1.452393760086068e-06,
"loss": 1.6166,
"step": 4110
},
{
"epoch": 0.9765925925925926,
"grad_norm": 0.5851370096206665,
"learning_rate": 1.3179128563743949e-06,
"loss": 1.5649,
"step": 4120
},
{
"epoch": 0.9789629629629629,
"grad_norm": 0.5804405212402344,
"learning_rate": 1.183431952662722e-06,
"loss": 1.3898,
"step": 4130
},
{
"epoch": 0.9813333333333333,
"grad_norm": 0.5411353707313538,
"learning_rate": 1.0489510489510491e-06,
"loss": 1.4885,
"step": 4140
},
{
"epoch": 0.9837037037037037,
"grad_norm": 0.5271933078765869,
"learning_rate": 9.14470145239376e-07,
"loss": 1.3816,
"step": 4150
},
{
"epoch": 0.9860740740740741,
"grad_norm": 0.5039179921150208,
"learning_rate": 7.799892415277031e-07,
"loss": 1.2985,
"step": 4160
},
{
"epoch": 0.9884444444444445,
"grad_norm": 0.47758767008781433,
"learning_rate": 6.455083378160301e-07,
"loss": 1.4848,
"step": 4170
},
{
"epoch": 0.9908148148148148,
"grad_norm": 0.5355851054191589,
"learning_rate": 5.110274341043572e-07,
"loss": 1.5264,
"step": 4180
},
{
"epoch": 0.9931851851851852,
"grad_norm": 0.5918956398963928,
"learning_rate": 3.7654653039268424e-07,
"loss": 1.4337,
"step": 4190
},
{
"epoch": 0.9955555555555555,
"grad_norm": 0.4990881085395813,
"learning_rate": 2.4206562668101127e-07,
"loss": 1.3851,
"step": 4200
},
{
"epoch": 0.9979259259259259,
"grad_norm": 0.49343588948249817,
"learning_rate": 1.0758472296933835e-07,
"loss": 1.5421,
"step": 4210
}
],
"logging_steps": 10,
"max_steps": 4218,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.660058442305372e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}