{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998222222222222, "eval_steps": 5000, "global_step": 4218, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023703703703703703, "grad_norm": 0.542885959148407, "learning_rate": 1.0000000000000002e-06, "loss": 1.4997, "step": 10 }, { "epoch": 0.004740740740740741, "grad_norm": 0.44538265466690063, "learning_rate": 2.0000000000000003e-06, "loss": 1.4925, "step": 20 }, { "epoch": 0.0071111111111111115, "grad_norm": 0.4945567548274994, "learning_rate": 3e-06, "loss": 1.4985, "step": 30 }, { "epoch": 0.009481481481481481, "grad_norm": 0.4560663402080536, "learning_rate": 4.000000000000001e-06, "loss": 1.4031, "step": 40 }, { "epoch": 0.011851851851851851, "grad_norm": 0.4691298007965088, "learning_rate": 5e-06, "loss": 1.4175, "step": 50 }, { "epoch": 0.014222222222222223, "grad_norm": 0.44202300906181335, "learning_rate": 6e-06, "loss": 1.4337, "step": 60 }, { "epoch": 0.016592592592592593, "grad_norm": 0.5069476366043091, "learning_rate": 7.000000000000001e-06, "loss": 1.4629, "step": 70 }, { "epoch": 0.018962962962962963, "grad_norm": 0.4806945025920868, "learning_rate": 8.000000000000001e-06, "loss": 1.5706, "step": 80 }, { "epoch": 0.021333333333333333, "grad_norm": 0.5269841551780701, "learning_rate": 9e-06, "loss": 1.5625, "step": 90 }, { "epoch": 0.023703703703703703, "grad_norm": 0.37831586599349976, "learning_rate": 1e-05, "loss": 1.4083, "step": 100 }, { "epoch": 0.026074074074074072, "grad_norm": 0.442981094121933, "learning_rate": 1.1000000000000001e-05, "loss": 1.3799, "step": 110 }, { "epoch": 0.028444444444444446, "grad_norm": 0.47675761580467224, "learning_rate": 1.2e-05, "loss": 1.5356, "step": 120 }, { "epoch": 0.030814814814814816, "grad_norm": 0.5033993721008301, "learning_rate": 1.3000000000000001e-05, "loss": 1.504, "step": 130 }, { "epoch": 0.033185185185185186, "grad_norm": 0.4628155827522278, "learning_rate": 1.4000000000000001e-05, "loss": 1.3474, "step": 140 }, { "epoch": 0.035555555555555556, "grad_norm": 0.41637757420539856, "learning_rate": 1.5e-05, "loss": 1.4352, "step": 150 }, { "epoch": 0.037925925925925925, "grad_norm": 0.5029244422912598, "learning_rate": 1.6000000000000003e-05, "loss": 1.3224, "step": 160 }, { "epoch": 0.040296296296296295, "grad_norm": 0.6434731483459473, "learning_rate": 1.7000000000000003e-05, "loss": 1.5611, "step": 170 }, { "epoch": 0.042666666666666665, "grad_norm": 0.42424359917640686, "learning_rate": 1.8e-05, "loss": 1.4191, "step": 180 }, { "epoch": 0.045037037037037035, "grad_norm": 0.4729703962802887, "learning_rate": 1.9e-05, "loss": 1.3284, "step": 190 }, { "epoch": 0.047407407407407405, "grad_norm": 0.48806190490722656, "learning_rate": 2e-05, "loss": 1.48, "step": 200 }, { "epoch": 0.049777777777777775, "grad_norm": 0.4987320303916931, "learning_rate": 2.1e-05, "loss": 1.4535, "step": 210 }, { "epoch": 0.052148148148148145, "grad_norm": 0.46912866830825806, "learning_rate": 2.2000000000000003e-05, "loss": 1.4846, "step": 220 }, { "epoch": 0.05451851851851852, "grad_norm": 0.4369196593761444, "learning_rate": 2.3000000000000003e-05, "loss": 1.5564, "step": 230 }, { "epoch": 0.05688888888888889, "grad_norm": 0.48074963688850403, "learning_rate": 2.4e-05, "loss": 1.3142, "step": 240 }, { "epoch": 0.05925925925925926, "grad_norm": 0.450253427028656, "learning_rate": 2.5e-05, "loss": 1.3877, "step": 250 }, { "epoch": 0.06162962962962963, "grad_norm": 0.4517356753349304, "learning_rate": 2.6000000000000002e-05, "loss": 1.3969, "step": 260 }, { "epoch": 0.064, "grad_norm": 0.47781577706336975, "learning_rate": 2.7000000000000002e-05, "loss": 1.5352, "step": 270 }, { "epoch": 0.06637037037037037, "grad_norm": 0.5579633712768555, "learning_rate": 2.8000000000000003e-05, "loss": 1.5436, "step": 280 }, { "epoch": 0.06874074074074074, "grad_norm": 0.4838034510612488, "learning_rate": 2.9e-05, "loss": 1.3564, "step": 290 }, { "epoch": 0.07111111111111111, "grad_norm": 0.5685828328132629, "learning_rate": 3e-05, "loss": 1.4429, "step": 300 }, { "epoch": 0.07348148148148148, "grad_norm": 0.5230541229248047, "learning_rate": 3.1e-05, "loss": 1.3933, "step": 310 }, { "epoch": 0.07585185185185185, "grad_norm": 0.45525529980659485, "learning_rate": 3.2000000000000005e-05, "loss": 1.4224, "step": 320 }, { "epoch": 0.07822222222222222, "grad_norm": 0.47926583886146545, "learning_rate": 3.3e-05, "loss": 1.4466, "step": 330 }, { "epoch": 0.08059259259259259, "grad_norm": 0.38689500093460083, "learning_rate": 3.4000000000000007e-05, "loss": 1.5105, "step": 340 }, { "epoch": 0.08296296296296296, "grad_norm": 0.4488411843776703, "learning_rate": 3.5e-05, "loss": 1.3392, "step": 350 }, { "epoch": 0.08533333333333333, "grad_norm": 0.5381152033805847, "learning_rate": 3.6e-05, "loss": 1.375, "step": 360 }, { "epoch": 0.0877037037037037, "grad_norm": 0.5788478255271912, "learning_rate": 3.7e-05, "loss": 1.3522, "step": 370 }, { "epoch": 0.09007407407407407, "grad_norm": 0.501133143901825, "learning_rate": 3.8e-05, "loss": 1.3294, "step": 380 }, { "epoch": 0.09244444444444444, "grad_norm": 0.5300689935684204, "learning_rate": 3.9000000000000006e-05, "loss": 1.5623, "step": 390 }, { "epoch": 0.09481481481481481, "grad_norm": 0.5409078001976013, "learning_rate": 4e-05, "loss": 1.4098, "step": 400 }, { "epoch": 0.09718518518518518, "grad_norm": 0.5598166584968567, "learning_rate": 4.1e-05, "loss": 1.4104, "step": 410 }, { "epoch": 0.09955555555555555, "grad_norm": 0.5656659603118896, "learning_rate": 4.2e-05, "loss": 1.3782, "step": 420 }, { "epoch": 0.10192592592592592, "grad_norm": 0.5094364881515503, "learning_rate": 4.3e-05, "loss": 1.6233, "step": 430 }, { "epoch": 0.10429629629629629, "grad_norm": 0.5540050268173218, "learning_rate": 4.4000000000000006e-05, "loss": 1.512, "step": 440 }, { "epoch": 0.10666666666666667, "grad_norm": 0.5693063139915466, "learning_rate": 4.5e-05, "loss": 1.641, "step": 450 }, { "epoch": 0.10903703703703704, "grad_norm": 0.5556958913803101, "learning_rate": 4.600000000000001e-05, "loss": 1.4532, "step": 460 }, { "epoch": 0.11140740740740741, "grad_norm": 0.5052928924560547, "learning_rate": 4.7e-05, "loss": 1.4863, "step": 470 }, { "epoch": 0.11377777777777778, "grad_norm": 0.5321051478385925, "learning_rate": 4.8e-05, "loss": 1.353, "step": 480 }, { "epoch": 0.11614814814814815, "grad_norm": 0.658074140548706, "learning_rate": 4.9e-05, "loss": 1.563, "step": 490 }, { "epoch": 0.11851851851851852, "grad_norm": 0.4817732870578766, "learning_rate": 5e-05, "loss": 1.5347, "step": 500 }, { "epoch": 0.12088888888888889, "grad_norm": 0.6876205801963806, "learning_rate": 4.9865519096288324e-05, "loss": 1.6524, "step": 510 }, { "epoch": 0.12325925925925926, "grad_norm": 0.5238626003265381, "learning_rate": 4.973103819257665e-05, "loss": 1.5766, "step": 520 }, { "epoch": 0.12562962962962962, "grad_norm": 0.4588116705417633, "learning_rate": 4.959655728886498e-05, "loss": 1.4258, "step": 530 }, { "epoch": 0.128, "grad_norm": 0.529692530632019, "learning_rate": 4.946207638515331e-05, "loss": 1.5574, "step": 540 }, { "epoch": 0.13037037037037036, "grad_norm": 0.475524365901947, "learning_rate": 4.932759548144163e-05, "loss": 1.377, "step": 550 }, { "epoch": 0.13274074074074074, "grad_norm": 0.48722413182258606, "learning_rate": 4.919311457772996e-05, "loss": 1.3156, "step": 560 }, { "epoch": 0.1351111111111111, "grad_norm": 0.6309683322906494, "learning_rate": 4.905863367401829e-05, "loss": 1.5581, "step": 570 }, { "epoch": 0.13748148148148148, "grad_norm": 0.5029247999191284, "learning_rate": 4.892415277030662e-05, "loss": 1.5968, "step": 580 }, { "epoch": 0.13985185185185184, "grad_norm": 0.421310156583786, "learning_rate": 4.878967186659494e-05, "loss": 1.4881, "step": 590 }, { "epoch": 0.14222222222222222, "grad_norm": 0.5082572102546692, "learning_rate": 4.865519096288327e-05, "loss": 1.6574, "step": 600 }, { "epoch": 0.1445925925925926, "grad_norm": 0.5082793235778809, "learning_rate": 4.85207100591716e-05, "loss": 1.3606, "step": 610 }, { "epoch": 0.14696296296296296, "grad_norm": 0.5169036388397217, "learning_rate": 4.838622915545993e-05, "loss": 1.4457, "step": 620 }, { "epoch": 0.14933333333333335, "grad_norm": 0.5509771704673767, "learning_rate": 4.825174825174825e-05, "loss": 1.4754, "step": 630 }, { "epoch": 0.1517037037037037, "grad_norm": 0.630851149559021, "learning_rate": 4.811726734803658e-05, "loss": 1.5453, "step": 640 }, { "epoch": 0.15407407407407409, "grad_norm": 0.5343595147132874, "learning_rate": 4.798278644432491e-05, "loss": 1.4822, "step": 650 }, { "epoch": 0.15644444444444444, "grad_norm": 0.5070016980171204, "learning_rate": 4.7848305540613237e-05, "loss": 1.3835, "step": 660 }, { "epoch": 0.15881481481481483, "grad_norm": 0.6097332835197449, "learning_rate": 4.771382463690156e-05, "loss": 1.546, "step": 670 }, { "epoch": 0.16118518518518518, "grad_norm": 0.5894319415092468, "learning_rate": 4.757934373318989e-05, "loss": 1.3605, "step": 680 }, { "epoch": 0.16355555555555557, "grad_norm": 0.4879942238330841, "learning_rate": 4.7444862829478216e-05, "loss": 1.4391, "step": 690 }, { "epoch": 0.16592592592592592, "grad_norm": 0.49390801787376404, "learning_rate": 4.7310381925766545e-05, "loss": 1.2228, "step": 700 }, { "epoch": 0.1682962962962963, "grad_norm": 0.6193021535873413, "learning_rate": 4.717590102205487e-05, "loss": 1.7474, "step": 710 }, { "epoch": 0.17066666666666666, "grad_norm": 0.4410654604434967, "learning_rate": 4.7041420118343196e-05, "loss": 1.4255, "step": 720 }, { "epoch": 0.17303703703703704, "grad_norm": 0.5690642595291138, "learning_rate": 4.6906939214631525e-05, "loss": 1.4252, "step": 730 }, { "epoch": 0.1754074074074074, "grad_norm": 0.4561966359615326, "learning_rate": 4.6772458310919854e-05, "loss": 1.4214, "step": 740 }, { "epoch": 0.17777777777777778, "grad_norm": 0.5871363282203674, "learning_rate": 4.6637977407208176e-05, "loss": 1.5272, "step": 750 }, { "epoch": 0.18014814814814814, "grad_norm": 0.5293774604797363, "learning_rate": 4.6503496503496505e-05, "loss": 1.5507, "step": 760 }, { "epoch": 0.18251851851851852, "grad_norm": 0.483826220035553, "learning_rate": 4.636901559978483e-05, "loss": 1.4058, "step": 770 }, { "epoch": 0.18488888888888888, "grad_norm": 0.551902174949646, "learning_rate": 4.623453469607316e-05, "loss": 1.3707, "step": 780 }, { "epoch": 0.18725925925925926, "grad_norm": 0.5492023825645447, "learning_rate": 4.6100053792361484e-05, "loss": 1.4901, "step": 790 }, { "epoch": 0.18962962962962962, "grad_norm": 0.5409772396087646, "learning_rate": 4.596557288864981e-05, "loss": 1.6356, "step": 800 }, { "epoch": 0.192, "grad_norm": 0.5469648838043213, "learning_rate": 4.583109198493814e-05, "loss": 1.4574, "step": 810 }, { "epoch": 0.19437037037037036, "grad_norm": 0.5523713827133179, "learning_rate": 4.569661108122647e-05, "loss": 1.5304, "step": 820 }, { "epoch": 0.19674074074074074, "grad_norm": 0.4884456992149353, "learning_rate": 4.556213017751479e-05, "loss": 1.4173, "step": 830 }, { "epoch": 0.1991111111111111, "grad_norm": 0.5865374803543091, "learning_rate": 4.542764927380312e-05, "loss": 1.4178, "step": 840 }, { "epoch": 0.20148148148148148, "grad_norm": 0.5571750402450562, "learning_rate": 4.529316837009145e-05, "loss": 1.5356, "step": 850 }, { "epoch": 0.20385185185185184, "grad_norm": 0.567616879940033, "learning_rate": 4.515868746637978e-05, "loss": 1.4496, "step": 860 }, { "epoch": 0.20622222222222222, "grad_norm": 0.5077497959136963, "learning_rate": 4.50242065626681e-05, "loss": 1.4757, "step": 870 }, { "epoch": 0.20859259259259258, "grad_norm": 0.5118802189826965, "learning_rate": 4.488972565895643e-05, "loss": 1.3845, "step": 880 }, { "epoch": 0.21096296296296296, "grad_norm": 0.43292248249053955, "learning_rate": 4.475524475524476e-05, "loss": 1.421, "step": 890 }, { "epoch": 0.21333333333333335, "grad_norm": 0.5365243554115295, "learning_rate": 4.462076385153308e-05, "loss": 1.4586, "step": 900 }, { "epoch": 0.2157037037037037, "grad_norm": 0.4912022054195404, "learning_rate": 4.448628294782141e-05, "loss": 1.5385, "step": 910 }, { "epoch": 0.2180740740740741, "grad_norm": 0.5855193734169006, "learning_rate": 4.435180204410974e-05, "loss": 1.5718, "step": 920 }, { "epoch": 0.22044444444444444, "grad_norm": 0.5224360227584839, "learning_rate": 4.421732114039807e-05, "loss": 1.3853, "step": 930 }, { "epoch": 0.22281481481481483, "grad_norm": 0.4283509850502014, "learning_rate": 4.408284023668639e-05, "loss": 1.3758, "step": 940 }, { "epoch": 0.22518518518518518, "grad_norm": 0.44806018471717834, "learning_rate": 4.394835933297472e-05, "loss": 1.4089, "step": 950 }, { "epoch": 0.22755555555555557, "grad_norm": 0.4234403967857361, "learning_rate": 4.381387842926305e-05, "loss": 1.321, "step": 960 }, { "epoch": 0.22992592592592592, "grad_norm": 0.45570847392082214, "learning_rate": 4.3679397525551376e-05, "loss": 1.384, "step": 970 }, { "epoch": 0.2322962962962963, "grad_norm": 0.6098482608795166, "learning_rate": 4.35449166218397e-05, "loss": 1.5165, "step": 980 }, { "epoch": 0.23466666666666666, "grad_norm": 0.47981974482536316, "learning_rate": 4.341043571812803e-05, "loss": 1.4827, "step": 990 }, { "epoch": 0.23703703703703705, "grad_norm": 0.567845344543457, "learning_rate": 4.3275954814416356e-05, "loss": 1.4494, "step": 1000 }, { "epoch": 0.2394074074074074, "grad_norm": 0.5508958697319031, "learning_rate": 4.3141473910704685e-05, "loss": 1.5681, "step": 1010 }, { "epoch": 0.24177777777777779, "grad_norm": 0.6119508743286133, "learning_rate": 4.300699300699301e-05, "loss": 1.4522, "step": 1020 }, { "epoch": 0.24414814814814814, "grad_norm": 0.654909074306488, "learning_rate": 4.2872512103281336e-05, "loss": 1.6317, "step": 1030 }, { "epoch": 0.24651851851851853, "grad_norm": 0.5818801522254944, "learning_rate": 4.2738031199569664e-05, "loss": 1.4917, "step": 1040 }, { "epoch": 0.24888888888888888, "grad_norm": 0.5295186638832092, "learning_rate": 4.260355029585799e-05, "loss": 1.6393, "step": 1050 }, { "epoch": 0.25125925925925924, "grad_norm": 0.5558478832244873, "learning_rate": 4.2469069392146315e-05, "loss": 1.5607, "step": 1060 }, { "epoch": 0.25362962962962965, "grad_norm": 0.5266067385673523, "learning_rate": 4.2334588488434644e-05, "loss": 1.4366, "step": 1070 }, { "epoch": 0.256, "grad_norm": 0.4949641823768616, "learning_rate": 4.220010758472297e-05, "loss": 1.4392, "step": 1080 }, { "epoch": 0.25837037037037036, "grad_norm": 0.48148399591445923, "learning_rate": 4.20656266810113e-05, "loss": 1.5301, "step": 1090 }, { "epoch": 0.2607407407407407, "grad_norm": 0.5564059615135193, "learning_rate": 4.1931145777299624e-05, "loss": 1.3645, "step": 1100 }, { "epoch": 0.26311111111111113, "grad_norm": 0.6419994235038757, "learning_rate": 4.179666487358795e-05, "loss": 1.4543, "step": 1110 }, { "epoch": 0.2654814814814815, "grad_norm": 0.5205827355384827, "learning_rate": 4.166218396987628e-05, "loss": 1.5358, "step": 1120 }, { "epoch": 0.26785185185185184, "grad_norm": 0.45430988073349, "learning_rate": 4.152770306616461e-05, "loss": 1.5483, "step": 1130 }, { "epoch": 0.2702222222222222, "grad_norm": 0.5467645525932312, "learning_rate": 4.139322216245293e-05, "loss": 1.4702, "step": 1140 }, { "epoch": 0.2725925925925926, "grad_norm": 0.47259363532066345, "learning_rate": 4.125874125874126e-05, "loss": 1.451, "step": 1150 }, { "epoch": 0.27496296296296296, "grad_norm": 0.48951438069343567, "learning_rate": 4.112426035502959e-05, "loss": 1.4095, "step": 1160 }, { "epoch": 0.2773333333333333, "grad_norm": 0.601701021194458, "learning_rate": 4.098977945131792e-05, "loss": 1.6848, "step": 1170 }, { "epoch": 0.2797037037037037, "grad_norm": 0.5379857420921326, "learning_rate": 4.085529854760624e-05, "loss": 1.3598, "step": 1180 }, { "epoch": 0.2820740740740741, "grad_norm": 0.6498066186904907, "learning_rate": 4.072081764389457e-05, "loss": 1.4655, "step": 1190 }, { "epoch": 0.28444444444444444, "grad_norm": 0.5294344425201416, "learning_rate": 4.05863367401829e-05, "loss": 1.3851, "step": 1200 }, { "epoch": 0.2868148148148148, "grad_norm": 0.5410310626029968, "learning_rate": 4.045185583647123e-05, "loss": 1.5091, "step": 1210 }, { "epoch": 0.2891851851851852, "grad_norm": 0.5395278334617615, "learning_rate": 4.031737493275955e-05, "loss": 1.3487, "step": 1220 }, { "epoch": 0.29155555555555557, "grad_norm": 0.637909471988678, "learning_rate": 4.018289402904788e-05, "loss": 1.5848, "step": 1230 }, { "epoch": 0.2939259259259259, "grad_norm": 0.4254130721092224, "learning_rate": 4.004841312533621e-05, "loss": 1.4186, "step": 1240 }, { "epoch": 0.2962962962962963, "grad_norm": 0.5799821019172668, "learning_rate": 3.9913932221624536e-05, "loss": 1.4289, "step": 1250 }, { "epoch": 0.2986666666666667, "grad_norm": 0.43250229954719543, "learning_rate": 3.977945131791286e-05, "loss": 1.5458, "step": 1260 }, { "epoch": 0.30103703703703705, "grad_norm": 0.4722803831100464, "learning_rate": 3.964497041420119e-05, "loss": 1.4716, "step": 1270 }, { "epoch": 0.3034074074074074, "grad_norm": 0.5686700940132141, "learning_rate": 3.9510489510489516e-05, "loss": 1.6452, "step": 1280 }, { "epoch": 0.30577777777777776, "grad_norm": 0.493028461933136, "learning_rate": 3.9376008606777844e-05, "loss": 1.4516, "step": 1290 }, { "epoch": 0.30814814814814817, "grad_norm": 0.5100602507591248, "learning_rate": 3.9241527703066166e-05, "loss": 1.3366, "step": 1300 }, { "epoch": 0.3105185185185185, "grad_norm": 0.6535771489143372, "learning_rate": 3.910704679935449e-05, "loss": 1.4312, "step": 1310 }, { "epoch": 0.3128888888888889, "grad_norm": 0.48823079466819763, "learning_rate": 3.8972565895642824e-05, "loss": 1.4888, "step": 1320 }, { "epoch": 0.31525925925925924, "grad_norm": 0.4459994435310364, "learning_rate": 3.8838084991931146e-05, "loss": 1.4637, "step": 1330 }, { "epoch": 0.31762962962962965, "grad_norm": 0.5344628691673279, "learning_rate": 3.8703604088219475e-05, "loss": 1.6118, "step": 1340 }, { "epoch": 0.32, "grad_norm": 0.44893643260002136, "learning_rate": 3.85691231845078e-05, "loss": 1.4521, "step": 1350 }, { "epoch": 0.32237037037037036, "grad_norm": 0.4381811022758484, "learning_rate": 3.8434642280796126e-05, "loss": 1.5751, "step": 1360 }, { "epoch": 0.3247407407407407, "grad_norm": 0.5791207551956177, "learning_rate": 3.8300161377084455e-05, "loss": 1.5563, "step": 1370 }, { "epoch": 0.32711111111111113, "grad_norm": 0.5584151148796082, "learning_rate": 3.8165680473372784e-05, "loss": 1.3673, "step": 1380 }, { "epoch": 0.3294814814814815, "grad_norm": 0.6110686659812927, "learning_rate": 3.8031199569661106e-05, "loss": 1.5949, "step": 1390 }, { "epoch": 0.33185185185185184, "grad_norm": 0.5519852638244629, "learning_rate": 3.7896718665949434e-05, "loss": 1.5133, "step": 1400 }, { "epoch": 0.3342222222222222, "grad_norm": 0.5732788443565369, "learning_rate": 3.776223776223776e-05, "loss": 1.4601, "step": 1410 }, { "epoch": 0.3365925925925926, "grad_norm": 0.5789920091629028, "learning_rate": 3.762775685852609e-05, "loss": 1.6461, "step": 1420 }, { "epoch": 0.33896296296296297, "grad_norm": 0.592776358127594, "learning_rate": 3.7493275954814414e-05, "loss": 1.5558, "step": 1430 }, { "epoch": 0.3413333333333333, "grad_norm": 0.5435842871665955, "learning_rate": 3.735879505110274e-05, "loss": 1.5095, "step": 1440 }, { "epoch": 0.3437037037037037, "grad_norm": 0.6474444270133972, "learning_rate": 3.722431414739107e-05, "loss": 1.429, "step": 1450 }, { "epoch": 0.3460740740740741, "grad_norm": 0.4926964342594147, "learning_rate": 3.70898332436794e-05, "loss": 1.4543, "step": 1460 }, { "epoch": 0.34844444444444445, "grad_norm": 0.5748719573020935, "learning_rate": 3.695535233996772e-05, "loss": 1.5844, "step": 1470 }, { "epoch": 0.3508148148148148, "grad_norm": 0.5535377264022827, "learning_rate": 3.682087143625605e-05, "loss": 1.5348, "step": 1480 }, { "epoch": 0.35318518518518516, "grad_norm": 0.4644632339477539, "learning_rate": 3.668639053254438e-05, "loss": 1.3157, "step": 1490 }, { "epoch": 0.35555555555555557, "grad_norm": 0.5858569145202637, "learning_rate": 3.655190962883271e-05, "loss": 1.4678, "step": 1500 }, { "epoch": 0.3579259259259259, "grad_norm": 0.5428529381752014, "learning_rate": 3.641742872512103e-05, "loss": 1.5561, "step": 1510 }, { "epoch": 0.3602962962962963, "grad_norm": 0.5255948901176453, "learning_rate": 3.628294782140936e-05, "loss": 1.5802, "step": 1520 }, { "epoch": 0.3626666666666667, "grad_norm": 0.534137487411499, "learning_rate": 3.614846691769769e-05, "loss": 1.5142, "step": 1530 }, { "epoch": 0.36503703703703705, "grad_norm": 0.5558648705482483, "learning_rate": 3.601398601398602e-05, "loss": 1.5613, "step": 1540 }, { "epoch": 0.3674074074074074, "grad_norm": 0.5890410542488098, "learning_rate": 3.587950511027434e-05, "loss": 1.4679, "step": 1550 }, { "epoch": 0.36977777777777776, "grad_norm": 0.4830753803253174, "learning_rate": 3.574502420656267e-05, "loss": 1.4698, "step": 1560 }, { "epoch": 0.3721481481481482, "grad_norm": 0.5191047787666321, "learning_rate": 3.5610543302851e-05, "loss": 1.528, "step": 1570 }, { "epoch": 0.37451851851851853, "grad_norm": 0.5316727161407471, "learning_rate": 3.5476062399139326e-05, "loss": 1.4427, "step": 1580 }, { "epoch": 0.3768888888888889, "grad_norm": 0.553815484046936, "learning_rate": 3.534158149542765e-05, "loss": 1.548, "step": 1590 }, { "epoch": 0.37925925925925924, "grad_norm": 0.47779569029808044, "learning_rate": 3.520710059171598e-05, "loss": 1.4533, "step": 1600 }, { "epoch": 0.38162962962962965, "grad_norm": 0.5595371127128601, "learning_rate": 3.5072619688004306e-05, "loss": 1.4503, "step": 1610 }, { "epoch": 0.384, "grad_norm": 0.5166143774986267, "learning_rate": 3.4938138784292635e-05, "loss": 1.3783, "step": 1620 }, { "epoch": 0.38637037037037036, "grad_norm": 0.6249716877937317, "learning_rate": 3.480365788058096e-05, "loss": 1.4494, "step": 1630 }, { "epoch": 0.3887407407407407, "grad_norm": 0.484937846660614, "learning_rate": 3.4669176976869286e-05, "loss": 1.421, "step": 1640 }, { "epoch": 0.39111111111111113, "grad_norm": 0.5464750528335571, "learning_rate": 3.4534696073157615e-05, "loss": 1.266, "step": 1650 }, { "epoch": 0.3934814814814815, "grad_norm": 0.48874956369400024, "learning_rate": 3.440021516944594e-05, "loss": 1.5355, "step": 1660 }, { "epoch": 0.39585185185185184, "grad_norm": 0.47555652260780334, "learning_rate": 3.4265734265734265e-05, "loss": 1.5799, "step": 1670 }, { "epoch": 0.3982222222222222, "grad_norm": 0.49769505858421326, "learning_rate": 3.4131253362022594e-05, "loss": 1.3748, "step": 1680 }, { "epoch": 0.4005925925925926, "grad_norm": 0.4664982259273529, "learning_rate": 3.399677245831092e-05, "loss": 1.4894, "step": 1690 }, { "epoch": 0.40296296296296297, "grad_norm": 0.5216518044471741, "learning_rate": 3.3862291554599245e-05, "loss": 1.4645, "step": 1700 }, { "epoch": 0.4053333333333333, "grad_norm": 0.6157680749893188, "learning_rate": 3.3727810650887574e-05, "loss": 1.4002, "step": 1710 }, { "epoch": 0.4077037037037037, "grad_norm": 0.5828937888145447, "learning_rate": 3.35933297471759e-05, "loss": 1.4816, "step": 1720 }, { "epoch": 0.4100740740740741, "grad_norm": 0.5792407989501953, "learning_rate": 3.345884884346423e-05, "loss": 1.3557, "step": 1730 }, { "epoch": 0.41244444444444445, "grad_norm": 0.4985092580318451, "learning_rate": 3.3324367939752554e-05, "loss": 1.4572, "step": 1740 }, { "epoch": 0.4148148148148148, "grad_norm": 0.5901199579238892, "learning_rate": 3.318988703604088e-05, "loss": 1.5292, "step": 1750 }, { "epoch": 0.41718518518518516, "grad_norm": 0.5087295174598694, "learning_rate": 3.305540613232921e-05, "loss": 1.3405, "step": 1760 }, { "epoch": 0.41955555555555557, "grad_norm": 0.5455463528633118, "learning_rate": 3.292092522861754e-05, "loss": 1.4262, "step": 1770 }, { "epoch": 0.4219259259259259, "grad_norm": 0.46563345193862915, "learning_rate": 3.278644432490586e-05, "loss": 1.4328, "step": 1780 }, { "epoch": 0.4242962962962963, "grad_norm": 0.545524537563324, "learning_rate": 3.265196342119419e-05, "loss": 1.4826, "step": 1790 }, { "epoch": 0.4266666666666667, "grad_norm": 0.4182009994983673, "learning_rate": 3.251748251748252e-05, "loss": 1.4909, "step": 1800 }, { "epoch": 0.42903703703703705, "grad_norm": 0.39127054810523987, "learning_rate": 3.238300161377085e-05, "loss": 1.4389, "step": 1810 }, { "epoch": 0.4314074074074074, "grad_norm": 0.46866652369499207, "learning_rate": 3.224852071005917e-05, "loss": 1.3992, "step": 1820 }, { "epoch": 0.43377777777777776, "grad_norm": 0.5216823816299438, "learning_rate": 3.21140398063475e-05, "loss": 1.3525, "step": 1830 }, { "epoch": 0.4361481481481482, "grad_norm": 0.49909713864326477, "learning_rate": 3.197955890263583e-05, "loss": 1.5491, "step": 1840 }, { "epoch": 0.43851851851851853, "grad_norm": 0.4957892596721649, "learning_rate": 3.184507799892416e-05, "loss": 1.4723, "step": 1850 }, { "epoch": 0.4408888888888889, "grad_norm": 0.518822431564331, "learning_rate": 3.171059709521248e-05, "loss": 1.535, "step": 1860 }, { "epoch": 0.44325925925925924, "grad_norm": 0.6380564570426941, "learning_rate": 3.157611619150081e-05, "loss": 1.5652, "step": 1870 }, { "epoch": 0.44562962962962965, "grad_norm": 0.49906617403030396, "learning_rate": 3.144163528778914e-05, "loss": 1.3624, "step": 1880 }, { "epoch": 0.448, "grad_norm": 0.5234742760658264, "learning_rate": 3.1307154384077466e-05, "loss": 1.4192, "step": 1890 }, { "epoch": 0.45037037037037037, "grad_norm": 0.5430870056152344, "learning_rate": 3.117267348036579e-05, "loss": 1.4674, "step": 1900 }, { "epoch": 0.4527407407407407, "grad_norm": 0.5488291382789612, "learning_rate": 3.103819257665412e-05, "loss": 1.4759, "step": 1910 }, { "epoch": 0.45511111111111113, "grad_norm": 0.4655541181564331, "learning_rate": 3.0903711672942446e-05, "loss": 1.4559, "step": 1920 }, { "epoch": 0.4574814814814815, "grad_norm": 0.442128449678421, "learning_rate": 3.0769230769230774e-05, "loss": 1.3129, "step": 1930 }, { "epoch": 0.45985185185185184, "grad_norm": 0.5909174084663391, "learning_rate": 3.0634749865519096e-05, "loss": 1.5915, "step": 1940 }, { "epoch": 0.4622222222222222, "grad_norm": 0.41102078557014465, "learning_rate": 3.0500268961807425e-05, "loss": 1.3717, "step": 1950 }, { "epoch": 0.4645925925925926, "grad_norm": 0.5467662811279297, "learning_rate": 3.036578805809575e-05, "loss": 1.4838, "step": 1960 }, { "epoch": 0.46696296296296297, "grad_norm": 0.6555057764053345, "learning_rate": 3.023130715438408e-05, "loss": 1.4289, "step": 1970 }, { "epoch": 0.4693333333333333, "grad_norm": 0.4430755078792572, "learning_rate": 3.0096826250672405e-05, "loss": 1.5556, "step": 1980 }, { "epoch": 0.4717037037037037, "grad_norm": 0.48016276955604553, "learning_rate": 2.9962345346960734e-05, "loss": 1.5461, "step": 1990 }, { "epoch": 0.4740740740740741, "grad_norm": 0.5283887982368469, "learning_rate": 2.982786444324906e-05, "loss": 1.4598, "step": 2000 }, { "epoch": 0.47644444444444445, "grad_norm": 0.5336430668830872, "learning_rate": 2.9693383539537388e-05, "loss": 1.5837, "step": 2010 }, { "epoch": 0.4788148148148148, "grad_norm": 0.39814135432243347, "learning_rate": 2.9558902635825713e-05, "loss": 1.3496, "step": 2020 }, { "epoch": 0.48118518518518516, "grad_norm": 0.6095125079154968, "learning_rate": 2.9424421732114042e-05, "loss": 1.5925, "step": 2030 }, { "epoch": 0.48355555555555557, "grad_norm": 0.5880560874938965, "learning_rate": 2.9289940828402368e-05, "loss": 1.3603, "step": 2040 }, { "epoch": 0.48592592592592593, "grad_norm": 0.5470516085624695, "learning_rate": 2.9155459924690697e-05, "loss": 1.5022, "step": 2050 }, { "epoch": 0.4882962962962963, "grad_norm": 0.45742228627204895, "learning_rate": 2.9020979020979022e-05, "loss": 1.4699, "step": 2060 }, { "epoch": 0.49066666666666664, "grad_norm": 0.5314275622367859, "learning_rate": 2.888649811726735e-05, "loss": 1.5043, "step": 2070 }, { "epoch": 0.49303703703703705, "grad_norm": 0.5969755053520203, "learning_rate": 2.8752017213555676e-05, "loss": 1.4709, "step": 2080 }, { "epoch": 0.4954074074074074, "grad_norm": 0.5115885138511658, "learning_rate": 2.8617536309844002e-05, "loss": 1.4031, "step": 2090 }, { "epoch": 0.49777777777777776, "grad_norm": 0.5907914042472839, "learning_rate": 2.848305540613233e-05, "loss": 1.3509, "step": 2100 }, { "epoch": 0.5001481481481481, "grad_norm": 0.48430949449539185, "learning_rate": 2.8348574502420656e-05, "loss": 1.4393, "step": 2110 }, { "epoch": 0.5025185185185185, "grad_norm": 0.5502893328666687, "learning_rate": 2.8214093598708985e-05, "loss": 1.5571, "step": 2120 }, { "epoch": 0.5048888888888889, "grad_norm": 0.48268720507621765, "learning_rate": 2.807961269499731e-05, "loss": 1.3895, "step": 2130 }, { "epoch": 0.5072592592592593, "grad_norm": 0.6141895651817322, "learning_rate": 2.794513179128564e-05, "loss": 1.5278, "step": 2140 }, { "epoch": 0.5096296296296297, "grad_norm": 0.48447638750076294, "learning_rate": 2.7810650887573965e-05, "loss": 1.456, "step": 2150 }, { "epoch": 0.512, "grad_norm": 0.4536721408367157, "learning_rate": 2.7676169983862293e-05, "loss": 1.4259, "step": 2160 }, { "epoch": 0.5143703703703704, "grad_norm": 0.5519189238548279, "learning_rate": 2.754168908015062e-05, "loss": 1.508, "step": 2170 }, { "epoch": 0.5167407407407407, "grad_norm": 0.4641801416873932, "learning_rate": 2.7407208176438948e-05, "loss": 1.5087, "step": 2180 }, { "epoch": 0.5191111111111111, "grad_norm": 0.5566359162330627, "learning_rate": 2.7272727272727273e-05, "loss": 1.4994, "step": 2190 }, { "epoch": 0.5214814814814814, "grad_norm": 0.5316601991653442, "learning_rate": 2.7138246369015602e-05, "loss": 1.375, "step": 2200 }, { "epoch": 0.5238518518518519, "grad_norm": 0.545514702796936, "learning_rate": 2.7003765465303927e-05, "loss": 1.4449, "step": 2210 }, { "epoch": 0.5262222222222223, "grad_norm": 0.5452851057052612, "learning_rate": 2.6869284561592256e-05, "loss": 1.639, "step": 2220 }, { "epoch": 0.5285925925925926, "grad_norm": 0.5291896462440491, "learning_rate": 2.673480365788058e-05, "loss": 1.3638, "step": 2230 }, { "epoch": 0.530962962962963, "grad_norm": 0.4708302319049835, "learning_rate": 2.660032275416891e-05, "loss": 1.1973, "step": 2240 }, { "epoch": 0.5333333333333333, "grad_norm": 0.4936722218990326, "learning_rate": 2.6465841850457236e-05, "loss": 1.543, "step": 2250 }, { "epoch": 0.5357037037037037, "grad_norm": 0.5722488760948181, "learning_rate": 2.6331360946745565e-05, "loss": 1.531, "step": 2260 }, { "epoch": 0.538074074074074, "grad_norm": 0.5386027097702026, "learning_rate": 2.619688004303389e-05, "loss": 1.4335, "step": 2270 }, { "epoch": 0.5404444444444444, "grad_norm": 0.5803340673446655, "learning_rate": 2.606239913932222e-05, "loss": 1.4049, "step": 2280 }, { "epoch": 0.5428148148148149, "grad_norm": 0.3970150053501129, "learning_rate": 2.5927918235610544e-05, "loss": 1.3923, "step": 2290 }, { "epoch": 0.5451851851851852, "grad_norm": 0.45682525634765625, "learning_rate": 2.5793437331898873e-05, "loss": 1.4838, "step": 2300 }, { "epoch": 0.5475555555555556, "grad_norm": 0.5088069438934326, "learning_rate": 2.56589564281872e-05, "loss": 1.4416, "step": 2310 }, { "epoch": 0.5499259259259259, "grad_norm": 0.5557109713554382, "learning_rate": 2.5524475524475528e-05, "loss": 1.4099, "step": 2320 }, { "epoch": 0.5522962962962963, "grad_norm": 0.4954288601875305, "learning_rate": 2.5389994620763853e-05, "loss": 1.3828, "step": 2330 }, { "epoch": 0.5546666666666666, "grad_norm": 0.5320334434509277, "learning_rate": 2.5255513717052182e-05, "loss": 1.3686, "step": 2340 }, { "epoch": 0.557037037037037, "grad_norm": 0.511646032333374, "learning_rate": 2.5121032813340507e-05, "loss": 1.4752, "step": 2350 }, { "epoch": 0.5594074074074074, "grad_norm": 0.4852311909198761, "learning_rate": 2.4986551909628833e-05, "loss": 1.5352, "step": 2360 }, { "epoch": 0.5617777777777778, "grad_norm": 0.5558280944824219, "learning_rate": 2.485207100591716e-05, "loss": 1.3955, "step": 2370 }, { "epoch": 0.5641481481481482, "grad_norm": 0.5369210243225098, "learning_rate": 2.4717590102205487e-05, "loss": 1.4265, "step": 2380 }, { "epoch": 0.5665185185185185, "grad_norm": 0.5134137868881226, "learning_rate": 2.4583109198493816e-05, "loss": 1.3688, "step": 2390 }, { "epoch": 0.5688888888888889, "grad_norm": 0.47109952569007874, "learning_rate": 2.444862829478214e-05, "loss": 1.4207, "step": 2400 }, { "epoch": 0.5712592592592592, "grad_norm": 0.4982026517391205, "learning_rate": 2.431414739107047e-05, "loss": 1.4249, "step": 2410 }, { "epoch": 0.5736296296296296, "grad_norm": 0.5209967494010925, "learning_rate": 2.4179666487358796e-05, "loss": 1.4708, "step": 2420 }, { "epoch": 0.576, "grad_norm": 0.5762905478477478, "learning_rate": 2.4045185583647124e-05, "loss": 1.4194, "step": 2430 }, { "epoch": 0.5783703703703704, "grad_norm": 0.4918428659439087, "learning_rate": 2.391070467993545e-05, "loss": 1.6279, "step": 2440 }, { "epoch": 0.5807407407407408, "grad_norm": 0.5050658583641052, "learning_rate": 2.377622377622378e-05, "loss": 1.5029, "step": 2450 }, { "epoch": 0.5831111111111111, "grad_norm": 0.49715667963027954, "learning_rate": 2.3641742872512104e-05, "loss": 1.4228, "step": 2460 }, { "epoch": 0.5854814814814815, "grad_norm": 0.4800516664981842, "learning_rate": 2.3507261968800433e-05, "loss": 1.3595, "step": 2470 }, { "epoch": 0.5878518518518518, "grad_norm": 0.5617285966873169, "learning_rate": 2.337278106508876e-05, "loss": 1.5974, "step": 2480 }, { "epoch": 0.5902222222222222, "grad_norm": 0.5133258700370789, "learning_rate": 2.3238300161377087e-05, "loss": 1.4437, "step": 2490 }, { "epoch": 0.5925925925925926, "grad_norm": 0.5644205212593079, "learning_rate": 2.3103819257665413e-05, "loss": 1.4633, "step": 2500 }, { "epoch": 0.5949629629629629, "grad_norm": 0.5411229133605957, "learning_rate": 2.296933835395374e-05, "loss": 1.3376, "step": 2510 }, { "epoch": 0.5973333333333334, "grad_norm": 0.5555963516235352, "learning_rate": 2.2834857450242067e-05, "loss": 1.5361, "step": 2520 }, { "epoch": 0.5997037037037037, "grad_norm": 0.48606014251708984, "learning_rate": 2.2700376546530396e-05, "loss": 1.4386, "step": 2530 }, { "epoch": 0.6020740740740741, "grad_norm": 0.4902474582195282, "learning_rate": 2.256589564281872e-05, "loss": 1.4536, "step": 2540 }, { "epoch": 0.6044444444444445, "grad_norm": 0.5565341114997864, "learning_rate": 2.243141473910705e-05, "loss": 1.4801, "step": 2550 }, { "epoch": 0.6068148148148148, "grad_norm": 0.5383167862892151, "learning_rate": 2.2296933835395375e-05, "loss": 1.3528, "step": 2560 }, { "epoch": 0.6091851851851852, "grad_norm": 0.4610041379928589, "learning_rate": 2.2162452931683704e-05, "loss": 1.5063, "step": 2570 }, { "epoch": 0.6115555555555555, "grad_norm": 0.5119171142578125, "learning_rate": 2.202797202797203e-05, "loss": 1.4559, "step": 2580 }, { "epoch": 0.6139259259259259, "grad_norm": 0.46622559428215027, "learning_rate": 2.189349112426036e-05, "loss": 1.485, "step": 2590 }, { "epoch": 0.6162962962962963, "grad_norm": 0.5610603094100952, "learning_rate": 2.1759010220548684e-05, "loss": 1.3863, "step": 2600 }, { "epoch": 0.6186666666666667, "grad_norm": 0.5185586214065552, "learning_rate": 2.162452931683701e-05, "loss": 1.4574, "step": 2610 }, { "epoch": 0.621037037037037, "grad_norm": 0.5091121196746826, "learning_rate": 2.1490048413125338e-05, "loss": 1.3745, "step": 2620 }, { "epoch": 0.6234074074074074, "grad_norm": 0.39684295654296875, "learning_rate": 2.1355567509413664e-05, "loss": 1.5785, "step": 2630 }, { "epoch": 0.6257777777777778, "grad_norm": 0.5499323606491089, "learning_rate": 2.1221086605701993e-05, "loss": 1.4926, "step": 2640 }, { "epoch": 0.6281481481481481, "grad_norm": 0.448824942111969, "learning_rate": 2.1086605701990318e-05, "loss": 1.5199, "step": 2650 }, { "epoch": 0.6305185185185185, "grad_norm": 0.5647756457328796, "learning_rate": 2.0952124798278647e-05, "loss": 1.493, "step": 2660 }, { "epoch": 0.6328888888888888, "grad_norm": 0.5426878929138184, "learning_rate": 2.0817643894566972e-05, "loss": 1.4232, "step": 2670 }, { "epoch": 0.6352592592592593, "grad_norm": 0.5105384588241577, "learning_rate": 2.0683162990855298e-05, "loss": 1.5136, "step": 2680 }, { "epoch": 0.6376296296296297, "grad_norm": 0.5488259792327881, "learning_rate": 2.0548682087143627e-05, "loss": 1.4877, "step": 2690 }, { "epoch": 0.64, "grad_norm": 0.47102248668670654, "learning_rate": 2.0414201183431952e-05, "loss": 1.2976, "step": 2700 }, { "epoch": 0.6423703703703704, "grad_norm": 0.4708435535430908, "learning_rate": 2.027972027972028e-05, "loss": 1.4773, "step": 2710 }, { "epoch": 0.6447407407407407, "grad_norm": 0.5073569416999817, "learning_rate": 2.0145239376008606e-05, "loss": 1.3551, "step": 2720 }, { "epoch": 0.6471111111111111, "grad_norm": 0.4934346377849579, "learning_rate": 2.0010758472296935e-05, "loss": 1.2423, "step": 2730 }, { "epoch": 0.6494814814814814, "grad_norm": 0.5030198097229004, "learning_rate": 1.987627756858526e-05, "loss": 1.4272, "step": 2740 }, { "epoch": 0.6518518518518519, "grad_norm": 0.4713825583457947, "learning_rate": 1.974179666487359e-05, "loss": 1.4898, "step": 2750 }, { "epoch": 0.6542222222222223, "grad_norm": 0.430649995803833, "learning_rate": 1.9607315761161915e-05, "loss": 1.3365, "step": 2760 }, { "epoch": 0.6565925925925926, "grad_norm": 0.58051598072052, "learning_rate": 1.9472834857450244e-05, "loss": 1.4567, "step": 2770 }, { "epoch": 0.658962962962963, "grad_norm": 0.46255138516426086, "learning_rate": 1.933835395373857e-05, "loss": 1.5205, "step": 2780 }, { "epoch": 0.6613333333333333, "grad_norm": 0.5674681663513184, "learning_rate": 1.9203873050026898e-05, "loss": 1.3594, "step": 2790 }, { "epoch": 0.6637037037037037, "grad_norm": 0.5063351392745972, "learning_rate": 1.9069392146315223e-05, "loss": 1.2901, "step": 2800 }, { "epoch": 0.666074074074074, "grad_norm": 0.4963226914405823, "learning_rate": 1.8934911242603552e-05, "loss": 1.4437, "step": 2810 }, { "epoch": 0.6684444444444444, "grad_norm": 0.5070900917053223, "learning_rate": 1.8800430338891878e-05, "loss": 1.376, "step": 2820 }, { "epoch": 0.6708148148148149, "grad_norm": 0.5724377036094666, "learning_rate": 1.8665949435180206e-05, "loss": 1.5226, "step": 2830 }, { "epoch": 0.6731851851851852, "grad_norm": 0.5261855125427246, "learning_rate": 1.8531468531468532e-05, "loss": 1.438, "step": 2840 }, { "epoch": 0.6755555555555556, "grad_norm": 0.5292350053787231, "learning_rate": 1.839698762775686e-05, "loss": 1.4071, "step": 2850 }, { "epoch": 0.6779259259259259, "grad_norm": 0.4596816301345825, "learning_rate": 1.8262506724045186e-05, "loss": 1.337, "step": 2860 }, { "epoch": 0.6802962962962963, "grad_norm": 0.5225928425788879, "learning_rate": 1.8128025820333515e-05, "loss": 1.4363, "step": 2870 }, { "epoch": 0.6826666666666666, "grad_norm": 0.49359938502311707, "learning_rate": 1.799354491662184e-05, "loss": 1.3388, "step": 2880 }, { "epoch": 0.685037037037037, "grad_norm": 0.5156022906303406, "learning_rate": 1.785906401291017e-05, "loss": 1.4515, "step": 2890 }, { "epoch": 0.6874074074074074, "grad_norm": 0.5047289133071899, "learning_rate": 1.7724583109198495e-05, "loss": 1.4584, "step": 2900 }, { "epoch": 0.6897777777777778, "grad_norm": 0.4975475072860718, "learning_rate": 1.7590102205486824e-05, "loss": 1.5042, "step": 2910 }, { "epoch": 0.6921481481481482, "grad_norm": 0.5997641086578369, "learning_rate": 1.745562130177515e-05, "loss": 1.5658, "step": 2920 }, { "epoch": 0.6945185185185185, "grad_norm": 0.5376483201980591, "learning_rate": 1.7321140398063478e-05, "loss": 1.3429, "step": 2930 }, { "epoch": 0.6968888888888889, "grad_norm": 0.4973870813846588, "learning_rate": 1.7186659494351803e-05, "loss": 1.4287, "step": 2940 }, { "epoch": 0.6992592592592592, "grad_norm": 0.5504077076911926, "learning_rate": 1.7052178590640132e-05, "loss": 1.5161, "step": 2950 }, { "epoch": 0.7016296296296296, "grad_norm": 0.4603710174560547, "learning_rate": 1.6917697686928457e-05, "loss": 1.463, "step": 2960 }, { "epoch": 0.704, "grad_norm": 0.5116856694221497, "learning_rate": 1.6783216783216786e-05, "loss": 1.3862, "step": 2970 }, { "epoch": 0.7063703703703703, "grad_norm": 0.49981990456581116, "learning_rate": 1.6648735879505112e-05, "loss": 1.4209, "step": 2980 }, { "epoch": 0.7087407407407408, "grad_norm": 0.5085658431053162, "learning_rate": 1.651425497579344e-05, "loss": 1.4031, "step": 2990 }, { "epoch": 0.7111111111111111, "grad_norm": 0.5499709844589233, "learning_rate": 1.6379774072081766e-05, "loss": 1.5093, "step": 3000 }, { "epoch": 0.7134814814814815, "grad_norm": 0.5997831225395203, "learning_rate": 1.6245293168370095e-05, "loss": 1.517, "step": 3010 }, { "epoch": 0.7158518518518519, "grad_norm": 0.5795171856880188, "learning_rate": 1.611081226465842e-05, "loss": 1.4279, "step": 3020 }, { "epoch": 0.7182222222222222, "grad_norm": 0.5227158069610596, "learning_rate": 1.5976331360946746e-05, "loss": 1.5101, "step": 3030 }, { "epoch": 0.7205925925925926, "grad_norm": 0.6352266669273376, "learning_rate": 1.5841850457235075e-05, "loss": 1.4849, "step": 3040 }, { "epoch": 0.7229629629629629, "grad_norm": 0.6044921278953552, "learning_rate": 1.57073695535234e-05, "loss": 1.5239, "step": 3050 }, { "epoch": 0.7253333333333334, "grad_norm": 0.4992562532424927, "learning_rate": 1.557288864981173e-05, "loss": 1.37, "step": 3060 }, { "epoch": 0.7277037037037037, "grad_norm": 0.5439409017562866, "learning_rate": 1.5438407746100054e-05, "loss": 1.3724, "step": 3070 }, { "epoch": 0.7300740740740741, "grad_norm": 0.5415698289871216, "learning_rate": 1.530392684238838e-05, "loss": 1.4562, "step": 3080 }, { "epoch": 0.7324444444444445, "grad_norm": 0.4034167230129242, "learning_rate": 1.5169445938676707e-05, "loss": 1.3212, "step": 3090 }, { "epoch": 0.7348148148148148, "grad_norm": 0.5307872891426086, "learning_rate": 1.5034965034965034e-05, "loss": 1.4165, "step": 3100 }, { "epoch": 0.7371851851851852, "grad_norm": 0.5271874666213989, "learning_rate": 1.4900484131253361e-05, "loss": 1.4077, "step": 3110 }, { "epoch": 0.7395555555555555, "grad_norm": 0.5995745658874512, "learning_rate": 1.4766003227541688e-05, "loss": 1.4732, "step": 3120 }, { "epoch": 0.7419259259259259, "grad_norm": 0.45358097553253174, "learning_rate": 1.4631522323830015e-05, "loss": 1.4083, "step": 3130 }, { "epoch": 0.7442962962962963, "grad_norm": 0.47864630818367004, "learning_rate": 1.4497041420118343e-05, "loss": 1.4363, "step": 3140 }, { "epoch": 0.7466666666666667, "grad_norm": 0.46107572317123413, "learning_rate": 1.436256051640667e-05, "loss": 1.5163, "step": 3150 }, { "epoch": 0.7490370370370371, "grad_norm": 0.5413241386413574, "learning_rate": 1.4228079612694997e-05, "loss": 1.3225, "step": 3160 }, { "epoch": 0.7514074074074074, "grad_norm": 0.4649742841720581, "learning_rate": 1.4093598708983324e-05, "loss": 1.4101, "step": 3170 }, { "epoch": 0.7537777777777778, "grad_norm": 0.5219136476516724, "learning_rate": 1.3959117805271651e-05, "loss": 1.4523, "step": 3180 }, { "epoch": 0.7561481481481481, "grad_norm": 0.5591155886650085, "learning_rate": 1.3824636901559978e-05, "loss": 1.5966, "step": 3190 }, { "epoch": 0.7585185185185185, "grad_norm": 0.5293004512786865, "learning_rate": 1.3690155997848305e-05, "loss": 1.4613, "step": 3200 }, { "epoch": 0.7608888888888888, "grad_norm": 0.46828821301460266, "learning_rate": 1.3555675094136632e-05, "loss": 1.5187, "step": 3210 }, { "epoch": 0.7632592592592593, "grad_norm": 0.4590572416782379, "learning_rate": 1.342119419042496e-05, "loss": 1.4418, "step": 3220 }, { "epoch": 0.7656296296296297, "grad_norm": 0.6020212769508362, "learning_rate": 1.3286713286713287e-05, "loss": 1.5557, "step": 3230 }, { "epoch": 0.768, "grad_norm": 0.542536199092865, "learning_rate": 1.3152232383001614e-05, "loss": 1.3689, "step": 3240 }, { "epoch": 0.7703703703703704, "grad_norm": 0.5394562482833862, "learning_rate": 1.3017751479289941e-05, "loss": 1.4272, "step": 3250 }, { "epoch": 0.7727407407407407, "grad_norm": 0.38109496235847473, "learning_rate": 1.2883270575578268e-05, "loss": 1.419, "step": 3260 }, { "epoch": 0.7751111111111111, "grad_norm": 0.4617583155632019, "learning_rate": 1.2748789671866595e-05, "loss": 1.3547, "step": 3270 }, { "epoch": 0.7774814814814814, "grad_norm": 0.5739762783050537, "learning_rate": 1.2614308768154922e-05, "loss": 1.5191, "step": 3280 }, { "epoch": 0.7798518518518519, "grad_norm": 0.42539921402931213, "learning_rate": 1.247982786444325e-05, "loss": 1.3347, "step": 3290 }, { "epoch": 0.7822222222222223, "grad_norm": 0.5273600816726685, "learning_rate": 1.2345346960731577e-05, "loss": 1.3758, "step": 3300 }, { "epoch": 0.7845925925925926, "grad_norm": 0.4796091616153717, "learning_rate": 1.2210866057019904e-05, "loss": 1.3903, "step": 3310 }, { "epoch": 0.786962962962963, "grad_norm": 0.46542009711265564, "learning_rate": 1.2076385153308231e-05, "loss": 1.406, "step": 3320 }, { "epoch": 0.7893333333333333, "grad_norm": 0.47980108857154846, "learning_rate": 1.1941904249596558e-05, "loss": 1.3991, "step": 3330 }, { "epoch": 0.7917037037037037, "grad_norm": 0.507736086845398, "learning_rate": 1.1807423345884885e-05, "loss": 1.4583, "step": 3340 }, { "epoch": 0.794074074074074, "grad_norm": 0.5380430817604065, "learning_rate": 1.1672942442173212e-05, "loss": 1.2621, "step": 3350 }, { "epoch": 0.7964444444444444, "grad_norm": 0.6689913272857666, "learning_rate": 1.153846153846154e-05, "loss": 1.5355, "step": 3360 }, { "epoch": 0.7988148148148149, "grad_norm": 0.5129537582397461, "learning_rate": 1.1403980634749865e-05, "loss": 1.5895, "step": 3370 }, { "epoch": 0.8011851851851852, "grad_norm": 0.5240408182144165, "learning_rate": 1.1269499731038192e-05, "loss": 1.4896, "step": 3380 }, { "epoch": 0.8035555555555556, "grad_norm": 0.5004174709320068, "learning_rate": 1.113501882732652e-05, "loss": 1.4303, "step": 3390 }, { "epoch": 0.8059259259259259, "grad_norm": 0.46890896558761597, "learning_rate": 1.1000537923614846e-05, "loss": 1.3536, "step": 3400 }, { "epoch": 0.8082962962962963, "grad_norm": 0.5150523781776428, "learning_rate": 1.0866057019903174e-05, "loss": 1.3952, "step": 3410 }, { "epoch": 0.8106666666666666, "grad_norm": 0.4322206676006317, "learning_rate": 1.07315761161915e-05, "loss": 1.2252, "step": 3420 }, { "epoch": 0.813037037037037, "grad_norm": 0.5782944560050964, "learning_rate": 1.0597095212479828e-05, "loss": 1.4982, "step": 3430 }, { "epoch": 0.8154074074074074, "grad_norm": 0.6032952666282654, "learning_rate": 1.0462614308768155e-05, "loss": 1.4614, "step": 3440 }, { "epoch": 0.8177777777777778, "grad_norm": 0.453756183385849, "learning_rate": 1.0328133405056482e-05, "loss": 1.5017, "step": 3450 }, { "epoch": 0.8201481481481482, "grad_norm": 0.5315883159637451, "learning_rate": 1.019365250134481e-05, "loss": 1.5354, "step": 3460 }, { "epoch": 0.8225185185185185, "grad_norm": 0.5065041184425354, "learning_rate": 1.0059171597633136e-05, "loss": 1.4245, "step": 3470 }, { "epoch": 0.8248888888888889, "grad_norm": 0.542103111743927, "learning_rate": 9.924690693921463e-06, "loss": 1.3599, "step": 3480 }, { "epoch": 0.8272592592592592, "grad_norm": 0.5330160856246948, "learning_rate": 9.79020979020979e-06, "loss": 1.3502, "step": 3490 }, { "epoch": 0.8296296296296296, "grad_norm": 0.4731038212776184, "learning_rate": 9.655728886498118e-06, "loss": 1.2931, "step": 3500 }, { "epoch": 0.832, "grad_norm": 0.4719734787940979, "learning_rate": 9.521247982786445e-06, "loss": 1.3185, "step": 3510 }, { "epoch": 0.8343703703703703, "grad_norm": 0.5551607012748718, "learning_rate": 9.386767079074772e-06, "loss": 1.4045, "step": 3520 }, { "epoch": 0.8367407407407408, "grad_norm": 0.5661736130714417, "learning_rate": 9.252286175363099e-06, "loss": 1.6005, "step": 3530 }, { "epoch": 0.8391111111111111, "grad_norm": 0.5772873759269714, "learning_rate": 9.117805271651426e-06, "loss": 1.3975, "step": 3540 }, { "epoch": 0.8414814814814815, "grad_norm": 0.5180752873420715, "learning_rate": 8.983324367939753e-06, "loss": 1.5147, "step": 3550 }, { "epoch": 0.8438518518518519, "grad_norm": 0.5256723165512085, "learning_rate": 8.84884346422808e-06, "loss": 1.4673, "step": 3560 }, { "epoch": 0.8462222222222222, "grad_norm": 0.4829583168029785, "learning_rate": 8.714362560516406e-06, "loss": 1.5102, "step": 3570 }, { "epoch": 0.8485925925925926, "grad_norm": 0.5027347207069397, "learning_rate": 8.579881656804733e-06, "loss": 1.6514, "step": 3580 }, { "epoch": 0.8509629629629629, "grad_norm": 0.5117186903953552, "learning_rate": 8.44540075309306e-06, "loss": 1.4451, "step": 3590 }, { "epoch": 0.8533333333333334, "grad_norm": 0.4994155466556549, "learning_rate": 8.310919849381387e-06, "loss": 1.3408, "step": 3600 }, { "epoch": 0.8557037037037037, "grad_norm": 0.4399481415748596, "learning_rate": 8.176438945669715e-06, "loss": 1.4044, "step": 3610 }, { "epoch": 0.8580740740740741, "grad_norm": 0.49395203590393066, "learning_rate": 8.041958041958042e-06, "loss": 1.3895, "step": 3620 }, { "epoch": 0.8604444444444445, "grad_norm": 0.4922611713409424, "learning_rate": 7.907477138246369e-06, "loss": 1.5503, "step": 3630 }, { "epoch": 0.8628148148148148, "grad_norm": 0.5255241990089417, "learning_rate": 7.772996234534696e-06, "loss": 1.4736, "step": 3640 }, { "epoch": 0.8651851851851852, "grad_norm": 0.4713379442691803, "learning_rate": 7.638515330823023e-06, "loss": 1.4395, "step": 3650 }, { "epoch": 0.8675555555555555, "grad_norm": 0.4290190637111664, "learning_rate": 7.50403442711135e-06, "loss": 1.4265, "step": 3660 }, { "epoch": 0.8699259259259259, "grad_norm": 0.5157113075256348, "learning_rate": 7.369553523399677e-06, "loss": 1.4363, "step": 3670 }, { "epoch": 0.8722962962962963, "grad_norm": 0.5694654583930969, "learning_rate": 7.2350726196880045e-06, "loss": 1.4602, "step": 3680 }, { "epoch": 0.8746666666666667, "grad_norm": 0.47969937324523926, "learning_rate": 7.100591715976332e-06, "loss": 1.2716, "step": 3690 }, { "epoch": 0.8770370370370371, "grad_norm": 0.4939590096473694, "learning_rate": 6.966110812264659e-06, "loss": 1.4356, "step": 3700 }, { "epoch": 0.8794074074074074, "grad_norm": 0.4829910695552826, "learning_rate": 6.831629908552986e-06, "loss": 1.4482, "step": 3710 }, { "epoch": 0.8817777777777778, "grad_norm": 0.473178505897522, "learning_rate": 6.697149004841313e-06, "loss": 1.3772, "step": 3720 }, { "epoch": 0.8841481481481481, "grad_norm": 0.5327422618865967, "learning_rate": 6.56266810112964e-06, "loss": 1.5707, "step": 3730 }, { "epoch": 0.8865185185185185, "grad_norm": 0.4069652259349823, "learning_rate": 6.428187197417967e-06, "loss": 1.3501, "step": 3740 }, { "epoch": 0.8888888888888888, "grad_norm": 0.5368005037307739, "learning_rate": 6.2937062937062944e-06, "loss": 1.6791, "step": 3750 }, { "epoch": 0.8912592592592593, "grad_norm": 0.5622044801712036, "learning_rate": 6.159225389994621e-06, "loss": 1.2727, "step": 3760 }, { "epoch": 0.8936296296296297, "grad_norm": 0.5526837110519409, "learning_rate": 6.024744486282948e-06, "loss": 1.478, "step": 3770 }, { "epoch": 0.896, "grad_norm": 0.5442282557487488, "learning_rate": 5.890263582571275e-06, "loss": 1.3917, "step": 3780 }, { "epoch": 0.8983703703703704, "grad_norm": 0.5711065530776978, "learning_rate": 5.755782678859602e-06, "loss": 1.4748, "step": 3790 }, { "epoch": 0.9007407407407407, "grad_norm": 0.5068963766098022, "learning_rate": 5.621301775147929e-06, "loss": 1.3375, "step": 3800 }, { "epoch": 0.9031111111111111, "grad_norm": 0.6032804846763611, "learning_rate": 5.486820871436256e-06, "loss": 1.553, "step": 3810 }, { "epoch": 0.9054814814814814, "grad_norm": 0.4718996286392212, "learning_rate": 5.352339967724583e-06, "loss": 1.4586, "step": 3820 }, { "epoch": 0.9078518518518518, "grad_norm": 0.583185076713562, "learning_rate": 5.21785906401291e-06, "loss": 1.4294, "step": 3830 }, { "epoch": 0.9102222222222223, "grad_norm": 0.5688157677650452, "learning_rate": 5.083378160301237e-06, "loss": 1.3434, "step": 3840 }, { "epoch": 0.9125925925925926, "grad_norm": 0.43667495250701904, "learning_rate": 4.948897256589564e-06, "loss": 1.5107, "step": 3850 }, { "epoch": 0.914962962962963, "grad_norm": 0.5924187302589417, "learning_rate": 4.814416352877891e-06, "loss": 1.3558, "step": 3860 }, { "epoch": 0.9173333333333333, "grad_norm": 0.538318932056427, "learning_rate": 4.679935449166218e-06, "loss": 1.379, "step": 3870 }, { "epoch": 0.9197037037037037, "grad_norm": 0.4840611219406128, "learning_rate": 4.5454545454545455e-06, "loss": 1.2776, "step": 3880 }, { "epoch": 0.922074074074074, "grad_norm": 0.5927128195762634, "learning_rate": 4.410973641742873e-06, "loss": 1.4598, "step": 3890 }, { "epoch": 0.9244444444444444, "grad_norm": 0.5672274827957153, "learning_rate": 4.2764927380312e-06, "loss": 1.467, "step": 3900 }, { "epoch": 0.9268148148148149, "grad_norm": 0.5424569249153137, "learning_rate": 4.142011834319527e-06, "loss": 1.532, "step": 3910 }, { "epoch": 0.9291851851851852, "grad_norm": 0.5921524167060852, "learning_rate": 4.007530930607853e-06, "loss": 1.5244, "step": 3920 }, { "epoch": 0.9315555555555556, "grad_norm": 0.4295691251754761, "learning_rate": 3.87305002689618e-06, "loss": 1.4358, "step": 3930 }, { "epoch": 0.9339259259259259, "grad_norm": 0.5117975473403931, "learning_rate": 3.738569123184508e-06, "loss": 1.4241, "step": 3940 }, { "epoch": 0.9362962962962963, "grad_norm": 0.5273323655128479, "learning_rate": 3.604088219472835e-06, "loss": 1.3188, "step": 3950 }, { "epoch": 0.9386666666666666, "grad_norm": 0.4947817325592041, "learning_rate": 3.469607315761162e-06, "loss": 1.444, "step": 3960 }, { "epoch": 0.941037037037037, "grad_norm": 0.5852746367454529, "learning_rate": 3.3351264120494893e-06, "loss": 1.4334, "step": 3970 }, { "epoch": 0.9434074074074074, "grad_norm": 0.5183681845664978, "learning_rate": 3.2006455083378165e-06, "loss": 1.4422, "step": 3980 }, { "epoch": 0.9457777777777778, "grad_norm": 0.5193647146224976, "learning_rate": 3.066164604626143e-06, "loss": 1.3769, "step": 3990 }, { "epoch": 0.9481481481481482, "grad_norm": 0.5597278475761414, "learning_rate": 2.9316837009144703e-06, "loss": 1.5406, "step": 4000 }, { "epoch": 0.9505185185185185, "grad_norm": 0.5173184275627136, "learning_rate": 2.7972027972027974e-06, "loss": 1.5082, "step": 4010 }, { "epoch": 0.9528888888888889, "grad_norm": 0.5205141305923462, "learning_rate": 2.6627218934911246e-06, "loss": 1.4971, "step": 4020 }, { "epoch": 0.9552592592592593, "grad_norm": 0.48843199014663696, "learning_rate": 2.5282409897794517e-06, "loss": 1.4113, "step": 4030 }, { "epoch": 0.9576296296296296, "grad_norm": 0.5854997634887695, "learning_rate": 2.3937600860677784e-06, "loss": 1.5666, "step": 4040 }, { "epoch": 0.96, "grad_norm": 0.5264609456062317, "learning_rate": 2.2592791823561056e-06, "loss": 1.3334, "step": 4050 }, { "epoch": 0.9623703703703703, "grad_norm": 0.5044777989387512, "learning_rate": 2.1247982786444327e-06, "loss": 1.4251, "step": 4060 }, { "epoch": 0.9647407407407408, "grad_norm": 0.5678290128707886, "learning_rate": 1.99031737493276e-06, "loss": 1.4359, "step": 4070 }, { "epoch": 0.9671111111111111, "grad_norm": 0.5562720894813538, "learning_rate": 1.8558364712210868e-06, "loss": 1.4333, "step": 4080 }, { "epoch": 0.9694814814814815, "grad_norm": 0.5118197798728943, "learning_rate": 1.7213555675094137e-06, "loss": 1.4257, "step": 4090 }, { "epoch": 0.9718518518518519, "grad_norm": 0.4556100070476532, "learning_rate": 1.5868746637977408e-06, "loss": 1.4414, "step": 4100 }, { "epoch": 0.9742222222222222, "grad_norm": 0.5370482802391052, "learning_rate": 1.452393760086068e-06, "loss": 1.6166, "step": 4110 }, { "epoch": 0.9765925925925926, "grad_norm": 0.5851370096206665, "learning_rate": 1.3179128563743949e-06, "loss": 1.5649, "step": 4120 }, { "epoch": 0.9789629629629629, "grad_norm": 0.5804405212402344, "learning_rate": 1.183431952662722e-06, "loss": 1.3898, "step": 4130 }, { "epoch": 0.9813333333333333, "grad_norm": 0.5411353707313538, "learning_rate": 1.0489510489510491e-06, "loss": 1.4885, "step": 4140 }, { "epoch": 0.9837037037037037, "grad_norm": 0.5271933078765869, "learning_rate": 9.14470145239376e-07, "loss": 1.3816, "step": 4150 }, { "epoch": 0.9860740740740741, "grad_norm": 0.5039179921150208, "learning_rate": 7.799892415277031e-07, "loss": 1.2985, "step": 4160 }, { "epoch": 0.9884444444444445, "grad_norm": 0.47758767008781433, "learning_rate": 6.455083378160301e-07, "loss": 1.4848, "step": 4170 }, { "epoch": 0.9908148148148148, "grad_norm": 0.5355851054191589, "learning_rate": 5.110274341043572e-07, "loss": 1.5264, "step": 4180 }, { "epoch": 0.9931851851851852, "grad_norm": 0.5918956398963928, "learning_rate": 3.7654653039268424e-07, "loss": 1.4337, "step": 4190 }, { "epoch": 0.9955555555555555, "grad_norm": 0.4990881085395813, "learning_rate": 2.4206562668101127e-07, "loss": 1.3851, "step": 4200 }, { "epoch": 0.9979259259259259, "grad_norm": 0.49343588948249817, "learning_rate": 1.0758472296933835e-07, "loss": 1.5421, "step": 4210 } ], "logging_steps": 10, "max_steps": 4218, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.660058442305372e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }