{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 2000,
"global_step": 21900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00684931506849315,
"grad_norm": 55.08233642578125,
"learning_rate": 2.9943835616438356e-05,
"loss": 4.9626,
"step": 50
},
{
"epoch": 0.0136986301369863,
"grad_norm": 61.599609375,
"learning_rate": 2.987808219178082e-05,
"loss": 3.2637,
"step": 100
},
{
"epoch": 0.02054794520547945,
"grad_norm": 54.39605712890625,
"learning_rate": 2.980958904109589e-05,
"loss": 2.5155,
"step": 150
},
{
"epoch": 0.0273972602739726,
"grad_norm": 35.411338806152344,
"learning_rate": 2.974109589041096e-05,
"loss": 2.3836,
"step": 200
},
{
"epoch": 0.03424657534246575,
"grad_norm": 38.87581253051758,
"learning_rate": 2.9672602739726026e-05,
"loss": 2.1592,
"step": 250
},
{
"epoch": 0.0410958904109589,
"grad_norm": 43.26409149169922,
"learning_rate": 2.9604109589041095e-05,
"loss": 2.0851,
"step": 300
},
{
"epoch": 0.04794520547945205,
"grad_norm": 32.2227897644043,
"learning_rate": 2.9535616438356165e-05,
"loss": 1.833,
"step": 350
},
{
"epoch": 0.0547945205479452,
"grad_norm": 34.12604904174805,
"learning_rate": 2.9467123287671234e-05,
"loss": 1.7324,
"step": 400
},
{
"epoch": 0.06164383561643835,
"grad_norm": 802.5269165039062,
"learning_rate": 2.93986301369863e-05,
"loss": 1.715,
"step": 450
},
{
"epoch": 0.0684931506849315,
"grad_norm": 21.16217803955078,
"learning_rate": 2.933013698630137e-05,
"loss": 1.7966,
"step": 500
},
{
"epoch": 0.07534246575342465,
"grad_norm": 53.79206466674805,
"learning_rate": 2.926164383561644e-05,
"loss": 1.7756,
"step": 550
},
{
"epoch": 0.0821917808219178,
"grad_norm": 69.78266906738281,
"learning_rate": 2.919315068493151e-05,
"loss": 1.6611,
"step": 600
},
{
"epoch": 0.08904109589041095,
"grad_norm": 19.355026245117188,
"learning_rate": 2.9124657534246575e-05,
"loss": 1.5454,
"step": 650
},
{
"epoch": 0.0958904109589041,
"grad_norm": 32.797298431396484,
"learning_rate": 2.9056164383561644e-05,
"loss": 1.533,
"step": 700
},
{
"epoch": 0.10273972602739725,
"grad_norm": 22.2159423828125,
"learning_rate": 2.8987671232876714e-05,
"loss": 1.555,
"step": 750
},
{
"epoch": 0.1095890410958904,
"grad_norm": 41.200904846191406,
"learning_rate": 2.8919178082191783e-05,
"loss": 1.5322,
"step": 800
},
{
"epoch": 0.11643835616438356,
"grad_norm": 22.64232635498047,
"learning_rate": 2.885068493150685e-05,
"loss": 1.5337,
"step": 850
},
{
"epoch": 0.1232876712328767,
"grad_norm": 38.510257720947266,
"learning_rate": 2.878219178082192e-05,
"loss": 1.5001,
"step": 900
},
{
"epoch": 0.13013698630136986,
"grad_norm": 23.04859733581543,
"learning_rate": 2.871369863013699e-05,
"loss": 1.4488,
"step": 950
},
{
"epoch": 0.136986301369863,
"grad_norm": 23.516109466552734,
"learning_rate": 2.8645205479452058e-05,
"loss": 1.3247,
"step": 1000
},
{
"epoch": 0.14383561643835616,
"grad_norm": 24.6121883392334,
"learning_rate": 2.8576712328767124e-05,
"loss": 1.3537,
"step": 1050
},
{
"epoch": 0.1506849315068493,
"grad_norm": 29.713895797729492,
"learning_rate": 2.8508219178082194e-05,
"loss": 1.4116,
"step": 1100
},
{
"epoch": 0.15753424657534246,
"grad_norm": 102.77496337890625,
"learning_rate": 2.8439726027397263e-05,
"loss": 1.3196,
"step": 1150
},
{
"epoch": 0.1643835616438356,
"grad_norm": 46.55476379394531,
"learning_rate": 2.8371232876712332e-05,
"loss": 1.3987,
"step": 1200
},
{
"epoch": 0.17123287671232876,
"grad_norm": 22.365116119384766,
"learning_rate": 2.8302739726027395e-05,
"loss": 1.3027,
"step": 1250
},
{
"epoch": 0.1780821917808219,
"grad_norm": 48.17926788330078,
"learning_rate": 2.8234246575342465e-05,
"loss": 1.3698,
"step": 1300
},
{
"epoch": 0.18493150684931506,
"grad_norm": 29.022974014282227,
"learning_rate": 2.8165753424657534e-05,
"loss": 1.2422,
"step": 1350
},
{
"epoch": 0.1917808219178082,
"grad_norm": 23.812314987182617,
"learning_rate": 2.8097260273972604e-05,
"loss": 1.2691,
"step": 1400
},
{
"epoch": 0.19863013698630136,
"grad_norm": 18.73370361328125,
"learning_rate": 2.802876712328767e-05,
"loss": 1.3611,
"step": 1450
},
{
"epoch": 0.2054794520547945,
"grad_norm": 23.951034545898438,
"learning_rate": 2.796027397260274e-05,
"loss": 1.3148,
"step": 1500
},
{
"epoch": 0.21232876712328766,
"grad_norm": 33.09256362915039,
"learning_rate": 2.789178082191781e-05,
"loss": 1.3978,
"step": 1550
},
{
"epoch": 0.2191780821917808,
"grad_norm": 23.944395065307617,
"learning_rate": 2.7823287671232878e-05,
"loss": 1.2173,
"step": 1600
},
{
"epoch": 0.22602739726027396,
"grad_norm": 16.93844223022461,
"learning_rate": 2.7754794520547944e-05,
"loss": 1.3082,
"step": 1650
},
{
"epoch": 0.2328767123287671,
"grad_norm": 21.014097213745117,
"learning_rate": 2.7686301369863014e-05,
"loss": 1.3363,
"step": 1700
},
{
"epoch": 0.23972602739726026,
"grad_norm": 34.768096923828125,
"learning_rate": 2.7617808219178083e-05,
"loss": 1.1646,
"step": 1750
},
{
"epoch": 0.2465753424657534,
"grad_norm": 19.525144577026367,
"learning_rate": 2.7549315068493153e-05,
"loss": 1.2566,
"step": 1800
},
{
"epoch": 0.2534246575342466,
"grad_norm": 17.319896697998047,
"learning_rate": 2.748082191780822e-05,
"loss": 1.2123,
"step": 1850
},
{
"epoch": 0.2602739726027397,
"grad_norm": 21.581501007080078,
"learning_rate": 2.7412328767123288e-05,
"loss": 1.4083,
"step": 1900
},
{
"epoch": 0.2671232876712329,
"grad_norm": 38.267120361328125,
"learning_rate": 2.7343835616438358e-05,
"loss": 1.1987,
"step": 1950
},
{
"epoch": 0.273972602739726,
"grad_norm": 64.0147933959961,
"learning_rate": 2.7275342465753427e-05,
"loss": 1.1474,
"step": 2000
},
{
"epoch": 0.273972602739726,
"eval_exact_match": 71.11636707663197,
"eval_f1": 81.37469585294956,
"eval_runtime": 406.8449,
"eval_samples_per_second": 25.98,
"eval_steps_per_second": 1.625,
"step": 2000
},
{
"epoch": 0.2808219178082192,
"grad_norm": 22.60133171081543,
"learning_rate": 2.7206849315068493e-05,
"loss": 1.3887,
"step": 2050
},
{
"epoch": 0.2876712328767123,
"grad_norm": 51.95814514160156,
"learning_rate": 2.7138356164383563e-05,
"loss": 1.2049,
"step": 2100
},
{
"epoch": 0.2945205479452055,
"grad_norm": 11.237629890441895,
"learning_rate": 2.7069863013698632e-05,
"loss": 1.1684,
"step": 2150
},
{
"epoch": 0.3013698630136986,
"grad_norm": 23.52985954284668,
"learning_rate": 2.70013698630137e-05,
"loss": 1.2162,
"step": 2200
},
{
"epoch": 0.3082191780821918,
"grad_norm": 19.681074142456055,
"learning_rate": 2.6932876712328768e-05,
"loss": 1.0853,
"step": 2250
},
{
"epoch": 0.3150684931506849,
"grad_norm": 13.592674255371094,
"learning_rate": 2.6864383561643837e-05,
"loss": 1.2977,
"step": 2300
},
{
"epoch": 0.3219178082191781,
"grad_norm": 30.62900733947754,
"learning_rate": 2.6795890410958907e-05,
"loss": 1.1385,
"step": 2350
},
{
"epoch": 0.3287671232876712,
"grad_norm": 15.438138961791992,
"learning_rate": 2.6727397260273976e-05,
"loss": 1.1983,
"step": 2400
},
{
"epoch": 0.3356164383561644,
"grad_norm": 21.04355812072754,
"learning_rate": 2.6658904109589042e-05,
"loss": 1.2473,
"step": 2450
},
{
"epoch": 0.3424657534246575,
"grad_norm": 101.09291076660156,
"learning_rate": 2.659041095890411e-05,
"loss": 1.2827,
"step": 2500
},
{
"epoch": 0.3493150684931507,
"grad_norm": 24.547096252441406,
"learning_rate": 2.6521917808219178e-05,
"loss": 1.2088,
"step": 2550
},
{
"epoch": 0.3561643835616438,
"grad_norm": 17.362468719482422,
"learning_rate": 2.6453424657534247e-05,
"loss": 1.1586,
"step": 2600
},
{
"epoch": 0.363013698630137,
"grad_norm": 22.94930076599121,
"learning_rate": 2.6384931506849313e-05,
"loss": 1.1435,
"step": 2650
},
{
"epoch": 0.3698630136986301,
"grad_norm": 9.875961303710938,
"learning_rate": 2.6316438356164383e-05,
"loss": 1.2004,
"step": 2700
},
{
"epoch": 0.3767123287671233,
"grad_norm": 39.17120361328125,
"learning_rate": 2.6247945205479452e-05,
"loss": 1.1009,
"step": 2750
},
{
"epoch": 0.3835616438356164,
"grad_norm": 29.799320220947266,
"learning_rate": 2.6179452054794522e-05,
"loss": 1.1917,
"step": 2800
},
{
"epoch": 0.3904109589041096,
"grad_norm": 15.327347755432129,
"learning_rate": 2.6110958904109588e-05,
"loss": 1.1141,
"step": 2850
},
{
"epoch": 0.3972602739726027,
"grad_norm": 63.62096405029297,
"learning_rate": 2.6042465753424657e-05,
"loss": 1.1083,
"step": 2900
},
{
"epoch": 0.4041095890410959,
"grad_norm": 77.25872802734375,
"learning_rate": 2.5973972602739727e-05,
"loss": 1.0962,
"step": 2950
},
{
"epoch": 0.410958904109589,
"grad_norm": 26.37015724182129,
"learning_rate": 2.5905479452054796e-05,
"loss": 1.1158,
"step": 3000
},
{
"epoch": 0.4178082191780822,
"grad_norm": 30.89618682861328,
"learning_rate": 2.5836986301369862e-05,
"loss": 1.1356,
"step": 3050
},
{
"epoch": 0.4246575342465753,
"grad_norm": 26.392322540283203,
"learning_rate": 2.5768493150684932e-05,
"loss": 0.9842,
"step": 3100
},
{
"epoch": 0.4315068493150685,
"grad_norm": 19.746112823486328,
"learning_rate": 2.57e-05,
"loss": 1.0477,
"step": 3150
},
{
"epoch": 0.4383561643835616,
"grad_norm": 21.304296493530273,
"learning_rate": 2.563150684931507e-05,
"loss": 1.1522,
"step": 3200
},
{
"epoch": 0.4452054794520548,
"grad_norm": 21.67245864868164,
"learning_rate": 2.5563013698630137e-05,
"loss": 1.0677,
"step": 3250
},
{
"epoch": 0.4520547945205479,
"grad_norm": 17.96895980834961,
"learning_rate": 2.5494520547945206e-05,
"loss": 1.0657,
"step": 3300
},
{
"epoch": 0.4589041095890411,
"grad_norm": 19.414440155029297,
"learning_rate": 2.5426027397260276e-05,
"loss": 1.1109,
"step": 3350
},
{
"epoch": 0.4657534246575342,
"grad_norm": 20.33131217956543,
"learning_rate": 2.5357534246575345e-05,
"loss": 1.0958,
"step": 3400
},
{
"epoch": 0.4726027397260274,
"grad_norm": 35.86481475830078,
"learning_rate": 2.528904109589041e-05,
"loss": 1.1134,
"step": 3450
},
{
"epoch": 0.4794520547945205,
"grad_norm": 20.007892608642578,
"learning_rate": 2.522054794520548e-05,
"loss": 1.0236,
"step": 3500
},
{
"epoch": 0.4863013698630137,
"grad_norm": 8.771307945251465,
"learning_rate": 2.515205479452055e-05,
"loss": 1.0518,
"step": 3550
},
{
"epoch": 0.4931506849315068,
"grad_norm": 31.425060272216797,
"learning_rate": 2.508356164383562e-05,
"loss": 1.0397,
"step": 3600
},
{
"epoch": 0.5,
"grad_norm": 33.32624816894531,
"learning_rate": 2.5015068493150686e-05,
"loss": 1.0374,
"step": 3650
},
{
"epoch": 0.5068493150684932,
"grad_norm": 15.538783073425293,
"learning_rate": 2.4946575342465755e-05,
"loss": 1.2677,
"step": 3700
},
{
"epoch": 0.5136986301369864,
"grad_norm": 32.81166076660156,
"learning_rate": 2.4878082191780825e-05,
"loss": 1.0274,
"step": 3750
},
{
"epoch": 0.5205479452054794,
"grad_norm": 40.134246826171875,
"learning_rate": 2.480958904109589e-05,
"loss": 1.0493,
"step": 3800
},
{
"epoch": 0.5273972602739726,
"grad_norm": 19.939796447753906,
"learning_rate": 2.4741095890410957e-05,
"loss": 1.1492,
"step": 3850
},
{
"epoch": 0.5342465753424658,
"grad_norm": 23.643762588500977,
"learning_rate": 2.4672602739726026e-05,
"loss": 1.0892,
"step": 3900
},
{
"epoch": 0.541095890410959,
"grad_norm": 28.48002052307129,
"learning_rate": 2.4604109589041096e-05,
"loss": 1.144,
"step": 3950
},
{
"epoch": 0.547945205479452,
"grad_norm": 15.827131271362305,
"learning_rate": 2.4535616438356165e-05,
"loss": 1.117,
"step": 4000
},
{
"epoch": 0.547945205479452,
"eval_exact_match": 78.61873226111636,
"eval_f1": 86.50538268692839,
"eval_runtime": 407.223,
"eval_samples_per_second": 25.956,
"eval_steps_per_second": 1.623,
"step": 4000
},
{
"epoch": 0.5547945205479452,
"grad_norm": 17.803165435791016,
"learning_rate": 2.446712328767123e-05,
"loss": 1.1428,
"step": 4050
},
{
"epoch": 0.5616438356164384,
"grad_norm": 15.47031307220459,
"learning_rate": 2.43986301369863e-05,
"loss": 1.1178,
"step": 4100
},
{
"epoch": 0.5684931506849316,
"grad_norm": 20.780733108520508,
"learning_rate": 2.433013698630137e-05,
"loss": 1.0105,
"step": 4150
},
{
"epoch": 0.5753424657534246,
"grad_norm": 33.10541915893555,
"learning_rate": 2.426164383561644e-05,
"loss": 1.0502,
"step": 4200
},
{
"epoch": 0.5821917808219178,
"grad_norm": 19.134233474731445,
"learning_rate": 2.4193150684931506e-05,
"loss": 1.1227,
"step": 4250
},
{
"epoch": 0.589041095890411,
"grad_norm": 10.8368558883667,
"learning_rate": 2.4124657534246575e-05,
"loss": 1.0697,
"step": 4300
},
{
"epoch": 0.5958904109589042,
"grad_norm": 18.496990203857422,
"learning_rate": 2.4057534246575344e-05,
"loss": 1.0705,
"step": 4350
},
{
"epoch": 0.6027397260273972,
"grad_norm": 16.98480987548828,
"learning_rate": 2.398904109589041e-05,
"loss": 1.135,
"step": 4400
},
{
"epoch": 0.6095890410958904,
"grad_norm": 35.130393981933594,
"learning_rate": 2.392054794520548e-05,
"loss": 1.1933,
"step": 4450
},
{
"epoch": 0.6164383561643836,
"grad_norm": 71.43773651123047,
"learning_rate": 2.385205479452055e-05,
"loss": 1.0783,
"step": 4500
},
{
"epoch": 0.6232876712328768,
"grad_norm": 21.982946395874023,
"learning_rate": 2.378356164383562e-05,
"loss": 1.0517,
"step": 4550
},
{
"epoch": 0.6301369863013698,
"grad_norm": 21.33681297302246,
"learning_rate": 2.3715068493150685e-05,
"loss": 1.0829,
"step": 4600
},
{
"epoch": 0.636986301369863,
"grad_norm": 14.706864356994629,
"learning_rate": 2.3646575342465754e-05,
"loss": 1.0452,
"step": 4650
},
{
"epoch": 0.6438356164383562,
"grad_norm": 19.40340805053711,
"learning_rate": 2.3578082191780824e-05,
"loss": 1.0843,
"step": 4700
},
{
"epoch": 0.6506849315068494,
"grad_norm": 66.0240478515625,
"learning_rate": 2.3509589041095893e-05,
"loss": 0.9722,
"step": 4750
},
{
"epoch": 0.6575342465753424,
"grad_norm": 30.19057273864746,
"learning_rate": 2.344109589041096e-05,
"loss": 0.8738,
"step": 4800
},
{
"epoch": 0.6643835616438356,
"grad_norm": 24.212846755981445,
"learning_rate": 2.337260273972603e-05,
"loss": 0.9865,
"step": 4850
},
{
"epoch": 0.6712328767123288,
"grad_norm": 24.953773498535156,
"learning_rate": 2.3304109589041098e-05,
"loss": 0.9409,
"step": 4900
},
{
"epoch": 0.678082191780822,
"grad_norm": 32.05400466918945,
"learning_rate": 2.3235616438356168e-05,
"loss": 0.9674,
"step": 4950
},
{
"epoch": 0.684931506849315,
"grad_norm": 20.69804573059082,
"learning_rate": 2.3167123287671234e-05,
"loss": 0.9547,
"step": 5000
},
{
"epoch": 0.6917808219178082,
"grad_norm": 9.982364654541016,
"learning_rate": 2.3098630136986303e-05,
"loss": 0.9724,
"step": 5050
},
{
"epoch": 0.6986301369863014,
"grad_norm": 24.489803314208984,
"learning_rate": 2.3030136986301373e-05,
"loss": 1.0141,
"step": 5100
},
{
"epoch": 0.7054794520547946,
"grad_norm": 22.05426597595215,
"learning_rate": 2.296164383561644e-05,
"loss": 1.0011,
"step": 5150
},
{
"epoch": 0.7123287671232876,
"grad_norm": 11.475118637084961,
"learning_rate": 2.2893150684931505e-05,
"loss": 1.0293,
"step": 5200
},
{
"epoch": 0.7191780821917808,
"grad_norm": 24.616731643676758,
"learning_rate": 2.2824657534246574e-05,
"loss": 1.0347,
"step": 5250
},
{
"epoch": 0.726027397260274,
"grad_norm": 13.638339042663574,
"learning_rate": 2.2756164383561644e-05,
"loss": 0.9843,
"step": 5300
},
{
"epoch": 0.7328767123287672,
"grad_norm": 26.214601516723633,
"learning_rate": 2.2687671232876713e-05,
"loss": 0.9909,
"step": 5350
},
{
"epoch": 0.7397260273972602,
"grad_norm": 18.321882247924805,
"learning_rate": 2.261917808219178e-05,
"loss": 0.9893,
"step": 5400
},
{
"epoch": 0.7465753424657534,
"grad_norm": 14.493846893310547,
"learning_rate": 2.255068493150685e-05,
"loss": 1.0089,
"step": 5450
},
{
"epoch": 0.7534246575342466,
"grad_norm": 29.463472366333008,
"learning_rate": 2.248219178082192e-05,
"loss": 1.052,
"step": 5500
},
{
"epoch": 0.7602739726027398,
"grad_norm": 16.674697875976562,
"learning_rate": 2.2413698630136988e-05,
"loss": 0.977,
"step": 5550
},
{
"epoch": 0.7671232876712328,
"grad_norm": 20.96110725402832,
"learning_rate": 2.2345205479452054e-05,
"loss": 0.9393,
"step": 5600
},
{
"epoch": 0.773972602739726,
"grad_norm": 10.15765380859375,
"learning_rate": 2.2276712328767123e-05,
"loss": 0.9647,
"step": 5650
},
{
"epoch": 0.7808219178082192,
"grad_norm": 18.204395294189453,
"learning_rate": 2.2208219178082193e-05,
"loss": 1.0376,
"step": 5700
},
{
"epoch": 0.7876712328767124,
"grad_norm": 12.090936660766602,
"learning_rate": 2.2139726027397262e-05,
"loss": 1.0562,
"step": 5750
},
{
"epoch": 0.7945205479452054,
"grad_norm": 23.062904357910156,
"learning_rate": 2.207123287671233e-05,
"loss": 1.0397,
"step": 5800
},
{
"epoch": 0.8013698630136986,
"grad_norm": 13.404748916625977,
"learning_rate": 2.2002739726027398e-05,
"loss": 1.0177,
"step": 5850
},
{
"epoch": 0.8082191780821918,
"grad_norm": 16.615171432495117,
"learning_rate": 2.1934246575342467e-05,
"loss": 0.9627,
"step": 5900
},
{
"epoch": 0.815068493150685,
"grad_norm": 24.662452697753906,
"learning_rate": 2.1865753424657537e-05,
"loss": 0.9447,
"step": 5950
},
{
"epoch": 0.821917808219178,
"grad_norm": 9.534379959106445,
"learning_rate": 2.1797260273972603e-05,
"loss": 1.0024,
"step": 6000
},
{
"epoch": 0.821917808219178,
"eval_exact_match": 80.7379375591296,
"eval_f1": 88.15133262306199,
"eval_runtime": 405.8112,
"eval_samples_per_second": 26.047,
"eval_steps_per_second": 1.629,
"step": 6000
},
{
"epoch": 0.8287671232876712,
"grad_norm": 31.16766929626465,
"learning_rate": 2.1728767123287672e-05,
"loss": 0.9153,
"step": 6050
},
{
"epoch": 0.8356164383561644,
"grad_norm": 21.178213119506836,
"learning_rate": 2.1660273972602742e-05,
"loss": 0.919,
"step": 6100
},
{
"epoch": 0.8424657534246576,
"grad_norm": 16.18219566345215,
"learning_rate": 2.159178082191781e-05,
"loss": 1.0232,
"step": 6150
},
{
"epoch": 0.8493150684931506,
"grad_norm": 10.178057670593262,
"learning_rate": 2.1523287671232877e-05,
"loss": 1.0113,
"step": 6200
},
{
"epoch": 0.8561643835616438,
"grad_norm": 32.234249114990234,
"learning_rate": 2.1454794520547947e-05,
"loss": 0.9115,
"step": 6250
},
{
"epoch": 0.863013698630137,
"grad_norm": 10.931483268737793,
"learning_rate": 2.1386301369863016e-05,
"loss": 0.9493,
"step": 6300
},
{
"epoch": 0.8698630136986302,
"grad_norm": 18.961946487426758,
"learning_rate": 2.1317808219178086e-05,
"loss": 0.9662,
"step": 6350
},
{
"epoch": 0.8767123287671232,
"grad_norm": 32.1358642578125,
"learning_rate": 2.125068493150685e-05,
"loss": 0.9122,
"step": 6400
},
{
"epoch": 0.8835616438356164,
"grad_norm": 27.434032440185547,
"learning_rate": 2.118219178082192e-05,
"loss": 0.9665,
"step": 6450
},
{
"epoch": 0.8904109589041096,
"grad_norm": 13.604903221130371,
"learning_rate": 2.1113698630136987e-05,
"loss": 0.9752,
"step": 6500
},
{
"epoch": 0.8972602739726028,
"grad_norm": 13.965779304504395,
"learning_rate": 2.1045205479452053e-05,
"loss": 0.8704,
"step": 6550
},
{
"epoch": 0.9041095890410958,
"grad_norm": 18.716514587402344,
"learning_rate": 2.0976712328767122e-05,
"loss": 0.9159,
"step": 6600
},
{
"epoch": 0.910958904109589,
"grad_norm": 19.964460372924805,
"learning_rate": 2.0908219178082192e-05,
"loss": 0.9359,
"step": 6650
},
{
"epoch": 0.9178082191780822,
"grad_norm": 13.847062110900879,
"learning_rate": 2.083972602739726e-05,
"loss": 0.8623,
"step": 6700
},
{
"epoch": 0.9246575342465754,
"grad_norm": 25.339975357055664,
"learning_rate": 2.0771232876712327e-05,
"loss": 0.9804,
"step": 6750
},
{
"epoch": 0.9315068493150684,
"grad_norm": 16.102794647216797,
"learning_rate": 2.0702739726027397e-05,
"loss": 0.8721,
"step": 6800
},
{
"epoch": 0.9383561643835616,
"grad_norm": 15.555779457092285,
"learning_rate": 2.0634246575342466e-05,
"loss": 0.9942,
"step": 6850
},
{
"epoch": 0.9452054794520548,
"grad_norm": 14.534811019897461,
"learning_rate": 2.0565753424657536e-05,
"loss": 0.9333,
"step": 6900
},
{
"epoch": 0.952054794520548,
"grad_norm": 24.963340759277344,
"learning_rate": 2.0497260273972602e-05,
"loss": 0.966,
"step": 6950
},
{
"epoch": 0.958904109589041,
"grad_norm": 42.927734375,
"learning_rate": 2.042876712328767e-05,
"loss": 0.913,
"step": 7000
},
{
"epoch": 0.9657534246575342,
"grad_norm": 14.816558837890625,
"learning_rate": 2.036027397260274e-05,
"loss": 0.9717,
"step": 7050
},
{
"epoch": 0.9726027397260274,
"grad_norm": 27.83946418762207,
"learning_rate": 2.029178082191781e-05,
"loss": 0.9855,
"step": 7100
},
{
"epoch": 0.9794520547945206,
"grad_norm": 23.87200164794922,
"learning_rate": 2.0223287671232876e-05,
"loss": 0.9934,
"step": 7150
},
{
"epoch": 0.9863013698630136,
"grad_norm": 30.313343048095703,
"learning_rate": 2.0154794520547946e-05,
"loss": 1.0148,
"step": 7200
},
{
"epoch": 0.9931506849315068,
"grad_norm": 23.50067138671875,
"learning_rate": 2.0086301369863015e-05,
"loss": 0.9578,
"step": 7250
},
{
"epoch": 1.0,
"grad_norm": 17.968570709228516,
"learning_rate": 2.0017808219178085e-05,
"loss": 0.821,
"step": 7300
},
{
"epoch": 1.0068493150684932,
"grad_norm": 44.593414306640625,
"learning_rate": 1.994931506849315e-05,
"loss": 0.6823,
"step": 7350
},
{
"epoch": 1.0136986301369864,
"grad_norm": 12.959754943847656,
"learning_rate": 1.988082191780822e-05,
"loss": 0.6937,
"step": 7400
},
{
"epoch": 1.0205479452054795,
"grad_norm": 20.291378021240234,
"learning_rate": 1.981232876712329e-05,
"loss": 0.6457,
"step": 7450
},
{
"epoch": 1.0273972602739727,
"grad_norm": 5.460036277770996,
"learning_rate": 1.974383561643836e-05,
"loss": 0.729,
"step": 7500
},
{
"epoch": 1.0342465753424657,
"grad_norm": 17.41657257080078,
"learning_rate": 1.9675342465753425e-05,
"loss": 0.7032,
"step": 7550
},
{
"epoch": 1.0410958904109588,
"grad_norm": 10.595620155334473,
"learning_rate": 1.9606849315068495e-05,
"loss": 0.6788,
"step": 7600
},
{
"epoch": 1.047945205479452,
"grad_norm": 32.301509857177734,
"learning_rate": 1.9538356164383564e-05,
"loss": 0.669,
"step": 7650
},
{
"epoch": 1.0547945205479452,
"grad_norm": 5.369728088378906,
"learning_rate": 1.9469863013698634e-05,
"loss": 0.6169,
"step": 7700
},
{
"epoch": 1.0616438356164384,
"grad_norm": 38.604183197021484,
"learning_rate": 1.94013698630137e-05,
"loss": 0.6585,
"step": 7750
},
{
"epoch": 1.0684931506849316,
"grad_norm": 12.535406112670898,
"learning_rate": 1.9332876712328766e-05,
"loss": 0.6448,
"step": 7800
},
{
"epoch": 1.0753424657534247,
"grad_norm": 11.292516708374023,
"learning_rate": 1.9264383561643835e-05,
"loss": 0.6429,
"step": 7850
},
{
"epoch": 1.0821917808219177,
"grad_norm": 15.400158882141113,
"learning_rate": 1.9195890410958905e-05,
"loss": 0.6553,
"step": 7900
},
{
"epoch": 1.0890410958904109,
"grad_norm": 28.782424926757812,
"learning_rate": 1.912739726027397e-05,
"loss": 0.7737,
"step": 7950
},
{
"epoch": 1.095890410958904,
"grad_norm": 18.407026290893555,
"learning_rate": 1.905890410958904e-05,
"loss": 0.5717,
"step": 8000
},
{
"epoch": 1.095890410958904,
"eval_exact_match": 81.46641438032167,
"eval_f1": 88.73804254841154,
"eval_runtime": 407.6234,
"eval_samples_per_second": 25.931,
"eval_steps_per_second": 1.622,
"step": 8000
},
{
"epoch": 1.1027397260273972,
"grad_norm": 54.460880279541016,
"learning_rate": 1.899041095890411e-05,
"loss": 0.6957,
"step": 8050
},
{
"epoch": 1.1095890410958904,
"grad_norm": 10.459880828857422,
"learning_rate": 1.892191780821918e-05,
"loss": 0.6871,
"step": 8100
},
{
"epoch": 1.1164383561643836,
"grad_norm": 21.884292602539062,
"learning_rate": 1.8853424657534245e-05,
"loss": 0.7586,
"step": 8150
},
{
"epoch": 1.1232876712328768,
"grad_norm": 18.18182373046875,
"learning_rate": 1.8784931506849315e-05,
"loss": 0.7294,
"step": 8200
},
{
"epoch": 1.13013698630137,
"grad_norm": 14.117548942565918,
"learning_rate": 1.8716438356164384e-05,
"loss": 0.6837,
"step": 8250
},
{
"epoch": 1.1369863013698631,
"grad_norm": 20.044261932373047,
"learning_rate": 1.8647945205479454e-05,
"loss": 0.7353,
"step": 8300
},
{
"epoch": 1.143835616438356,
"grad_norm": 16.077611923217773,
"learning_rate": 1.857945205479452e-05,
"loss": 0.6038,
"step": 8350
},
{
"epoch": 1.1506849315068493,
"grad_norm": 28.85369300842285,
"learning_rate": 1.851095890410959e-05,
"loss": 0.6414,
"step": 8400
},
{
"epoch": 1.1575342465753424,
"grad_norm": 14.509927749633789,
"learning_rate": 1.844246575342466e-05,
"loss": 0.7068,
"step": 8450
},
{
"epoch": 1.1643835616438356,
"grad_norm": 14.299630165100098,
"learning_rate": 1.837397260273973e-05,
"loss": 0.7094,
"step": 8500
},
{
"epoch": 1.1712328767123288,
"grad_norm": 11.768505096435547,
"learning_rate": 1.8305479452054794e-05,
"loss": 0.6914,
"step": 8550
},
{
"epoch": 1.178082191780822,
"grad_norm": 42.37126922607422,
"learning_rate": 1.8236986301369864e-05,
"loss": 0.6539,
"step": 8600
},
{
"epoch": 1.1849315068493151,
"grad_norm": 14.21442985534668,
"learning_rate": 1.8168493150684933e-05,
"loss": 0.7302,
"step": 8650
},
{
"epoch": 1.191780821917808,
"grad_norm": 19.04937171936035,
"learning_rate": 1.8100000000000003e-05,
"loss": 0.7685,
"step": 8700
},
{
"epoch": 1.1986301369863013,
"grad_norm": 17.215967178344727,
"learning_rate": 1.803150684931507e-05,
"loss": 0.6888,
"step": 8750
},
{
"epoch": 1.2054794520547945,
"grad_norm": 16.23516082763672,
"learning_rate": 1.796301369863014e-05,
"loss": 0.6825,
"step": 8800
},
{
"epoch": 1.2123287671232876,
"grad_norm": 39.78145217895508,
"learning_rate": 1.7894520547945208e-05,
"loss": 0.7799,
"step": 8850
},
{
"epoch": 1.2191780821917808,
"grad_norm": 26.684986114501953,
"learning_rate": 1.7826027397260277e-05,
"loss": 0.7425,
"step": 8900
},
{
"epoch": 1.226027397260274,
"grad_norm": 35.67079544067383,
"learning_rate": 1.7757534246575343e-05,
"loss": 0.7405,
"step": 8950
},
{
"epoch": 1.2328767123287672,
"grad_norm": 18.228994369506836,
"learning_rate": 1.7689041095890413e-05,
"loss": 0.5968,
"step": 9000
},
{
"epoch": 1.2397260273972603,
"grad_norm": 15.768519401550293,
"learning_rate": 1.7620547945205482e-05,
"loss": 0.6791,
"step": 9050
},
{
"epoch": 1.2465753424657535,
"grad_norm": 26.350936889648438,
"learning_rate": 1.7552054794520545e-05,
"loss": 0.7312,
"step": 9100
},
{
"epoch": 1.2534246575342465,
"grad_norm": 22.138206481933594,
"learning_rate": 1.7483561643835615e-05,
"loss": 0.6496,
"step": 9150
},
{
"epoch": 1.2602739726027397,
"grad_norm": 20.530515670776367,
"learning_rate": 1.7415068493150684e-05,
"loss": 0.6592,
"step": 9200
},
{
"epoch": 1.2671232876712328,
"grad_norm": 18.60872459411621,
"learning_rate": 1.7346575342465754e-05,
"loss": 0.7109,
"step": 9250
},
{
"epoch": 1.273972602739726,
"grad_norm": 10.934627532958984,
"learning_rate": 1.727808219178082e-05,
"loss": 0.6364,
"step": 9300
},
{
"epoch": 1.2808219178082192,
"grad_norm": 24.400938034057617,
"learning_rate": 1.720958904109589e-05,
"loss": 0.6624,
"step": 9350
},
{
"epoch": 1.2876712328767124,
"grad_norm": 33.15473556518555,
"learning_rate": 1.714109589041096e-05,
"loss": 0.7526,
"step": 9400
},
{
"epoch": 1.2945205479452055,
"grad_norm": 4.9253339767456055,
"learning_rate": 1.7072602739726028e-05,
"loss": 0.6094,
"step": 9450
},
{
"epoch": 1.3013698630136985,
"grad_norm": 26.45025634765625,
"learning_rate": 1.7004109589041094e-05,
"loss": 0.6779,
"step": 9500
},
{
"epoch": 1.308219178082192,
"grad_norm": 12.181562423706055,
"learning_rate": 1.6935616438356164e-05,
"loss": 0.7728,
"step": 9550
},
{
"epoch": 1.3150684931506849,
"grad_norm": 6.9998040199279785,
"learning_rate": 1.6867123287671233e-05,
"loss": 0.6221,
"step": 9600
},
{
"epoch": 1.321917808219178,
"grad_norm": 40.702369689941406,
"learning_rate": 1.6798630136986303e-05,
"loss": 0.7124,
"step": 9650
},
{
"epoch": 1.3287671232876712,
"grad_norm": 18.84299659729004,
"learning_rate": 1.673013698630137e-05,
"loss": 0.6579,
"step": 9700
},
{
"epoch": 1.3356164383561644,
"grad_norm": 24.911535263061523,
"learning_rate": 1.6661643835616438e-05,
"loss": 0.7197,
"step": 9750
},
{
"epoch": 1.3424657534246576,
"grad_norm": 24.64927101135254,
"learning_rate": 1.6593150684931508e-05,
"loss": 0.6767,
"step": 9800
},
{
"epoch": 1.3493150684931507,
"grad_norm": 11.854528427124023,
"learning_rate": 1.6524657534246577e-05,
"loss": 0.6726,
"step": 9850
},
{
"epoch": 1.356164383561644,
"grad_norm": 38.699310302734375,
"learning_rate": 1.6456164383561643e-05,
"loss": 0.6484,
"step": 9900
},
{
"epoch": 1.3630136986301369,
"grad_norm": 21.76763916015625,
"learning_rate": 1.6387671232876713e-05,
"loss": 0.6959,
"step": 9950
},
{
"epoch": 1.36986301369863,
"grad_norm": 15.432331085205078,
"learning_rate": 1.6319178082191782e-05,
"loss": 0.6585,
"step": 10000
},
{
"epoch": 1.36986301369863,
"eval_exact_match": 80.78524124881741,
"eval_f1": 88.39324523394289,
"eval_runtime": 407.758,
"eval_samples_per_second": 25.922,
"eval_steps_per_second": 1.621,
"step": 10000
},
{
"epoch": 1.3767123287671232,
"grad_norm": 21.714828491210938,
"learning_rate": 1.625068493150685e-05,
"loss": 0.6976,
"step": 10050
},
{
"epoch": 1.3835616438356164,
"grad_norm": 6.602792739868164,
"learning_rate": 1.6182191780821918e-05,
"loss": 0.6094,
"step": 10100
},
{
"epoch": 1.3904109589041096,
"grad_norm": 42.471412658691406,
"learning_rate": 1.6113698630136987e-05,
"loss": 0.7151,
"step": 10150
},
{
"epoch": 1.3972602739726028,
"grad_norm": 11.658584594726562,
"learning_rate": 1.6045205479452057e-05,
"loss": 0.639,
"step": 10200
},
{
"epoch": 1.404109589041096,
"grad_norm": 37.821659088134766,
"learning_rate": 1.5976712328767126e-05,
"loss": 0.6079,
"step": 10250
},
{
"epoch": 1.410958904109589,
"grad_norm": 5.837065696716309,
"learning_rate": 1.5908219178082192e-05,
"loss": 0.5878,
"step": 10300
},
{
"epoch": 1.4178082191780823,
"grad_norm": 67.67562866210938,
"learning_rate": 1.5839726027397258e-05,
"loss": 0.7724,
"step": 10350
},
{
"epoch": 1.4246575342465753,
"grad_norm": 2.624040365219116,
"learning_rate": 1.5771232876712328e-05,
"loss": 0.6753,
"step": 10400
},
{
"epoch": 1.4315068493150684,
"grad_norm": 32.9188232421875,
"learning_rate": 1.5702739726027397e-05,
"loss": 0.6534,
"step": 10450
},
{
"epoch": 1.4383561643835616,
"grad_norm": 32.16576385498047,
"learning_rate": 1.5634246575342463e-05,
"loss": 0.716,
"step": 10500
},
{
"epoch": 1.4452054794520548,
"grad_norm": 19.11595344543457,
"learning_rate": 1.5565753424657533e-05,
"loss": 0.6081,
"step": 10550
},
{
"epoch": 1.452054794520548,
"grad_norm": 13.033523559570312,
"learning_rate": 1.5497260273972602e-05,
"loss": 0.7286,
"step": 10600
},
{
"epoch": 1.4589041095890412,
"grad_norm": 22.972614288330078,
"learning_rate": 1.5428767123287672e-05,
"loss": 0.6916,
"step": 10650
},
{
"epoch": 1.4657534246575343,
"grad_norm": 14.032968521118164,
"learning_rate": 1.5360273972602738e-05,
"loss": 0.6643,
"step": 10700
},
{
"epoch": 1.4726027397260273,
"grad_norm": 60.66258239746094,
"learning_rate": 1.5291780821917807e-05,
"loss": 0.5659,
"step": 10750
},
{
"epoch": 1.4794520547945205,
"grad_norm": 18.83857536315918,
"learning_rate": 1.5223287671232877e-05,
"loss": 0.6599,
"step": 10800
},
{
"epoch": 1.4863013698630136,
"grad_norm": 16.09821128845215,
"learning_rate": 1.5154794520547946e-05,
"loss": 0.6523,
"step": 10850
},
{
"epoch": 1.4931506849315068,
"grad_norm": 8.068375587463379,
"learning_rate": 1.5086301369863012e-05,
"loss": 0.6724,
"step": 10900
},
{
"epoch": 1.5,
"grad_norm": 16.360702514648438,
"learning_rate": 1.5017808219178082e-05,
"loss": 0.6371,
"step": 10950
},
{
"epoch": 1.5068493150684932,
"grad_norm": 17.347824096679688,
"learning_rate": 1.4949315068493151e-05,
"loss": 0.6732,
"step": 11000
},
{
"epoch": 1.5136986301369864,
"grad_norm": 45.4647216796875,
"learning_rate": 1.4880821917808219e-05,
"loss": 0.721,
"step": 11050
},
{
"epoch": 1.5205479452054793,
"grad_norm": 53.550174713134766,
"learning_rate": 1.4812328767123289e-05,
"loss": 0.6261,
"step": 11100
},
{
"epoch": 1.5273972602739727,
"grad_norm": 27.888072967529297,
"learning_rate": 1.4743835616438356e-05,
"loss": 0.7,
"step": 11150
},
{
"epoch": 1.5342465753424657,
"grad_norm": 23.892024993896484,
"learning_rate": 1.4675342465753426e-05,
"loss": 0.6329,
"step": 11200
},
{
"epoch": 1.541095890410959,
"grad_norm": 12.964653968811035,
"learning_rate": 1.4606849315068494e-05,
"loss": 0.6893,
"step": 11250
},
{
"epoch": 1.547945205479452,
"grad_norm": 16.30516242980957,
"learning_rate": 1.4538356164383563e-05,
"loss": 0.6674,
"step": 11300
},
{
"epoch": 1.5547945205479452,
"grad_norm": 20.829771041870117,
"learning_rate": 1.4469863013698629e-05,
"loss": 0.7394,
"step": 11350
},
{
"epoch": 1.5616438356164384,
"grad_norm": 21.59797477722168,
"learning_rate": 1.4401369863013699e-05,
"loss": 0.7592,
"step": 11400
},
{
"epoch": 1.5684931506849316,
"grad_norm": 14.089178085327148,
"learning_rate": 1.4332876712328766e-05,
"loss": 0.6692,
"step": 11450
},
{
"epoch": 1.5753424657534247,
"grad_norm": 9.009936332702637,
"learning_rate": 1.4264383561643836e-05,
"loss": 0.5945,
"step": 11500
},
{
"epoch": 1.5821917808219177,
"grad_norm": 14.968718528747559,
"learning_rate": 1.4195890410958904e-05,
"loss": 0.7055,
"step": 11550
},
{
"epoch": 1.589041095890411,
"grad_norm": 8.840102195739746,
"learning_rate": 1.4127397260273973e-05,
"loss": 0.7345,
"step": 11600
},
{
"epoch": 1.595890410958904,
"grad_norm": 17.11764907836914,
"learning_rate": 1.4058904109589041e-05,
"loss": 0.7073,
"step": 11650
},
{
"epoch": 1.6027397260273972,
"grad_norm": 11.691437721252441,
"learning_rate": 1.399041095890411e-05,
"loss": 0.6319,
"step": 11700
},
{
"epoch": 1.6095890410958904,
"grad_norm": 12.635778427124023,
"learning_rate": 1.3921917808219178e-05,
"loss": 0.6966,
"step": 11750
},
{
"epoch": 1.6164383561643836,
"grad_norm": 8.899778366088867,
"learning_rate": 1.3853424657534248e-05,
"loss": 0.7087,
"step": 11800
},
{
"epoch": 1.6232876712328768,
"grad_norm": 20.235586166381836,
"learning_rate": 1.3784931506849315e-05,
"loss": 0.6193,
"step": 11850
},
{
"epoch": 1.6301369863013697,
"grad_norm": 36.906707763671875,
"learning_rate": 1.3716438356164385e-05,
"loss": 0.6291,
"step": 11900
},
{
"epoch": 1.6369863013698631,
"grad_norm": 11.22154712677002,
"learning_rate": 1.3647945205479453e-05,
"loss": 0.7093,
"step": 11950
},
{
"epoch": 1.643835616438356,
"grad_norm": 27.797801971435547,
"learning_rate": 1.357945205479452e-05,
"loss": 0.6589,
"step": 12000
},
{
"epoch": 1.643835616438356,
"eval_exact_match": 82.42194891201514,
"eval_f1": 89.45487444483695,
"eval_runtime": 406.0814,
"eval_samples_per_second": 26.029,
"eval_steps_per_second": 1.628,
"step": 12000
},
{
"epoch": 1.6506849315068495,
"grad_norm": 39.78184509277344,
"learning_rate": 1.3510958904109588e-05,
"loss": 0.6151,
"step": 12050
},
{
"epoch": 1.6575342465753424,
"grad_norm": 8.190918922424316,
"learning_rate": 1.3442465753424658e-05,
"loss": 0.6776,
"step": 12100
},
{
"epoch": 1.6643835616438356,
"grad_norm": 33.2342529296875,
"learning_rate": 1.3373972602739725e-05,
"loss": 0.6841,
"step": 12150
},
{
"epoch": 1.6712328767123288,
"grad_norm": 20.676816940307617,
"learning_rate": 1.3305479452054795e-05,
"loss": 0.6696,
"step": 12200
},
{
"epoch": 1.678082191780822,
"grad_norm": 28.78380584716797,
"learning_rate": 1.3236986301369863e-05,
"loss": 0.6833,
"step": 12250
},
{
"epoch": 1.6849315068493151,
"grad_norm": 7.916901588439941,
"learning_rate": 1.3168493150684932e-05,
"loss": 0.6699,
"step": 12300
},
{
"epoch": 1.691780821917808,
"grad_norm": 19.101404190063477,
"learning_rate": 1.31e-05,
"loss": 0.698,
"step": 12350
},
{
"epoch": 1.6986301369863015,
"grad_norm": 7.5608978271484375,
"learning_rate": 1.303150684931507e-05,
"loss": 0.6808,
"step": 12400
},
{
"epoch": 1.7054794520547945,
"grad_norm": 24.476348876953125,
"learning_rate": 1.2963013698630137e-05,
"loss": 0.7041,
"step": 12450
},
{
"epoch": 1.7123287671232876,
"grad_norm": 22.60247039794922,
"learning_rate": 1.2894520547945207e-05,
"loss": 0.6406,
"step": 12500
},
{
"epoch": 1.7191780821917808,
"grad_norm": 18.08481216430664,
"learning_rate": 1.2826027397260274e-05,
"loss": 0.6683,
"step": 12550
},
{
"epoch": 1.726027397260274,
"grad_norm": 254.4781951904297,
"learning_rate": 1.2757534246575342e-05,
"loss": 0.5617,
"step": 12600
},
{
"epoch": 1.7328767123287672,
"grad_norm": 24.120647430419922,
"learning_rate": 1.268904109589041e-05,
"loss": 0.6121,
"step": 12650
},
{
"epoch": 1.7397260273972601,
"grad_norm": 25.768285751342773,
"learning_rate": 1.262054794520548e-05,
"loss": 0.5745,
"step": 12700
},
{
"epoch": 1.7465753424657535,
"grad_norm": 13.516427993774414,
"learning_rate": 1.2552054794520547e-05,
"loss": 0.6538,
"step": 12750
},
{
"epoch": 1.7534246575342465,
"grad_norm": 26.595272064208984,
"learning_rate": 1.2483561643835617e-05,
"loss": 0.7562,
"step": 12800
},
{
"epoch": 1.7602739726027399,
"grad_norm": 9.390408515930176,
"learning_rate": 1.2415068493150685e-05,
"loss": 0.6179,
"step": 12850
},
{
"epoch": 1.7671232876712328,
"grad_norm": 25.727460861206055,
"learning_rate": 1.2346575342465754e-05,
"loss": 0.6364,
"step": 12900
},
{
"epoch": 1.773972602739726,
"grad_norm": 17.040943145751953,
"learning_rate": 1.2278082191780822e-05,
"loss": 0.6312,
"step": 12950
},
{
"epoch": 1.7808219178082192,
"grad_norm": 47.64375305175781,
"learning_rate": 1.2209589041095891e-05,
"loss": 0.6916,
"step": 13000
},
{
"epoch": 1.7876712328767124,
"grad_norm": 7.669281005859375,
"learning_rate": 1.2141095890410959e-05,
"loss": 0.6765,
"step": 13050
},
{
"epoch": 1.7945205479452055,
"grad_norm": 27.307491302490234,
"learning_rate": 1.2072602739726028e-05,
"loss": 0.6479,
"step": 13100
},
{
"epoch": 1.8013698630136985,
"grad_norm": 34.39345932006836,
"learning_rate": 1.2004109589041096e-05,
"loss": 0.6739,
"step": 13150
},
{
"epoch": 1.808219178082192,
"grad_norm": 3.3462016582489014,
"learning_rate": 1.1935616438356166e-05,
"loss": 0.6523,
"step": 13200
},
{
"epoch": 1.8150684931506849,
"grad_norm": 6.420201301574707,
"learning_rate": 1.1867123287671232e-05,
"loss": 0.6458,
"step": 13250
},
{
"epoch": 1.821917808219178,
"grad_norm": 60.785194396972656,
"learning_rate": 1.1798630136986301e-05,
"loss": 0.621,
"step": 13300
},
{
"epoch": 1.8287671232876712,
"grad_norm": 25.992483139038086,
"learning_rate": 1.1730136986301369e-05,
"loss": 0.6654,
"step": 13350
},
{
"epoch": 1.8356164383561644,
"grad_norm": 26.358213424682617,
"learning_rate": 1.1661643835616439e-05,
"loss": 0.6936,
"step": 13400
},
{
"epoch": 1.8424657534246576,
"grad_norm": 19.42777442932129,
"learning_rate": 1.1593150684931506e-05,
"loss": 0.6549,
"step": 13450
},
{
"epoch": 1.8493150684931505,
"grad_norm": 35.19224548339844,
"learning_rate": 1.1524657534246576e-05,
"loss": 0.6872,
"step": 13500
},
{
"epoch": 1.856164383561644,
"grad_norm": 24.116058349609375,
"learning_rate": 1.1456164383561644e-05,
"loss": 0.6744,
"step": 13550
},
{
"epoch": 1.8630136986301369,
"grad_norm": 29.181964874267578,
"learning_rate": 1.1387671232876713e-05,
"loss": 0.7206,
"step": 13600
},
{
"epoch": 1.8698630136986303,
"grad_norm": 33.76987838745117,
"learning_rate": 1.131917808219178e-05,
"loss": 0.6819,
"step": 13650
},
{
"epoch": 1.8767123287671232,
"grad_norm": 17.98587417602539,
"learning_rate": 1.125068493150685e-05,
"loss": 0.6991,
"step": 13700
},
{
"epoch": 1.8835616438356164,
"grad_norm": 30.777263641357422,
"learning_rate": 1.1182191780821918e-05,
"loss": 0.6692,
"step": 13750
},
{
"epoch": 1.8904109589041096,
"grad_norm": 9.233376502990723,
"learning_rate": 1.1113698630136988e-05,
"loss": 0.6068,
"step": 13800
},
{
"epoch": 1.8972602739726028,
"grad_norm": 6.541473388671875,
"learning_rate": 1.1045205479452055e-05,
"loss": 0.6263,
"step": 13850
},
{
"epoch": 1.904109589041096,
"grad_norm": 10.6819486618042,
"learning_rate": 1.0976712328767123e-05,
"loss": 0.5947,
"step": 13900
},
{
"epoch": 1.910958904109589,
"grad_norm": 11.901646614074707,
"learning_rate": 1.0908219178082191e-05,
"loss": 0.722,
"step": 13950
},
{
"epoch": 1.9178082191780823,
"grad_norm": 11.687748908996582,
"learning_rate": 1.083972602739726e-05,
"loss": 0.6237,
"step": 14000
},
{
"epoch": 1.9178082191780823,
"eval_exact_match": 82.82876064333018,
"eval_f1": 89.81835180261338,
"eval_runtime": 406.8019,
"eval_samples_per_second": 25.983,
"eval_steps_per_second": 1.625,
"step": 14000
},
{
"epoch": 1.9246575342465753,
"grad_norm": 22.835041046142578,
"learning_rate": 1.0771232876712328e-05,
"loss": 0.657,
"step": 14050
},
{
"epoch": 1.9315068493150684,
"grad_norm": 15.053028106689453,
"learning_rate": 1.0702739726027398e-05,
"loss": 0.7027,
"step": 14100
},
{
"epoch": 1.9383561643835616,
"grad_norm": 18.73754119873047,
"learning_rate": 1.0634246575342465e-05,
"loss": 0.6092,
"step": 14150
},
{
"epoch": 1.9452054794520548,
"grad_norm": 22.99529266357422,
"learning_rate": 1.0565753424657535e-05,
"loss": 0.6337,
"step": 14200
},
{
"epoch": 1.952054794520548,
"grad_norm": 11.403436660766602,
"learning_rate": 1.0497260273972603e-05,
"loss": 0.7014,
"step": 14250
},
{
"epoch": 1.958904109589041,
"grad_norm": 10.497685432434082,
"learning_rate": 1.0428767123287672e-05,
"loss": 0.624,
"step": 14300
},
{
"epoch": 1.9657534246575343,
"grad_norm": 39.24784851074219,
"learning_rate": 1.036027397260274e-05,
"loss": 0.7035,
"step": 14350
},
{
"epoch": 1.9726027397260273,
"grad_norm": 10.58161449432373,
"learning_rate": 1.029178082191781e-05,
"loss": 0.6705,
"step": 14400
},
{
"epoch": 1.9794520547945207,
"grad_norm": 30.54125213623047,
"learning_rate": 1.0223287671232877e-05,
"loss": 0.6178,
"step": 14450
},
{
"epoch": 1.9863013698630136,
"grad_norm": 11.545398712158203,
"learning_rate": 1.0154794520547947e-05,
"loss": 0.6929,
"step": 14500
},
{
"epoch": 1.9931506849315068,
"grad_norm": 12.356890678405762,
"learning_rate": 1.0086301369863013e-05,
"loss": 0.6686,
"step": 14550
},
{
"epoch": 2.0,
"grad_norm": 12.808874130249023,
"learning_rate": 1.0017808219178082e-05,
"loss": 0.709,
"step": 14600
},
{
"epoch": 2.006849315068493,
"grad_norm": 20.87654685974121,
"learning_rate": 9.94931506849315e-06,
"loss": 0.3807,
"step": 14650
},
{
"epoch": 2.0136986301369864,
"grad_norm": 11.455994606018066,
"learning_rate": 9.88082191780822e-06,
"loss": 0.4346,
"step": 14700
},
{
"epoch": 2.0205479452054793,
"grad_norm": 9.450268745422363,
"learning_rate": 9.812328767123287e-06,
"loss": 0.4046,
"step": 14750
},
{
"epoch": 2.0273972602739727,
"grad_norm": 11.238616943359375,
"learning_rate": 9.743835616438357e-06,
"loss": 0.4077,
"step": 14800
},
{
"epoch": 2.0342465753424657,
"grad_norm": 7.271957874298096,
"learning_rate": 9.675342465753424e-06,
"loss": 0.4039,
"step": 14850
},
{
"epoch": 2.041095890410959,
"grad_norm": 13.240756034851074,
"learning_rate": 9.606849315068494e-06,
"loss": 0.4307,
"step": 14900
},
{
"epoch": 2.047945205479452,
"grad_norm": 17.84387969970703,
"learning_rate": 9.538356164383562e-06,
"loss": 0.4196,
"step": 14950
},
{
"epoch": 2.0547945205479454,
"grad_norm": 15.702322959899902,
"learning_rate": 9.469863013698631e-06,
"loss": 0.39,
"step": 15000
},
{
"epoch": 2.0616438356164384,
"grad_norm": 17.96023178100586,
"learning_rate": 9.401369863013699e-06,
"loss": 0.3965,
"step": 15050
},
{
"epoch": 2.0684931506849313,
"grad_norm": 29.622323989868164,
"learning_rate": 9.332876712328768e-06,
"loss": 0.4599,
"step": 15100
},
{
"epoch": 2.0753424657534247,
"grad_norm": 10.847167015075684,
"learning_rate": 9.264383561643836e-06,
"loss": 0.3994,
"step": 15150
},
{
"epoch": 2.0821917808219177,
"grad_norm": 9.122156143188477,
"learning_rate": 9.195890410958904e-06,
"loss": 0.3829,
"step": 15200
},
{
"epoch": 2.089041095890411,
"grad_norm": 61.343101501464844,
"learning_rate": 9.127397260273972e-06,
"loss": 0.4434,
"step": 15250
},
{
"epoch": 2.095890410958904,
"grad_norm": 14.082651138305664,
"learning_rate": 9.058904109589041e-06,
"loss": 0.377,
"step": 15300
},
{
"epoch": 2.1027397260273974,
"grad_norm": 10.202653884887695,
"learning_rate": 8.990410958904109e-06,
"loss": 0.3685,
"step": 15350
},
{
"epoch": 2.1095890410958904,
"grad_norm": 18.169658660888672,
"learning_rate": 8.921917808219179e-06,
"loss": 0.4122,
"step": 15400
},
{
"epoch": 2.1164383561643834,
"grad_norm": 54.33354568481445,
"learning_rate": 8.853424657534246e-06,
"loss": 0.3906,
"step": 15450
},
{
"epoch": 2.1232876712328768,
"grad_norm": 6.232911109924316,
"learning_rate": 8.784931506849316e-06,
"loss": 0.3766,
"step": 15500
},
{
"epoch": 2.1301369863013697,
"grad_norm": 40.90781784057617,
"learning_rate": 8.716438356164384e-06,
"loss": 0.4178,
"step": 15550
},
{
"epoch": 2.136986301369863,
"grad_norm": 43.94190979003906,
"learning_rate": 8.647945205479453e-06,
"loss": 0.4115,
"step": 15600
},
{
"epoch": 2.143835616438356,
"grad_norm": 14.004490852355957,
"learning_rate": 8.57945205479452e-06,
"loss": 0.4495,
"step": 15650
},
{
"epoch": 2.1506849315068495,
"grad_norm": 10.205181121826172,
"learning_rate": 8.51095890410959e-06,
"loss": 0.3702,
"step": 15700
},
{
"epoch": 2.1575342465753424,
"grad_norm": 12.333789825439453,
"learning_rate": 8.442465753424658e-06,
"loss": 0.4153,
"step": 15750
},
{
"epoch": 2.1643835616438354,
"grad_norm": 10.010555267333984,
"learning_rate": 8.373972602739728e-06,
"loss": 0.3854,
"step": 15800
},
{
"epoch": 2.171232876712329,
"grad_norm": 25.08806037902832,
"learning_rate": 8.305479452054794e-06,
"loss": 0.3935,
"step": 15850
},
{
"epoch": 2.1780821917808217,
"grad_norm": 5.474767684936523,
"learning_rate": 8.236986301369863e-06,
"loss": 0.3408,
"step": 15900
},
{
"epoch": 2.184931506849315,
"grad_norm": 19.28006362915039,
"learning_rate": 8.168493150684931e-06,
"loss": 0.3059,
"step": 15950
},
{
"epoch": 2.191780821917808,
"grad_norm": 1.0139840841293335,
"learning_rate": 8.1e-06,
"loss": 0.4515,
"step": 16000
},
{
"epoch": 2.191780821917808,
"eval_exact_match": 82.21381267738883,
"eval_f1": 89.63473536642901,
"eval_runtime": 406.5079,
"eval_samples_per_second": 26.002,
"eval_steps_per_second": 1.626,
"step": 16000
},
{
"epoch": 2.1986301369863015,
"grad_norm": 26.130165100097656,
"learning_rate": 8.031506849315068e-06,
"loss": 0.3657,
"step": 16050
},
{
"epoch": 2.2054794520547945,
"grad_norm": 12.132906913757324,
"learning_rate": 7.963013698630138e-06,
"loss": 0.3715,
"step": 16100
},
{
"epoch": 2.212328767123288,
"grad_norm": 19.634357452392578,
"learning_rate": 7.894520547945205e-06,
"loss": 0.3868,
"step": 16150
},
{
"epoch": 2.219178082191781,
"grad_norm": 12.86025333404541,
"learning_rate": 7.826027397260275e-06,
"loss": 0.4119,
"step": 16200
},
{
"epoch": 2.2260273972602738,
"grad_norm": 13.171170234680176,
"learning_rate": 7.757534246575343e-06,
"loss": 0.4362,
"step": 16250
},
{
"epoch": 2.232876712328767,
"grad_norm": 32.56090545654297,
"learning_rate": 7.689041095890412e-06,
"loss": 0.3995,
"step": 16300
},
{
"epoch": 2.23972602739726,
"grad_norm": 31.1318416595459,
"learning_rate": 7.620547945205479e-06,
"loss": 0.4267,
"step": 16350
},
{
"epoch": 2.2465753424657535,
"grad_norm": 8.52885913848877,
"learning_rate": 7.5520547945205485e-06,
"loss": 0.4065,
"step": 16400
},
{
"epoch": 2.2534246575342465,
"grad_norm": 5.443692684173584,
"learning_rate": 7.483561643835616e-06,
"loss": 0.4486,
"step": 16450
},
{
"epoch": 2.26027397260274,
"grad_norm": 25.305814743041992,
"learning_rate": 7.415068493150685e-06,
"loss": 0.4277,
"step": 16500
},
{
"epoch": 2.267123287671233,
"grad_norm": 32.161224365234375,
"learning_rate": 7.3465753424657536e-06,
"loss": 0.4175,
"step": 16550
},
{
"epoch": 2.2739726027397262,
"grad_norm": 12.804214477539062,
"learning_rate": 7.278082191780822e-06,
"loss": 0.3335,
"step": 16600
},
{
"epoch": 2.280821917808219,
"grad_norm": 22.67701530456543,
"learning_rate": 7.20958904109589e-06,
"loss": 0.4323,
"step": 16650
},
{
"epoch": 2.287671232876712,
"grad_norm": 31.13144302368164,
"learning_rate": 7.142465753424657e-06,
"loss": 0.4542,
"step": 16700
},
{
"epoch": 2.2945205479452055,
"grad_norm": 20.63867950439453,
"learning_rate": 7.073972602739726e-06,
"loss": 0.4023,
"step": 16750
},
{
"epoch": 2.3013698630136985,
"grad_norm": 21.299816131591797,
"learning_rate": 7.005479452054794e-06,
"loss": 0.3852,
"step": 16800
},
{
"epoch": 2.308219178082192,
"grad_norm": 27.046512603759766,
"learning_rate": 6.936986301369863e-06,
"loss": 0.4234,
"step": 16850
},
{
"epoch": 2.315068493150685,
"grad_norm": 23.793231964111328,
"learning_rate": 6.8684931506849315e-06,
"loss": 0.4347,
"step": 16900
},
{
"epoch": 2.3219178082191783,
"grad_norm": 19.59113883972168,
"learning_rate": 6.8e-06,
"loss": 0.4505,
"step": 16950
},
{
"epoch": 2.328767123287671,
"grad_norm": 14.70582389831543,
"learning_rate": 6.731506849315069e-06,
"loss": 0.36,
"step": 17000
},
{
"epoch": 2.3356164383561646,
"grad_norm": 49.76693344116211,
"learning_rate": 6.6630136986301365e-06,
"loss": 0.3639,
"step": 17050
},
{
"epoch": 2.3424657534246576,
"grad_norm": 44.807167053222656,
"learning_rate": 6.594520547945205e-06,
"loss": 0.3981,
"step": 17100
},
{
"epoch": 2.3493150684931505,
"grad_norm": 9.841875076293945,
"learning_rate": 6.526027397260274e-06,
"loss": 0.3733,
"step": 17150
},
{
"epoch": 2.356164383561644,
"grad_norm": 50.942108154296875,
"learning_rate": 6.457534246575342e-06,
"loss": 0.4233,
"step": 17200
},
{
"epoch": 2.363013698630137,
"grad_norm": 9.675677299499512,
"learning_rate": 6.389041095890411e-06,
"loss": 0.4035,
"step": 17250
},
{
"epoch": 2.3698630136986303,
"grad_norm": 5.541302680969238,
"learning_rate": 6.32054794520548e-06,
"loss": 0.4161,
"step": 17300
},
{
"epoch": 2.3767123287671232,
"grad_norm": 8.581879615783691,
"learning_rate": 6.2520547945205474e-06,
"loss": 0.3986,
"step": 17350
},
{
"epoch": 2.383561643835616,
"grad_norm": 9.531363487243652,
"learning_rate": 6.183561643835616e-06,
"loss": 0.3827,
"step": 17400
},
{
"epoch": 2.3904109589041096,
"grad_norm": 7.351466178894043,
"learning_rate": 6.115068493150685e-06,
"loss": 0.3796,
"step": 17450
},
{
"epoch": 2.3972602739726026,
"grad_norm": 8.607413291931152,
"learning_rate": 6.046575342465753e-06,
"loss": 0.3983,
"step": 17500
},
{
"epoch": 2.404109589041096,
"grad_norm": 8.33619499206543,
"learning_rate": 5.978082191780822e-06,
"loss": 0.3695,
"step": 17550
},
{
"epoch": 2.410958904109589,
"grad_norm": 6.319853782653809,
"learning_rate": 5.9095890410958906e-06,
"loss": 0.3843,
"step": 17600
},
{
"epoch": 2.4178082191780823,
"grad_norm": 19.26327133178711,
"learning_rate": 5.841095890410958e-06,
"loss": 0.4723,
"step": 17650
},
{
"epoch": 2.4246575342465753,
"grad_norm": 16.118913650512695,
"learning_rate": 5.772602739726027e-06,
"loss": 0.4518,
"step": 17700
},
{
"epoch": 2.4315068493150687,
"grad_norm": 7.736336708068848,
"learning_rate": 5.704109589041096e-06,
"loss": 0.3713,
"step": 17750
},
{
"epoch": 2.4383561643835616,
"grad_norm": 11.381460189819336,
"learning_rate": 5.635616438356164e-06,
"loss": 0.4285,
"step": 17800
},
{
"epoch": 2.4452054794520546,
"grad_norm": 401.49493408203125,
"learning_rate": 5.567123287671233e-06,
"loss": 0.3637,
"step": 17850
},
{
"epoch": 2.452054794520548,
"grad_norm": 20.71985626220703,
"learning_rate": 5.4986301369863015e-06,
"loss": 0.4036,
"step": 17900
},
{
"epoch": 2.458904109589041,
"grad_norm": 14.313848495483398,
"learning_rate": 5.43013698630137e-06,
"loss": 0.4131,
"step": 17950
},
{
"epoch": 2.4657534246575343,
"grad_norm": 14.69888687133789,
"learning_rate": 5.361643835616438e-06,
"loss": 0.4162,
"step": 18000
},
{
"epoch": 2.4657534246575343,
"eval_exact_match": 82.2705771050142,
"eval_f1": 89.66159681312358,
"eval_runtime": 407.322,
"eval_samples_per_second": 25.95,
"eval_steps_per_second": 1.623,
"step": 18000
},
{
"epoch": 2.4726027397260273,
"grad_norm": 15.124028205871582,
"learning_rate": 5.2931506849315065e-06,
"loss": 0.4069,
"step": 18050
},
{
"epoch": 2.4794520547945207,
"grad_norm": 32.08819580078125,
"learning_rate": 5.224657534246575e-06,
"loss": 0.3963,
"step": 18100
},
{
"epoch": 2.4863013698630136,
"grad_norm": 10.072858810424805,
"learning_rate": 5.156164383561644e-06,
"loss": 0.3669,
"step": 18150
},
{
"epoch": 2.493150684931507,
"grad_norm": 7.13778018951416,
"learning_rate": 5.087671232876712e-06,
"loss": 0.3498,
"step": 18200
},
{
"epoch": 2.5,
"grad_norm": 22.02096176147461,
"learning_rate": 5.019178082191781e-06,
"loss": 0.4829,
"step": 18250
},
{
"epoch": 2.506849315068493,
"grad_norm": 11.320915222167969,
"learning_rate": 4.950684931506849e-06,
"loss": 0.3595,
"step": 18300
},
{
"epoch": 2.5136986301369864,
"grad_norm": 12.33502197265625,
"learning_rate": 4.8821917808219174e-06,
"loss": 0.3876,
"step": 18350
},
{
"epoch": 2.5205479452054793,
"grad_norm": 13.46191692352295,
"learning_rate": 4.813698630136986e-06,
"loss": 0.454,
"step": 18400
},
{
"epoch": 2.5273972602739727,
"grad_norm": 13.49188232421875,
"learning_rate": 4.746575342465753e-06,
"loss": 0.3855,
"step": 18450
},
{
"epoch": 2.5342465753424657,
"grad_norm": 17.15059471130371,
"learning_rate": 4.678082191780822e-06,
"loss": 0.366,
"step": 18500
},
{
"epoch": 2.541095890410959,
"grad_norm": 8.814437866210938,
"learning_rate": 4.60958904109589e-06,
"loss": 0.4034,
"step": 18550
},
{
"epoch": 2.547945205479452,
"grad_norm": 22.521684646606445,
"learning_rate": 4.541095890410959e-06,
"loss": 0.4877,
"step": 18600
},
{
"epoch": 2.5547945205479454,
"grad_norm": 12.521992683410645,
"learning_rate": 4.4726027397260276e-06,
"loss": 0.39,
"step": 18650
},
{
"epoch": 2.5616438356164384,
"grad_norm": 8.952247619628906,
"learning_rate": 4.404109589041095e-06,
"loss": 0.3797,
"step": 18700
},
{
"epoch": 2.5684931506849313,
"grad_norm": 22.96004867553711,
"learning_rate": 4.335616438356164e-06,
"loss": 0.3642,
"step": 18750
},
{
"epoch": 2.5753424657534247,
"grad_norm": 15.552525520324707,
"learning_rate": 4.267123287671233e-06,
"loss": 0.3611,
"step": 18800
},
{
"epoch": 2.5821917808219177,
"grad_norm": 6.638945579528809,
"learning_rate": 4.198630136986301e-06,
"loss": 0.3153,
"step": 18850
},
{
"epoch": 2.589041095890411,
"grad_norm": 22.522302627563477,
"learning_rate": 4.13013698630137e-06,
"loss": 0.4371,
"step": 18900
},
{
"epoch": 2.595890410958904,
"grad_norm": 9.711662292480469,
"learning_rate": 4.0616438356164385e-06,
"loss": 0.3819,
"step": 18950
},
{
"epoch": 2.602739726027397,
"grad_norm": 13.974068641662598,
"learning_rate": 3.993150684931506e-06,
"loss": 0.3498,
"step": 19000
},
{
"epoch": 2.6095890410958904,
"grad_norm": 16.810501098632812,
"learning_rate": 3.924657534246575e-06,
"loss": 0.3499,
"step": 19050
},
{
"epoch": 2.616438356164384,
"grad_norm": 12.136087417602539,
"learning_rate": 3.8561643835616435e-06,
"loss": 0.3913,
"step": 19100
},
{
"epoch": 2.6232876712328768,
"grad_norm": 20.14099884033203,
"learning_rate": 3.787671232876712e-06,
"loss": 0.3576,
"step": 19150
},
{
"epoch": 2.6301369863013697,
"grad_norm": 3.969587802886963,
"learning_rate": 3.719178082191781e-06,
"loss": 0.4321,
"step": 19200
},
{
"epoch": 2.636986301369863,
"grad_norm": 2.799744129180908,
"learning_rate": 3.6506849315068494e-06,
"loss": 0.3585,
"step": 19250
},
{
"epoch": 2.643835616438356,
"grad_norm": 48.521202087402344,
"learning_rate": 3.582191780821918e-06,
"loss": 0.406,
"step": 19300
},
{
"epoch": 2.6506849315068495,
"grad_norm": 15.20323371887207,
"learning_rate": 3.5136986301369866e-06,
"loss": 0.3498,
"step": 19350
},
{
"epoch": 2.6575342465753424,
"grad_norm": 14.380341529846191,
"learning_rate": 3.445205479452055e-06,
"loss": 0.3952,
"step": 19400
},
{
"epoch": 2.6643835616438354,
"grad_norm": 21.444190979003906,
"learning_rate": 3.3767123287671235e-06,
"loss": 0.4067,
"step": 19450
},
{
"epoch": 2.671232876712329,
"grad_norm": 12.899758338928223,
"learning_rate": 3.308219178082192e-06,
"loss": 0.4587,
"step": 19500
},
{
"epoch": 2.678082191780822,
"grad_norm": 9.967342376708984,
"learning_rate": 3.2397260273972603e-06,
"loss": 0.3656,
"step": 19550
},
{
"epoch": 2.684931506849315,
"grad_norm": 5.67423677444458,
"learning_rate": 3.171232876712329e-06,
"loss": 0.4351,
"step": 19600
},
{
"epoch": 2.691780821917808,
"grad_norm": 14.479686737060547,
"learning_rate": 3.1027397260273976e-06,
"loss": 0.3801,
"step": 19650
},
{
"epoch": 2.6986301369863015,
"grad_norm": 17.046621322631836,
"learning_rate": 3.034246575342466e-06,
"loss": 0.4302,
"step": 19700
},
{
"epoch": 2.7054794520547945,
"grad_norm": 10.318126678466797,
"learning_rate": 2.9657534246575344e-06,
"loss": 0.3594,
"step": 19750
},
{
"epoch": 2.712328767123288,
"grad_norm": 24.85907554626465,
"learning_rate": 2.897260273972603e-06,
"loss": 0.3195,
"step": 19800
},
{
"epoch": 2.719178082191781,
"grad_norm": 16.129796981811523,
"learning_rate": 2.8287671232876716e-06,
"loss": 0.3899,
"step": 19850
},
{
"epoch": 2.7260273972602738,
"grad_norm": 13.561001777648926,
"learning_rate": 2.76027397260274e-06,
"loss": 0.3348,
"step": 19900
},
{
"epoch": 2.732876712328767,
"grad_norm": 32.10982131958008,
"learning_rate": 2.6917808219178085e-06,
"loss": 0.3648,
"step": 19950
},
{
"epoch": 2.73972602739726,
"grad_norm": 49.121280670166016,
"learning_rate": 2.623287671232877e-06,
"loss": 0.4146,
"step": 20000
},
{
"epoch": 2.73972602739726,
"eval_exact_match": 82.77199621570483,
"eval_f1": 89.87472419974556,
"eval_runtime": 407.372,
"eval_samples_per_second": 25.947,
"eval_steps_per_second": 1.623,
"step": 20000
},
{
"epoch": 2.7465753424657535,
"grad_norm": 10.713053703308105,
"learning_rate": 2.5547945205479453e-06,
"loss": 0.3653,
"step": 20050
},
{
"epoch": 2.7534246575342465,
"grad_norm": 20.47135353088379,
"learning_rate": 2.486301369863014e-06,
"loss": 0.4231,
"step": 20100
},
{
"epoch": 2.76027397260274,
"grad_norm": 40.13731002807617,
"learning_rate": 2.4178082191780826e-06,
"loss": 0.3993,
"step": 20150
},
{
"epoch": 2.767123287671233,
"grad_norm": 12.861891746520996,
"learning_rate": 2.3493150684931508e-06,
"loss": 0.3912,
"step": 20200
},
{
"epoch": 2.7739726027397262,
"grad_norm": 7.921535968780518,
"learning_rate": 2.2808219178082194e-06,
"loss": 0.408,
"step": 20250
},
{
"epoch": 2.780821917808219,
"grad_norm": 26.30048179626465,
"learning_rate": 2.2123287671232876e-06,
"loss": 0.4308,
"step": 20300
},
{
"epoch": 2.787671232876712,
"grad_norm": 22.600740432739258,
"learning_rate": 2.1438356164383562e-06,
"loss": 0.363,
"step": 20350
},
{
"epoch": 2.7945205479452055,
"grad_norm": 10.7540283203125,
"learning_rate": 2.0753424657534244e-06,
"loss": 0.3956,
"step": 20400
},
{
"epoch": 2.8013698630136985,
"grad_norm": 13.891115188598633,
"learning_rate": 2.006849315068493e-06,
"loss": 0.3441,
"step": 20450
},
{
"epoch": 2.808219178082192,
"grad_norm": 14.40695571899414,
"learning_rate": 1.9383561643835617e-06,
"loss": 0.3875,
"step": 20500
},
{
"epoch": 2.815068493150685,
"grad_norm": 19.043682098388672,
"learning_rate": 1.8698630136986303e-06,
"loss": 0.4015,
"step": 20550
},
{
"epoch": 2.821917808219178,
"grad_norm": 26.111764907836914,
"learning_rate": 1.8013698630136987e-06,
"loss": 0.3877,
"step": 20600
},
{
"epoch": 2.828767123287671,
"grad_norm": 13.53073787689209,
"learning_rate": 1.7328767123287671e-06,
"loss": 0.3292,
"step": 20650
},
{
"epoch": 2.8356164383561646,
"grad_norm": 12.833224296569824,
"learning_rate": 1.6643835616438358e-06,
"loss": 0.3202,
"step": 20700
},
{
"epoch": 2.8424657534246576,
"grad_norm": 12.937023162841797,
"learning_rate": 1.5958904109589042e-06,
"loss": 0.3709,
"step": 20750
},
{
"epoch": 2.8493150684931505,
"grad_norm": 30.91938018798828,
"learning_rate": 1.5273972602739726e-06,
"loss": 0.4258,
"step": 20800
},
{
"epoch": 2.856164383561644,
"grad_norm": 50.359283447265625,
"learning_rate": 1.4589041095890412e-06,
"loss": 0.3572,
"step": 20850
},
{
"epoch": 2.863013698630137,
"grad_norm": 7.755626201629639,
"learning_rate": 1.3904109589041096e-06,
"loss": 0.3985,
"step": 20900
},
{
"epoch": 2.8698630136986303,
"grad_norm": 33.82756805419922,
"learning_rate": 1.3219178082191783e-06,
"loss": 0.3494,
"step": 20950
},
{
"epoch": 2.8767123287671232,
"grad_norm": 44.668338775634766,
"learning_rate": 1.2534246575342467e-06,
"loss": 0.4596,
"step": 21000
},
{
"epoch": 2.883561643835616,
"grad_norm": 24.665861129760742,
"learning_rate": 1.184931506849315e-06,
"loss": 0.3069,
"step": 21050
},
{
"epoch": 2.8904109589041096,
"grad_norm": 13.378949165344238,
"learning_rate": 1.1164383561643837e-06,
"loss": 0.3873,
"step": 21100
},
{
"epoch": 2.897260273972603,
"grad_norm": 8.084388732910156,
"learning_rate": 1.0479452054794521e-06,
"loss": 0.3911,
"step": 21150
},
{
"epoch": 2.904109589041096,
"grad_norm": 8.717424392700195,
"learning_rate": 9.794520547945205e-07,
"loss": 0.4187,
"step": 21200
},
{
"epoch": 2.910958904109589,
"grad_norm": 10.450674057006836,
"learning_rate": 9.123287671232876e-07,
"loss": 0.3498,
"step": 21250
},
{
"epoch": 2.9178082191780823,
"grad_norm": 8.336039543151855,
"learning_rate": 8.438356164383562e-07,
"loss": 0.3815,
"step": 21300
},
{
"epoch": 2.9246575342465753,
"grad_norm": 25.949018478393555,
"learning_rate": 7.753424657534247e-07,
"loss": 0.3061,
"step": 21350
},
{
"epoch": 2.9315068493150687,
"grad_norm": 3.685792922973633,
"learning_rate": 7.068493150684931e-07,
"loss": 0.4067,
"step": 21400
},
{
"epoch": 2.9383561643835616,
"grad_norm": 20.176740646362305,
"learning_rate": 6.383561643835616e-07,
"loss": 0.4145,
"step": 21450
},
{
"epoch": 2.9452054794520546,
"grad_norm": 6.9156270027160645,
"learning_rate": 5.698630136986301e-07,
"loss": 0.3644,
"step": 21500
},
{
"epoch": 2.952054794520548,
"grad_norm": 16.190839767456055,
"learning_rate": 5.013698630136987e-07,
"loss": 0.3783,
"step": 21550
},
{
"epoch": 2.958904109589041,
"grad_norm": 14.747089385986328,
"learning_rate": 4.3287671232876714e-07,
"loss": 0.4057,
"step": 21600
},
{
"epoch": 2.9657534246575343,
"grad_norm": 28.770313262939453,
"learning_rate": 3.643835616438356e-07,
"loss": 0.3261,
"step": 21650
},
{
"epoch": 2.9726027397260273,
"grad_norm": 6.209177017211914,
"learning_rate": 2.958904109589041e-07,
"loss": 0.4036,
"step": 21700
},
{
"epoch": 2.9794520547945207,
"grad_norm": 28.698171615600586,
"learning_rate": 2.273972602739726e-07,
"loss": 0.3678,
"step": 21750
},
{
"epoch": 2.9863013698630136,
"grad_norm": 7.778885364532471,
"learning_rate": 1.589041095890411e-07,
"loss": 0.3814,
"step": 21800
},
{
"epoch": 2.993150684931507,
"grad_norm": 28.04308319091797,
"learning_rate": 9.04109589041096e-08,
"loss": 0.3538,
"step": 21850
},
{
"epoch": 3.0,
"grad_norm": 3.1987411975860596,
"learning_rate": 2.1917808219178083e-08,
"loss": 0.3481,
"step": 21900
},
{
"epoch": 3.0,
"step": 21900,
"total_flos": 2.006738209660207e+18,
"train_loss": 0.7586759792397556,
"train_runtime": 82472.0176,
"train_samples_per_second": 3.186,
"train_steps_per_second": 0.266
}
],
"logging_steps": 50,
"max_steps": 21900,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.006738209660207e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}