{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.24512099921936,
"eval_steps": 400,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00312256049960968,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.8678,
"step": 1
},
{
"epoch": 0.0312256049960968,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.7236,
"step": 10
},
{
"epoch": 0.0624512099921936,
"grad_norm": 37.27561569213867,
"learning_rate": 8e-08,
"loss": 1.805,
"step": 20
},
{
"epoch": 0.0936768149882904,
"grad_norm": 41.067867279052734,
"learning_rate": 4.800000000000001e-07,
"loss": 1.6883,
"step": 30
},
{
"epoch": 0.1249024199843872,
"grad_norm": 43.672298431396484,
"learning_rate": 8.400000000000001e-07,
"loss": 1.6964,
"step": 40
},
{
"epoch": 0.156128024980484,
"grad_norm": 43.07832717895508,
"learning_rate": 1.2400000000000002e-06,
"loss": 1.6138,
"step": 50
},
{
"epoch": 0.1873536299765808,
"grad_norm": 37.47705841064453,
"learning_rate": 1.6400000000000002e-06,
"loss": 1.5515,
"step": 60
},
{
"epoch": 0.2185792349726776,
"grad_norm": 28.83339500427246,
"learning_rate": 2.04e-06,
"loss": 1.3408,
"step": 70
},
{
"epoch": 0.2498048399687744,
"grad_norm": 31.222503662109375,
"learning_rate": 2.4400000000000004e-06,
"loss": 1.2731,
"step": 80
},
{
"epoch": 0.2810304449648712,
"grad_norm": 23.76290512084961,
"learning_rate": 2.84e-06,
"loss": 1.2666,
"step": 90
},
{
"epoch": 0.312256049960968,
"grad_norm": 23.913143157958984,
"learning_rate": 3.2400000000000003e-06,
"loss": 1.1393,
"step": 100
},
{
"epoch": 0.3434816549570648,
"grad_norm": 24.92310905456543,
"learning_rate": 3.6400000000000003e-06,
"loss": 1.1529,
"step": 110
},
{
"epoch": 0.3747072599531616,
"grad_norm": 20.76234245300293,
"learning_rate": 4.04e-06,
"loss": 1.0776,
"step": 120
},
{
"epoch": 0.4059328649492584,
"grad_norm": 30.90992546081543,
"learning_rate": 4.440000000000001e-06,
"loss": 1.0028,
"step": 130
},
{
"epoch": 0.4371584699453552,
"grad_norm": 30.7198429107666,
"learning_rate": 4.84e-06,
"loss": 0.9807,
"step": 140
},
{
"epoch": 0.468384074941452,
"grad_norm": 22.76320457458496,
"learning_rate": 5.240000000000001e-06,
"loss": 0.992,
"step": 150
},
{
"epoch": 0.4996096799375488,
"grad_norm": 24.735822677612305,
"learning_rate": 5.64e-06,
"loss": 0.8421,
"step": 160
},
{
"epoch": 0.5308352849336456,
"grad_norm": 27.185937881469727,
"learning_rate": 6.040000000000001e-06,
"loss": 1.012,
"step": 170
},
{
"epoch": 0.5620608899297423,
"grad_norm": 16.42388916015625,
"learning_rate": 6.440000000000001e-06,
"loss": 0.7315,
"step": 180
},
{
"epoch": 0.5932864949258392,
"grad_norm": 25.17578887939453,
"learning_rate": 6.8400000000000014e-06,
"loss": 0.6995,
"step": 190
},
{
"epoch": 0.624512099921936,
"grad_norm": 19.550037384033203,
"learning_rate": 7.24e-06,
"loss": 0.8551,
"step": 200
},
{
"epoch": 0.6557377049180327,
"grad_norm": 22.346853256225586,
"learning_rate": 7.640000000000001e-06,
"loss": 0.726,
"step": 210
},
{
"epoch": 0.6869633099141296,
"grad_norm": 31.998685836791992,
"learning_rate": 8.040000000000001e-06,
"loss": 0.8553,
"step": 220
},
{
"epoch": 0.7181889149102264,
"grad_norm": 23.751340866088867,
"learning_rate": 8.44e-06,
"loss": 0.7651,
"step": 230
},
{
"epoch": 0.7494145199063232,
"grad_norm": 33.09165954589844,
"learning_rate": 8.8e-06,
"loss": 0.8395,
"step": 240
},
{
"epoch": 0.78064012490242,
"grad_norm": 35.236629486083984,
"learning_rate": 9.200000000000002e-06,
"loss": 0.7972,
"step": 250
},
{
"epoch": 0.8118657298985168,
"grad_norm": 32.98189926147461,
"learning_rate": 9.600000000000001e-06,
"loss": 0.7247,
"step": 260
},
{
"epoch": 0.8430913348946136,
"grad_norm": 26.376121520996094,
"learning_rate": 1e-05,
"loss": 0.7572,
"step": 270
},
{
"epoch": 0.8743169398907104,
"grad_norm": 40.748741149902344,
"learning_rate": 1e-05,
"loss": 0.6943,
"step": 280
},
{
"epoch": 0.9055425448868072,
"grad_norm": 55.08994674682617,
"learning_rate": 1e-05,
"loss": 0.8476,
"step": 290
},
{
"epoch": 0.936768149882904,
"grad_norm": 40.200077056884766,
"learning_rate": 1e-05,
"loss": 0.7114,
"step": 300
},
{
"epoch": 0.9679937548790007,
"grad_norm": 24.698932647705078,
"learning_rate": 1e-05,
"loss": 0.7889,
"step": 310
},
{
"epoch": 0.9992193598750976,
"grad_norm": 20.618940353393555,
"learning_rate": 1e-05,
"loss": 0.7606,
"step": 320
},
{
"epoch": 1.0304449648711944,
"grad_norm": 24.90777587890625,
"learning_rate": 1e-05,
"loss": 0.4925,
"step": 330
},
{
"epoch": 1.0616705698672912,
"grad_norm": 28.75925636291504,
"learning_rate": 1e-05,
"loss": 0.4349,
"step": 340
},
{
"epoch": 1.092896174863388,
"grad_norm": 306.2433166503906,
"learning_rate": 1e-05,
"loss": 0.4855,
"step": 350
},
{
"epoch": 1.1241217798594847,
"grad_norm": 30.801406860351562,
"learning_rate": 1e-05,
"loss": 0.4829,
"step": 360
},
{
"epoch": 1.1553473848555815,
"grad_norm": 20.874588012695312,
"learning_rate": 1e-05,
"loss": 0.4967,
"step": 370
},
{
"epoch": 1.1865729898516784,
"grad_norm": 15.966379165649414,
"learning_rate": 1e-05,
"loss": 0.4283,
"step": 380
},
{
"epoch": 1.2177985948477752,
"grad_norm": 82.65829467773438,
"learning_rate": 1e-05,
"loss": 0.4268,
"step": 390
},
{
"epoch": 1.249024199843872,
"grad_norm": 32.251461029052734,
"learning_rate": 1e-05,
"loss": 0.5603,
"step": 400
},
{
"epoch": 1.249024199843872,
"eval_accuracy": 0.7,
"eval_loss": 0.378662109375,
"eval_runtime": 0.8734,
"eval_samples_per_second": 11.449,
"eval_steps_per_second": 1.145,
"step": 400
},
{
"epoch": 1.2802498048399689,
"grad_norm": 16.248600006103516,
"learning_rate": 1e-05,
"loss": 0.4496,
"step": 410
},
{
"epoch": 1.3114754098360657,
"grad_norm": 26.644573211669922,
"learning_rate": 1e-05,
"loss": 0.45,
"step": 420
},
{
"epoch": 1.3427010148321623,
"grad_norm": 31.046363830566406,
"learning_rate": 1e-05,
"loss": 0.4094,
"step": 430
},
{
"epoch": 1.3739266198282591,
"grad_norm": 25.93197250366211,
"learning_rate": 1e-05,
"loss": 0.3649,
"step": 440
},
{
"epoch": 1.405152224824356,
"grad_norm": 19.997283935546875,
"learning_rate": 1e-05,
"loss": 0.5174,
"step": 450
},
{
"epoch": 1.4363778298204528,
"grad_norm": 20.04343032836914,
"learning_rate": 1e-05,
"loss": 0.4514,
"step": 460
},
{
"epoch": 1.4676034348165496,
"grad_norm": 18.52043914794922,
"learning_rate": 1e-05,
"loss": 0.3747,
"step": 470
},
{
"epoch": 1.4988290398126463,
"grad_norm": 74.7401123046875,
"learning_rate": 1e-05,
"loss": 0.4383,
"step": 480
},
{
"epoch": 1.530054644808743,
"grad_norm": 114.52285766601562,
"learning_rate": 1e-05,
"loss": 0.461,
"step": 490
},
{
"epoch": 1.56128024980484,
"grad_norm": 122.9369125366211,
"learning_rate": 1e-05,
"loss": 0.6252,
"step": 500
},
{
"epoch": 1.5925058548009368,
"grad_norm": 44.502681732177734,
"learning_rate": 1e-05,
"loss": 0.7419,
"step": 510
},
{
"epoch": 1.6237314597970336,
"grad_norm": 48.50262451171875,
"learning_rate": 1e-05,
"loss": 0.6756,
"step": 520
},
{
"epoch": 1.6549570647931304,
"grad_norm": 39.29521942138672,
"learning_rate": 1e-05,
"loss": 0.6941,
"step": 530
},
{
"epoch": 1.6861826697892273,
"grad_norm": 33.0960807800293,
"learning_rate": 1e-05,
"loss": 0.6813,
"step": 540
},
{
"epoch": 1.717408274785324,
"grad_norm": 25.355117797851562,
"learning_rate": 1e-05,
"loss": 0.7615,
"step": 550
},
{
"epoch": 1.748633879781421,
"grad_norm": 20.417200088500977,
"learning_rate": 1e-05,
"loss": 0.6087,
"step": 560
},
{
"epoch": 1.7798594847775175,
"grad_norm": 33.266746520996094,
"learning_rate": 1e-05,
"loss": 0.7996,
"step": 570
},
{
"epoch": 1.8110850897736144,
"grad_norm": 13.53630542755127,
"learning_rate": 1e-05,
"loss": 0.6292,
"step": 580
},
{
"epoch": 1.8423106947697112,
"grad_norm": 39.0125732421875,
"learning_rate": 1e-05,
"loss": 0.591,
"step": 590
},
{
"epoch": 1.8735362997658078,
"grad_norm": 24.019407272338867,
"learning_rate": 1e-05,
"loss": 0.6722,
"step": 600
},
{
"epoch": 1.9047619047619047,
"grad_norm": 27.3595027923584,
"learning_rate": 1e-05,
"loss": 0.5955,
"step": 610
},
{
"epoch": 1.9359875097580015,
"grad_norm": 22.498308181762695,
"learning_rate": 1e-05,
"loss": 0.5076,
"step": 620
},
{
"epoch": 1.9672131147540983,
"grad_norm": 18.389278411865234,
"learning_rate": 1e-05,
"loss": 0.6773,
"step": 630
},
{
"epoch": 1.9984387197501952,
"grad_norm": 17.433815002441406,
"learning_rate": 1e-05,
"loss": 0.5944,
"step": 640
},
{
"epoch": 2.029664324746292,
"grad_norm": 11.7727632522583,
"learning_rate": 1e-05,
"loss": 0.1184,
"step": 650
},
{
"epoch": 2.060889929742389,
"grad_norm": 44.985408782958984,
"learning_rate": 1e-05,
"loss": 0.4219,
"step": 660
},
{
"epoch": 2.0921155347384857,
"grad_norm": 27.04376220703125,
"learning_rate": 1e-05,
"loss": 0.1695,
"step": 670
},
{
"epoch": 2.1233411397345825,
"grad_norm": 29.073190689086914,
"learning_rate": 1e-05,
"loss": 0.2694,
"step": 680
},
{
"epoch": 2.1545667447306793,
"grad_norm": 30.895280838012695,
"learning_rate": 1e-05,
"loss": 0.2046,
"step": 690
},
{
"epoch": 2.185792349726776,
"grad_norm": 10.022652626037598,
"learning_rate": 1e-05,
"loss": 0.1136,
"step": 700
},
{
"epoch": 2.2170179547228726,
"grad_norm": 26.809078216552734,
"learning_rate": 1e-05,
"loss": 0.1925,
"step": 710
},
{
"epoch": 2.2482435597189694,
"grad_norm": 36.76298141479492,
"learning_rate": 1e-05,
"loss": 0.2269,
"step": 720
},
{
"epoch": 2.279469164715066,
"grad_norm": 15.884474754333496,
"learning_rate": 1e-05,
"loss": 0.2236,
"step": 730
},
{
"epoch": 2.310694769711163,
"grad_norm": 48.100120544433594,
"learning_rate": 1e-05,
"loss": 0.2063,
"step": 740
},
{
"epoch": 2.34192037470726,
"grad_norm": 7.69113302230835,
"learning_rate": 1e-05,
"loss": 0.1649,
"step": 750
},
{
"epoch": 2.3731459797033567,
"grad_norm": 37.846527099609375,
"learning_rate": 1e-05,
"loss": 0.1523,
"step": 760
},
{
"epoch": 2.4043715846994536,
"grad_norm": 17.19913101196289,
"learning_rate": 1e-05,
"loss": 0.2338,
"step": 770
},
{
"epoch": 2.4355971896955504,
"grad_norm": 42.62053298950195,
"learning_rate": 1e-05,
"loss": 0.4299,
"step": 780
},
{
"epoch": 2.4668227946916472,
"grad_norm": 14.81313705444336,
"learning_rate": 1e-05,
"loss": 0.2679,
"step": 790
},
{
"epoch": 2.498048399687744,
"grad_norm": 16.247289657592773,
"learning_rate": 1e-05,
"loss": 0.2645,
"step": 800
},
{
"epoch": 2.498048399687744,
"eval_accuracy": 0.7,
"eval_loss": 0.490234375,
"eval_runtime": 0.8679,
"eval_samples_per_second": 11.522,
"eval_steps_per_second": 1.152,
"step": 800
},
{
"epoch": 2.529274004683841,
"grad_norm": 26.519615173339844,
"learning_rate": 1e-05,
"loss": 0.2979,
"step": 810
},
{
"epoch": 2.5604996096799377,
"grad_norm": 35.26914596557617,
"learning_rate": 1e-05,
"loss": 0.2336,
"step": 820
},
{
"epoch": 2.5917252146760346,
"grad_norm": 21.243257522583008,
"learning_rate": 1e-05,
"loss": 0.2344,
"step": 830
},
{
"epoch": 2.6229508196721314,
"grad_norm": 59.89961624145508,
"learning_rate": 1e-05,
"loss": 0.2617,
"step": 840
},
{
"epoch": 2.654176424668228,
"grad_norm": 19.667827606201172,
"learning_rate": 1e-05,
"loss": 0.197,
"step": 850
},
{
"epoch": 2.6854020296643246,
"grad_norm": 27.412151336669922,
"learning_rate": 1e-05,
"loss": 0.1607,
"step": 860
},
{
"epoch": 2.7166276346604215,
"grad_norm": 10.426700592041016,
"learning_rate": 1e-05,
"loss": 0.2341,
"step": 870
},
{
"epoch": 2.7478532396565183,
"grad_norm": 25.850656509399414,
"learning_rate": 1e-05,
"loss": 0.1947,
"step": 880
},
{
"epoch": 2.779078844652615,
"grad_norm": 33.998863220214844,
"learning_rate": 1e-05,
"loss": 0.2047,
"step": 890
},
{
"epoch": 2.810304449648712,
"grad_norm": 17.702449798583984,
"learning_rate": 1e-05,
"loss": 0.238,
"step": 900
},
{
"epoch": 2.841530054644809,
"grad_norm": 3.9858572483062744,
"learning_rate": 1e-05,
"loss": 0.2327,
"step": 910
},
{
"epoch": 2.8727556596409056,
"grad_norm": 35.145668029785156,
"learning_rate": 1e-05,
"loss": 0.1995,
"step": 920
},
{
"epoch": 2.9039812646370025,
"grad_norm": 46.61024856567383,
"learning_rate": 1e-05,
"loss": 0.1658,
"step": 930
},
{
"epoch": 2.9352068696330993,
"grad_norm": 23.774057388305664,
"learning_rate": 1e-05,
"loss": 0.2819,
"step": 940
},
{
"epoch": 2.9664324746291957,
"grad_norm": 15.349525451660156,
"learning_rate": 1e-05,
"loss": 0.1376,
"step": 950
},
{
"epoch": 2.9976580796252925,
"grad_norm": 13.426594734191895,
"learning_rate": 1e-05,
"loss": 0.2882,
"step": 960
},
{
"epoch": 3.0288836846213893,
"grad_norm": 6.281402587890625,
"learning_rate": 1e-05,
"loss": 0.0894,
"step": 970
},
{
"epoch": 3.060109289617486,
"grad_norm": 2.655089855194092,
"learning_rate": 1e-05,
"loss": 0.0745,
"step": 980
},
{
"epoch": 3.091334894613583,
"grad_norm": 3.948760986328125,
"learning_rate": 1e-05,
"loss": 0.07,
"step": 990
},
{
"epoch": 3.12256049960968,
"grad_norm": 20.85759735107422,
"learning_rate": 1e-05,
"loss": 0.0706,
"step": 1000
},
{
"epoch": 3.1537861046057767,
"grad_norm": 17.535884857177734,
"learning_rate": 1e-05,
"loss": 0.2045,
"step": 1010
},
{
"epoch": 3.1850117096018735,
"grad_norm": 21.014545440673828,
"learning_rate": 1e-05,
"loss": 0.1267,
"step": 1020
},
{
"epoch": 3.2162373145979704,
"grad_norm": 6.366164207458496,
"learning_rate": 1e-05,
"loss": 0.1616,
"step": 1030
},
{
"epoch": 3.247462919594067,
"grad_norm": 20.15192222595215,
"learning_rate": 1e-05,
"loss": 0.0979,
"step": 1040
},
{
"epoch": 3.278688524590164,
"grad_norm": 0.4769607186317444,
"learning_rate": 1e-05,
"loss": 0.1403,
"step": 1050
},
{
"epoch": 3.309914129586261,
"grad_norm": 9.628069877624512,
"learning_rate": 1e-05,
"loss": 0.0501,
"step": 1060
},
{
"epoch": 3.3411397345823577,
"grad_norm": 52.10974884033203,
"learning_rate": 1e-05,
"loss": 0.1405,
"step": 1070
},
{
"epoch": 3.3723653395784545,
"grad_norm": 23.110986709594727,
"learning_rate": 1e-05,
"loss": 0.1177,
"step": 1080
},
{
"epoch": 3.4035909445745514,
"grad_norm": 19.135101318359375,
"learning_rate": 1e-05,
"loss": 0.1152,
"step": 1090
},
{
"epoch": 3.4348165495706477,
"grad_norm": 10.451769828796387,
"learning_rate": 1e-05,
"loss": 0.1005,
"step": 1100
},
{
"epoch": 3.4660421545667446,
"grad_norm": 15.533573150634766,
"learning_rate": 1e-05,
"loss": 0.0585,
"step": 1110
},
{
"epoch": 3.4972677595628414,
"grad_norm": 8.309584617614746,
"learning_rate": 1e-05,
"loss": 0.1311,
"step": 1120
},
{
"epoch": 3.5284933645589383,
"grad_norm": 3.8508894443511963,
"learning_rate": 1e-05,
"loss": 0.0971,
"step": 1130
},
{
"epoch": 3.559718969555035,
"grad_norm": 16.79774284362793,
"learning_rate": 1e-05,
"loss": 0.1476,
"step": 1140
},
{
"epoch": 3.590944574551132,
"grad_norm": 1.4701294898986816,
"learning_rate": 1e-05,
"loss": 0.1692,
"step": 1150
},
{
"epoch": 3.6221701795472288,
"grad_norm": 13.413945198059082,
"learning_rate": 1e-05,
"loss": 0.1472,
"step": 1160
},
{
"epoch": 3.6533957845433256,
"grad_norm": 27.413959503173828,
"learning_rate": 1e-05,
"loss": 0.1762,
"step": 1170
},
{
"epoch": 3.6846213895394224,
"grad_norm": 32.048553466796875,
"learning_rate": 1e-05,
"loss": 0.1562,
"step": 1180
},
{
"epoch": 3.7158469945355193,
"grad_norm": 31.58294677734375,
"learning_rate": 1e-05,
"loss": 0.1347,
"step": 1190
},
{
"epoch": 3.747072599531616,
"grad_norm": 17.824254989624023,
"learning_rate": 1e-05,
"loss": 0.1158,
"step": 1200
},
{
"epoch": 3.747072599531616,
"eval_accuracy": 0.6,
"eval_loss": 1.248046875,
"eval_runtime": 0.8648,
"eval_samples_per_second": 11.563,
"eval_steps_per_second": 1.156,
"step": 1200
},
{
"epoch": 3.7782982045277125,
"grad_norm": 46.47492599487305,
"learning_rate": 1e-05,
"loss": 0.1508,
"step": 1210
},
{
"epoch": 3.8095238095238093,
"grad_norm": 13.830499649047852,
"learning_rate": 1e-05,
"loss": 0.0936,
"step": 1220
},
{
"epoch": 3.840749414519906,
"grad_norm": 19.533958435058594,
"learning_rate": 1e-05,
"loss": 0.063,
"step": 1230
},
{
"epoch": 3.871975019516003,
"grad_norm": 43.4871940612793,
"learning_rate": 1e-05,
"loss": 0.1794,
"step": 1240
},
{
"epoch": 3.9032006245121,
"grad_norm": 17.626535415649414,
"learning_rate": 1e-05,
"loss": 0.1324,
"step": 1250
},
{
"epoch": 3.9344262295081966,
"grad_norm": 18.589401245117188,
"learning_rate": 1e-05,
"loss": 0.1517,
"step": 1260
},
{
"epoch": 3.9656518345042935,
"grad_norm": 8.064416885375977,
"learning_rate": 1e-05,
"loss": 0.108,
"step": 1270
},
{
"epoch": 3.9968774395003903,
"grad_norm": 3.094780206680298,
"learning_rate": 1e-05,
"loss": 0.1716,
"step": 1280
},
{
"epoch": 4.028103044496487,
"grad_norm": 9.602354049682617,
"learning_rate": 1e-05,
"loss": 0.0586,
"step": 1290
},
{
"epoch": 4.059328649492584,
"grad_norm": 17.06719207763672,
"learning_rate": 1e-05,
"loss": 0.0568,
"step": 1300
},
{
"epoch": 4.090554254488681,
"grad_norm": 23.80466079711914,
"learning_rate": 1e-05,
"loss": 0.0135,
"step": 1310
},
{
"epoch": 4.121779859484778,
"grad_norm": 1.7121708393096924,
"learning_rate": 1e-05,
"loss": 0.0382,
"step": 1320
},
{
"epoch": 4.1530054644808745,
"grad_norm": 0.5317578315734863,
"learning_rate": 1e-05,
"loss": 0.086,
"step": 1330
},
{
"epoch": 4.184231069476971,
"grad_norm": 46.14189147949219,
"learning_rate": 1e-05,
"loss": 0.2723,
"step": 1340
},
{
"epoch": 4.215456674473068,
"grad_norm": 14.067253112792969,
"learning_rate": 1e-05,
"loss": 0.1464,
"step": 1350
},
{
"epoch": 4.246682279469165,
"grad_norm": 5.362925052642822,
"learning_rate": 1e-05,
"loss": 0.0283,
"step": 1360
},
{
"epoch": 4.277907884465262,
"grad_norm": 6.1237874031066895,
"learning_rate": 1e-05,
"loss": 0.0601,
"step": 1370
},
{
"epoch": 4.309133489461359,
"grad_norm": 1.5201495885849,
"learning_rate": 1e-05,
"loss": 0.0139,
"step": 1380
},
{
"epoch": 4.3403590944574555,
"grad_norm": 12.532272338867188,
"learning_rate": 1e-05,
"loss": 0.0499,
"step": 1390
},
{
"epoch": 4.371584699453552,
"grad_norm": 6.465614318847656,
"learning_rate": 1e-05,
"loss": 0.1156,
"step": 1400
},
{
"epoch": 4.402810304449648,
"grad_norm": 32.81221389770508,
"learning_rate": 1e-05,
"loss": 0.0678,
"step": 1410
},
{
"epoch": 4.434035909445745,
"grad_norm": 0.24042364954948425,
"learning_rate": 1e-05,
"loss": 0.1699,
"step": 1420
},
{
"epoch": 4.465261514441842,
"grad_norm": 50.17581558227539,
"learning_rate": 1e-05,
"loss": 0.09,
"step": 1430
},
{
"epoch": 4.496487119437939,
"grad_norm": 3.710916519165039,
"learning_rate": 1e-05,
"loss": 0.1545,
"step": 1440
},
{
"epoch": 4.527712724434036,
"grad_norm": 7.061243534088135,
"learning_rate": 1e-05,
"loss": 0.2035,
"step": 1450
},
{
"epoch": 4.558938329430132,
"grad_norm": 13.808802604675293,
"learning_rate": 1e-05,
"loss": 0.0959,
"step": 1460
},
{
"epoch": 4.590163934426229,
"grad_norm": 7.443483352661133,
"learning_rate": 1e-05,
"loss": 0.0549,
"step": 1470
},
{
"epoch": 4.621389539422326,
"grad_norm": 1.2829999923706055,
"learning_rate": 1e-05,
"loss": 0.1526,
"step": 1480
},
{
"epoch": 4.652615144418423,
"grad_norm": 26.241554260253906,
"learning_rate": 1e-05,
"loss": 0.0783,
"step": 1490
},
{
"epoch": 4.68384074941452,
"grad_norm": 43.98433303833008,
"learning_rate": 1e-05,
"loss": 0.0907,
"step": 1500
},
{
"epoch": 4.715066354410617,
"grad_norm": 1.828418254852295,
"learning_rate": 1e-05,
"loss": 0.1057,
"step": 1510
},
{
"epoch": 4.7462919594067134,
"grad_norm": 19.284440994262695,
"learning_rate": 1e-05,
"loss": 0.0701,
"step": 1520
},
{
"epoch": 4.77751756440281,
"grad_norm": 18.53413963317871,
"learning_rate": 1e-05,
"loss": 0.1294,
"step": 1530
},
{
"epoch": 4.808743169398907,
"grad_norm": 2.0131237506866455,
"learning_rate": 1e-05,
"loss": 0.1589,
"step": 1540
},
{
"epoch": 4.839968774395004,
"grad_norm": 7.335690021514893,
"learning_rate": 1e-05,
"loss": 0.1426,
"step": 1550
},
{
"epoch": 4.871194379391101,
"grad_norm": 28.594770431518555,
"learning_rate": 1e-05,
"loss": 0.113,
"step": 1560
},
{
"epoch": 4.902419984387198,
"grad_norm": 4.218417644500732,
"learning_rate": 1e-05,
"loss": 0.1795,
"step": 1570
},
{
"epoch": 4.9336455893832944,
"grad_norm": 37.12601089477539,
"learning_rate": 1e-05,
"loss": 0.1044,
"step": 1580
},
{
"epoch": 4.964871194379391,
"grad_norm": 28.900989532470703,
"learning_rate": 1e-05,
"loss": 0.1998,
"step": 1590
},
{
"epoch": 4.996096799375488,
"grad_norm": 15.175968170166016,
"learning_rate": 1e-05,
"loss": 0.0844,
"step": 1600
},
{
"epoch": 4.996096799375488,
"eval_accuracy": 0.8,
"eval_loss": 1.69921875,
"eval_runtime": 0.8704,
"eval_samples_per_second": 11.49,
"eval_steps_per_second": 1.149,
"step": 1600
},
{
"epoch": 5.027322404371585,
"grad_norm": 96.47978973388672,
"learning_rate": 1e-05,
"loss": 0.1599,
"step": 1610
},
{
"epoch": 5.058548009367682,
"grad_norm": 5.848822116851807,
"learning_rate": 1e-05,
"loss": 0.0671,
"step": 1620
},
{
"epoch": 5.089773614363779,
"grad_norm": 8.831692695617676,
"learning_rate": 1e-05,
"loss": 0.047,
"step": 1630
},
{
"epoch": 5.1209992193598755,
"grad_norm": 0.2928885221481323,
"learning_rate": 1e-05,
"loss": 0.0895,
"step": 1640
},
{
"epoch": 5.152224824355972,
"grad_norm": 4.588135242462158,
"learning_rate": 1e-05,
"loss": 0.0109,
"step": 1650
},
{
"epoch": 5.183450429352069,
"grad_norm": 0.0034015802666544914,
"learning_rate": 1e-05,
"loss": 0.1328,
"step": 1660
},
{
"epoch": 5.214676034348165,
"grad_norm": 2.1403472423553467,
"learning_rate": 1e-05,
"loss": 0.1678,
"step": 1670
},
{
"epoch": 5.245901639344262,
"grad_norm": 38.722293853759766,
"learning_rate": 1e-05,
"loss": 0.0738,
"step": 1680
},
{
"epoch": 5.277127244340359,
"grad_norm": 24.931602478027344,
"learning_rate": 1e-05,
"loss": 0.0893,
"step": 1690
},
{
"epoch": 5.308352849336456,
"grad_norm": 8.807583808898926,
"learning_rate": 1e-05,
"loss": 0.0387,
"step": 1700
},
{
"epoch": 5.339578454332552,
"grad_norm": 56.61589431762695,
"learning_rate": 1e-05,
"loss": 0.1091,
"step": 1710
},
{
"epoch": 5.370804059328649,
"grad_norm": 3.9017961025238037,
"learning_rate": 1e-05,
"loss": 0.1022,
"step": 1720
},
{
"epoch": 5.402029664324746,
"grad_norm": 13.145605087280273,
"learning_rate": 1e-05,
"loss": 0.0593,
"step": 1730
},
{
"epoch": 5.433255269320843,
"grad_norm": 2.734715223312378,
"learning_rate": 1e-05,
"loss": 0.0412,
"step": 1740
},
{
"epoch": 5.46448087431694,
"grad_norm": 11.634307861328125,
"learning_rate": 1e-05,
"loss": 0.0745,
"step": 1750
},
{
"epoch": 5.495706479313037,
"grad_norm": 32.81011962890625,
"learning_rate": 1e-05,
"loss": 0.0778,
"step": 1760
},
{
"epoch": 5.526932084309133,
"grad_norm": 4.1930975914001465,
"learning_rate": 1e-05,
"loss": 0.1031,
"step": 1770
},
{
"epoch": 5.55815768930523,
"grad_norm": 1.3936034440994263,
"learning_rate": 1e-05,
"loss": 0.241,
"step": 1780
},
{
"epoch": 5.589383294301327,
"grad_norm": 31.164995193481445,
"learning_rate": 1e-05,
"loss": 0.0586,
"step": 1790
},
{
"epoch": 5.620608899297424,
"grad_norm": 2.2932653427124023,
"learning_rate": 1e-05,
"loss": 0.0132,
"step": 1800
},
{
"epoch": 5.651834504293521,
"grad_norm": 0.4385182857513428,
"learning_rate": 1e-05,
"loss": 0.1186,
"step": 1810
},
{
"epoch": 5.683060109289618,
"grad_norm": 25.183168411254883,
"learning_rate": 1e-05,
"loss": 0.0207,
"step": 1820
},
{
"epoch": 5.714285714285714,
"grad_norm": 8.401402473449707,
"learning_rate": 1e-05,
"loss": 0.0865,
"step": 1830
},
{
"epoch": 5.745511319281811,
"grad_norm": 6.048158168792725,
"learning_rate": 1e-05,
"loss": 0.1033,
"step": 1840
},
{
"epoch": 5.776736924277908,
"grad_norm": 10.991080284118652,
"learning_rate": 1e-05,
"loss": 0.1009,
"step": 1850
},
{
"epoch": 5.807962529274005,
"grad_norm": 5.008920669555664,
"learning_rate": 1e-05,
"loss": 0.1051,
"step": 1860
},
{
"epoch": 5.839188134270102,
"grad_norm": 59.1823616027832,
"learning_rate": 1e-05,
"loss": 0.1015,
"step": 1870
},
{
"epoch": 5.870413739266199,
"grad_norm": 31.044307708740234,
"learning_rate": 1e-05,
"loss": 0.0661,
"step": 1880
},
{
"epoch": 5.901639344262295,
"grad_norm": 53.48557662963867,
"learning_rate": 1e-05,
"loss": 0.0645,
"step": 1890
},
{
"epoch": 5.932864949258392,
"grad_norm": 85.64656066894531,
"learning_rate": 1e-05,
"loss": 0.0618,
"step": 1900
},
{
"epoch": 5.964090554254488,
"grad_norm": 55.22670364379883,
"learning_rate": 1e-05,
"loss": 0.1957,
"step": 1910
},
{
"epoch": 5.995316159250585,
"grad_norm": 20.682653427124023,
"learning_rate": 1e-05,
"loss": 0.0895,
"step": 1920
},
{
"epoch": 6.026541764246682,
"grad_norm": 20.45547103881836,
"learning_rate": 1e-05,
"loss": 0.0349,
"step": 1930
},
{
"epoch": 6.057767369242779,
"grad_norm": 0.7434096336364746,
"learning_rate": 1e-05,
"loss": 0.0839,
"step": 1940
},
{
"epoch": 6.0889929742388755,
"grad_norm": 3.747971534729004,
"learning_rate": 1e-05,
"loss": 0.0582,
"step": 1950
},
{
"epoch": 6.120218579234972,
"grad_norm": 12.133618354797363,
"learning_rate": 1e-05,
"loss": 0.1125,
"step": 1960
},
{
"epoch": 6.151444184231069,
"grad_norm": 0.936773955821991,
"learning_rate": 1e-05,
"loss": 0.0583,
"step": 1970
},
{
"epoch": 6.182669789227166,
"grad_norm": 50.38084411621094,
"learning_rate": 1e-05,
"loss": 0.0407,
"step": 1980
},
{
"epoch": 6.213895394223263,
"grad_norm": 26.78063201904297,
"learning_rate": 1e-05,
"loss": 0.0461,
"step": 1990
},
{
"epoch": 6.24512099921936,
"grad_norm": 0.004929454065859318,
"learning_rate": 1e-05,
"loss": 0.0454,
"step": 2000
},
{
"epoch": 6.24512099921936,
"eval_accuracy": 0.8,
"eval_loss": 0.7734375,
"eval_runtime": 0.8692,
"eval_samples_per_second": 11.505,
"eval_steps_per_second": 1.151,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 2500,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.8332508576095928e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}