{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.24512099921936, "eval_steps": 400, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00312256049960968, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 1.8678, "step": 1 }, { "epoch": 0.0312256049960968, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 1.7236, "step": 10 }, { "epoch": 0.0624512099921936, "grad_norm": 37.27561569213867, "learning_rate": 8e-08, "loss": 1.805, "step": 20 }, { "epoch": 0.0936768149882904, "grad_norm": 41.067867279052734, "learning_rate": 4.800000000000001e-07, "loss": 1.6883, "step": 30 }, { "epoch": 0.1249024199843872, "grad_norm": 43.672298431396484, "learning_rate": 8.400000000000001e-07, "loss": 1.6964, "step": 40 }, { "epoch": 0.156128024980484, "grad_norm": 43.07832717895508, "learning_rate": 1.2400000000000002e-06, "loss": 1.6138, "step": 50 }, { "epoch": 0.1873536299765808, "grad_norm": 37.47705841064453, "learning_rate": 1.6400000000000002e-06, "loss": 1.5515, "step": 60 }, { "epoch": 0.2185792349726776, "grad_norm": 28.83339500427246, "learning_rate": 2.04e-06, "loss": 1.3408, "step": 70 }, { "epoch": 0.2498048399687744, "grad_norm": 31.222503662109375, "learning_rate": 2.4400000000000004e-06, "loss": 1.2731, "step": 80 }, { "epoch": 0.2810304449648712, "grad_norm": 23.76290512084961, "learning_rate": 2.84e-06, "loss": 1.2666, "step": 90 }, { "epoch": 0.312256049960968, "grad_norm": 23.913143157958984, "learning_rate": 3.2400000000000003e-06, "loss": 1.1393, "step": 100 }, { "epoch": 0.3434816549570648, "grad_norm": 24.92310905456543, "learning_rate": 3.6400000000000003e-06, "loss": 1.1529, "step": 110 }, { "epoch": 0.3747072599531616, "grad_norm": 20.76234245300293, "learning_rate": 4.04e-06, "loss": 1.0776, "step": 120 }, { "epoch": 0.4059328649492584, "grad_norm": 30.90992546081543, "learning_rate": 4.440000000000001e-06, "loss": 1.0028, "step": 130 }, { "epoch": 0.4371584699453552, "grad_norm": 30.7198429107666, "learning_rate": 4.84e-06, "loss": 0.9807, "step": 140 }, { "epoch": 0.468384074941452, "grad_norm": 22.76320457458496, "learning_rate": 5.240000000000001e-06, "loss": 0.992, "step": 150 }, { "epoch": 0.4996096799375488, "grad_norm": 24.735822677612305, "learning_rate": 5.64e-06, "loss": 0.8421, "step": 160 }, { "epoch": 0.5308352849336456, "grad_norm": 27.185937881469727, "learning_rate": 6.040000000000001e-06, "loss": 1.012, "step": 170 }, { "epoch": 0.5620608899297423, "grad_norm": 16.42388916015625, "learning_rate": 6.440000000000001e-06, "loss": 0.7315, "step": 180 }, { "epoch": 0.5932864949258392, "grad_norm": 25.17578887939453, "learning_rate": 6.8400000000000014e-06, "loss": 0.6995, "step": 190 }, { "epoch": 0.624512099921936, "grad_norm": 19.550037384033203, "learning_rate": 7.24e-06, "loss": 0.8551, "step": 200 }, { "epoch": 0.6557377049180327, "grad_norm": 22.346853256225586, "learning_rate": 7.640000000000001e-06, "loss": 0.726, "step": 210 }, { "epoch": 0.6869633099141296, "grad_norm": 31.998685836791992, "learning_rate": 8.040000000000001e-06, "loss": 0.8553, "step": 220 }, { "epoch": 0.7181889149102264, "grad_norm": 23.751340866088867, "learning_rate": 8.44e-06, "loss": 0.7651, "step": 230 }, { "epoch": 0.7494145199063232, "grad_norm": 33.09165954589844, "learning_rate": 8.8e-06, "loss": 0.8395, "step": 240 }, { "epoch": 0.78064012490242, "grad_norm": 35.236629486083984, "learning_rate": 9.200000000000002e-06, "loss": 0.7972, "step": 250 }, { "epoch": 0.8118657298985168, "grad_norm": 32.98189926147461, "learning_rate": 9.600000000000001e-06, "loss": 0.7247, "step": 260 }, { "epoch": 0.8430913348946136, "grad_norm": 26.376121520996094, "learning_rate": 1e-05, "loss": 0.7572, "step": 270 }, { "epoch": 0.8743169398907104, "grad_norm": 40.748741149902344, "learning_rate": 1e-05, "loss": 0.6943, "step": 280 }, { "epoch": 0.9055425448868072, "grad_norm": 55.08994674682617, "learning_rate": 1e-05, "loss": 0.8476, "step": 290 }, { "epoch": 0.936768149882904, "grad_norm": 40.200077056884766, "learning_rate": 1e-05, "loss": 0.7114, "step": 300 }, { "epoch": 0.9679937548790007, "grad_norm": 24.698932647705078, "learning_rate": 1e-05, "loss": 0.7889, "step": 310 }, { "epoch": 0.9992193598750976, "grad_norm": 20.618940353393555, "learning_rate": 1e-05, "loss": 0.7606, "step": 320 }, { "epoch": 1.0304449648711944, "grad_norm": 24.90777587890625, "learning_rate": 1e-05, "loss": 0.4925, "step": 330 }, { "epoch": 1.0616705698672912, "grad_norm": 28.75925636291504, "learning_rate": 1e-05, "loss": 0.4349, "step": 340 }, { "epoch": 1.092896174863388, "grad_norm": 306.2433166503906, "learning_rate": 1e-05, "loss": 0.4855, "step": 350 }, { "epoch": 1.1241217798594847, "grad_norm": 30.801406860351562, "learning_rate": 1e-05, "loss": 0.4829, "step": 360 }, { "epoch": 1.1553473848555815, "grad_norm": 20.874588012695312, "learning_rate": 1e-05, "loss": 0.4967, "step": 370 }, { "epoch": 1.1865729898516784, "grad_norm": 15.966379165649414, "learning_rate": 1e-05, "loss": 0.4283, "step": 380 }, { "epoch": 1.2177985948477752, "grad_norm": 82.65829467773438, "learning_rate": 1e-05, "loss": 0.4268, "step": 390 }, { "epoch": 1.249024199843872, "grad_norm": 32.251461029052734, "learning_rate": 1e-05, "loss": 0.5603, "step": 400 }, { "epoch": 1.249024199843872, "eval_accuracy": 0.7, "eval_loss": 0.378662109375, "eval_runtime": 0.8734, "eval_samples_per_second": 11.449, "eval_steps_per_second": 1.145, "step": 400 }, { "epoch": 1.2802498048399689, "grad_norm": 16.248600006103516, "learning_rate": 1e-05, "loss": 0.4496, "step": 410 }, { "epoch": 1.3114754098360657, "grad_norm": 26.644573211669922, "learning_rate": 1e-05, "loss": 0.45, "step": 420 }, { "epoch": 1.3427010148321623, "grad_norm": 31.046363830566406, "learning_rate": 1e-05, "loss": 0.4094, "step": 430 }, { "epoch": 1.3739266198282591, "grad_norm": 25.93197250366211, "learning_rate": 1e-05, "loss": 0.3649, "step": 440 }, { "epoch": 1.405152224824356, "grad_norm": 19.997283935546875, "learning_rate": 1e-05, "loss": 0.5174, "step": 450 }, { "epoch": 1.4363778298204528, "grad_norm": 20.04343032836914, "learning_rate": 1e-05, "loss": 0.4514, "step": 460 }, { "epoch": 1.4676034348165496, "grad_norm": 18.52043914794922, "learning_rate": 1e-05, "loss": 0.3747, "step": 470 }, { "epoch": 1.4988290398126463, "grad_norm": 74.7401123046875, "learning_rate": 1e-05, "loss": 0.4383, "step": 480 }, { "epoch": 1.530054644808743, "grad_norm": 114.52285766601562, "learning_rate": 1e-05, "loss": 0.461, "step": 490 }, { "epoch": 1.56128024980484, "grad_norm": 122.9369125366211, "learning_rate": 1e-05, "loss": 0.6252, "step": 500 }, { "epoch": 1.5925058548009368, "grad_norm": 44.502681732177734, "learning_rate": 1e-05, "loss": 0.7419, "step": 510 }, { "epoch": 1.6237314597970336, "grad_norm": 48.50262451171875, "learning_rate": 1e-05, "loss": 0.6756, "step": 520 }, { "epoch": 1.6549570647931304, "grad_norm": 39.29521942138672, "learning_rate": 1e-05, "loss": 0.6941, "step": 530 }, { "epoch": 1.6861826697892273, "grad_norm": 33.0960807800293, "learning_rate": 1e-05, "loss": 0.6813, "step": 540 }, { "epoch": 1.717408274785324, "grad_norm": 25.355117797851562, "learning_rate": 1e-05, "loss": 0.7615, "step": 550 }, { "epoch": 1.748633879781421, "grad_norm": 20.417200088500977, "learning_rate": 1e-05, "loss": 0.6087, "step": 560 }, { "epoch": 1.7798594847775175, "grad_norm": 33.266746520996094, "learning_rate": 1e-05, "loss": 0.7996, "step": 570 }, { "epoch": 1.8110850897736144, "grad_norm": 13.53630542755127, "learning_rate": 1e-05, "loss": 0.6292, "step": 580 }, { "epoch": 1.8423106947697112, "grad_norm": 39.0125732421875, "learning_rate": 1e-05, "loss": 0.591, "step": 590 }, { "epoch": 1.8735362997658078, "grad_norm": 24.019407272338867, "learning_rate": 1e-05, "loss": 0.6722, "step": 600 }, { "epoch": 1.9047619047619047, "grad_norm": 27.3595027923584, "learning_rate": 1e-05, "loss": 0.5955, "step": 610 }, { "epoch": 1.9359875097580015, "grad_norm": 22.498308181762695, "learning_rate": 1e-05, "loss": 0.5076, "step": 620 }, { "epoch": 1.9672131147540983, "grad_norm": 18.389278411865234, "learning_rate": 1e-05, "loss": 0.6773, "step": 630 }, { "epoch": 1.9984387197501952, "grad_norm": 17.433815002441406, "learning_rate": 1e-05, "loss": 0.5944, "step": 640 }, { "epoch": 2.029664324746292, "grad_norm": 11.7727632522583, "learning_rate": 1e-05, "loss": 0.1184, "step": 650 }, { "epoch": 2.060889929742389, "grad_norm": 44.985408782958984, "learning_rate": 1e-05, "loss": 0.4219, "step": 660 }, { "epoch": 2.0921155347384857, "grad_norm": 27.04376220703125, "learning_rate": 1e-05, "loss": 0.1695, "step": 670 }, { "epoch": 2.1233411397345825, "grad_norm": 29.073190689086914, "learning_rate": 1e-05, "loss": 0.2694, "step": 680 }, { "epoch": 2.1545667447306793, "grad_norm": 30.895280838012695, "learning_rate": 1e-05, "loss": 0.2046, "step": 690 }, { "epoch": 2.185792349726776, "grad_norm": 10.022652626037598, "learning_rate": 1e-05, "loss": 0.1136, "step": 700 }, { "epoch": 2.2170179547228726, "grad_norm": 26.809078216552734, "learning_rate": 1e-05, "loss": 0.1925, "step": 710 }, { "epoch": 2.2482435597189694, "grad_norm": 36.76298141479492, "learning_rate": 1e-05, "loss": 0.2269, "step": 720 }, { "epoch": 2.279469164715066, "grad_norm": 15.884474754333496, "learning_rate": 1e-05, "loss": 0.2236, "step": 730 }, { "epoch": 2.310694769711163, "grad_norm": 48.100120544433594, "learning_rate": 1e-05, "loss": 0.2063, "step": 740 }, { "epoch": 2.34192037470726, "grad_norm": 7.69113302230835, "learning_rate": 1e-05, "loss": 0.1649, "step": 750 }, { "epoch": 2.3731459797033567, "grad_norm": 37.846527099609375, "learning_rate": 1e-05, "loss": 0.1523, "step": 760 }, { "epoch": 2.4043715846994536, "grad_norm": 17.19913101196289, "learning_rate": 1e-05, "loss": 0.2338, "step": 770 }, { "epoch": 2.4355971896955504, "grad_norm": 42.62053298950195, "learning_rate": 1e-05, "loss": 0.4299, "step": 780 }, { "epoch": 2.4668227946916472, "grad_norm": 14.81313705444336, "learning_rate": 1e-05, "loss": 0.2679, "step": 790 }, { "epoch": 2.498048399687744, "grad_norm": 16.247289657592773, "learning_rate": 1e-05, "loss": 0.2645, "step": 800 }, { "epoch": 2.498048399687744, "eval_accuracy": 0.7, "eval_loss": 0.490234375, "eval_runtime": 0.8679, "eval_samples_per_second": 11.522, "eval_steps_per_second": 1.152, "step": 800 }, { "epoch": 2.529274004683841, "grad_norm": 26.519615173339844, "learning_rate": 1e-05, "loss": 0.2979, "step": 810 }, { "epoch": 2.5604996096799377, "grad_norm": 35.26914596557617, "learning_rate": 1e-05, "loss": 0.2336, "step": 820 }, { "epoch": 2.5917252146760346, "grad_norm": 21.243257522583008, "learning_rate": 1e-05, "loss": 0.2344, "step": 830 }, { "epoch": 2.6229508196721314, "grad_norm": 59.89961624145508, "learning_rate": 1e-05, "loss": 0.2617, "step": 840 }, { "epoch": 2.654176424668228, "grad_norm": 19.667827606201172, "learning_rate": 1e-05, "loss": 0.197, "step": 850 }, { "epoch": 2.6854020296643246, "grad_norm": 27.412151336669922, "learning_rate": 1e-05, "loss": 0.1607, "step": 860 }, { "epoch": 2.7166276346604215, "grad_norm": 10.426700592041016, "learning_rate": 1e-05, "loss": 0.2341, "step": 870 }, { "epoch": 2.7478532396565183, "grad_norm": 25.850656509399414, "learning_rate": 1e-05, "loss": 0.1947, "step": 880 }, { "epoch": 2.779078844652615, "grad_norm": 33.998863220214844, "learning_rate": 1e-05, "loss": 0.2047, "step": 890 }, { "epoch": 2.810304449648712, "grad_norm": 17.702449798583984, "learning_rate": 1e-05, "loss": 0.238, "step": 900 }, { "epoch": 2.841530054644809, "grad_norm": 3.9858572483062744, "learning_rate": 1e-05, "loss": 0.2327, "step": 910 }, { "epoch": 2.8727556596409056, "grad_norm": 35.145668029785156, "learning_rate": 1e-05, "loss": 0.1995, "step": 920 }, { "epoch": 2.9039812646370025, "grad_norm": 46.61024856567383, "learning_rate": 1e-05, "loss": 0.1658, "step": 930 }, { "epoch": 2.9352068696330993, "grad_norm": 23.774057388305664, "learning_rate": 1e-05, "loss": 0.2819, "step": 940 }, { "epoch": 2.9664324746291957, "grad_norm": 15.349525451660156, "learning_rate": 1e-05, "loss": 0.1376, "step": 950 }, { "epoch": 2.9976580796252925, "grad_norm": 13.426594734191895, "learning_rate": 1e-05, "loss": 0.2882, "step": 960 }, { "epoch": 3.0288836846213893, "grad_norm": 6.281402587890625, "learning_rate": 1e-05, "loss": 0.0894, "step": 970 }, { "epoch": 3.060109289617486, "grad_norm": 2.655089855194092, "learning_rate": 1e-05, "loss": 0.0745, "step": 980 }, { "epoch": 3.091334894613583, "grad_norm": 3.948760986328125, "learning_rate": 1e-05, "loss": 0.07, "step": 990 }, { "epoch": 3.12256049960968, "grad_norm": 20.85759735107422, "learning_rate": 1e-05, "loss": 0.0706, "step": 1000 }, { "epoch": 3.1537861046057767, "grad_norm": 17.535884857177734, "learning_rate": 1e-05, "loss": 0.2045, "step": 1010 }, { "epoch": 3.1850117096018735, "grad_norm": 21.014545440673828, "learning_rate": 1e-05, "loss": 0.1267, "step": 1020 }, { "epoch": 3.2162373145979704, "grad_norm": 6.366164207458496, "learning_rate": 1e-05, "loss": 0.1616, "step": 1030 }, { "epoch": 3.247462919594067, "grad_norm": 20.15192222595215, "learning_rate": 1e-05, "loss": 0.0979, "step": 1040 }, { "epoch": 3.278688524590164, "grad_norm": 0.4769607186317444, "learning_rate": 1e-05, "loss": 0.1403, "step": 1050 }, { "epoch": 3.309914129586261, "grad_norm": 9.628069877624512, "learning_rate": 1e-05, "loss": 0.0501, "step": 1060 }, { "epoch": 3.3411397345823577, "grad_norm": 52.10974884033203, "learning_rate": 1e-05, "loss": 0.1405, "step": 1070 }, { "epoch": 3.3723653395784545, "grad_norm": 23.110986709594727, "learning_rate": 1e-05, "loss": 0.1177, "step": 1080 }, { "epoch": 3.4035909445745514, "grad_norm": 19.135101318359375, "learning_rate": 1e-05, "loss": 0.1152, "step": 1090 }, { "epoch": 3.4348165495706477, "grad_norm": 10.451769828796387, "learning_rate": 1e-05, "loss": 0.1005, "step": 1100 }, { "epoch": 3.4660421545667446, "grad_norm": 15.533573150634766, "learning_rate": 1e-05, "loss": 0.0585, "step": 1110 }, { "epoch": 3.4972677595628414, "grad_norm": 8.309584617614746, "learning_rate": 1e-05, "loss": 0.1311, "step": 1120 }, { "epoch": 3.5284933645589383, "grad_norm": 3.8508894443511963, "learning_rate": 1e-05, "loss": 0.0971, "step": 1130 }, { "epoch": 3.559718969555035, "grad_norm": 16.79774284362793, "learning_rate": 1e-05, "loss": 0.1476, "step": 1140 }, { "epoch": 3.590944574551132, "grad_norm": 1.4701294898986816, "learning_rate": 1e-05, "loss": 0.1692, "step": 1150 }, { "epoch": 3.6221701795472288, "grad_norm": 13.413945198059082, "learning_rate": 1e-05, "loss": 0.1472, "step": 1160 }, { "epoch": 3.6533957845433256, "grad_norm": 27.413959503173828, "learning_rate": 1e-05, "loss": 0.1762, "step": 1170 }, { "epoch": 3.6846213895394224, "grad_norm": 32.048553466796875, "learning_rate": 1e-05, "loss": 0.1562, "step": 1180 }, { "epoch": 3.7158469945355193, "grad_norm": 31.58294677734375, "learning_rate": 1e-05, "loss": 0.1347, "step": 1190 }, { "epoch": 3.747072599531616, "grad_norm": 17.824254989624023, "learning_rate": 1e-05, "loss": 0.1158, "step": 1200 }, { "epoch": 3.747072599531616, "eval_accuracy": 0.6, "eval_loss": 1.248046875, "eval_runtime": 0.8648, "eval_samples_per_second": 11.563, "eval_steps_per_second": 1.156, "step": 1200 }, { "epoch": 3.7782982045277125, "grad_norm": 46.47492599487305, "learning_rate": 1e-05, "loss": 0.1508, "step": 1210 }, { "epoch": 3.8095238095238093, "grad_norm": 13.830499649047852, "learning_rate": 1e-05, "loss": 0.0936, "step": 1220 }, { "epoch": 3.840749414519906, "grad_norm": 19.533958435058594, "learning_rate": 1e-05, "loss": 0.063, "step": 1230 }, { "epoch": 3.871975019516003, "grad_norm": 43.4871940612793, "learning_rate": 1e-05, "loss": 0.1794, "step": 1240 }, { "epoch": 3.9032006245121, "grad_norm": 17.626535415649414, "learning_rate": 1e-05, "loss": 0.1324, "step": 1250 }, { "epoch": 3.9344262295081966, "grad_norm": 18.589401245117188, "learning_rate": 1e-05, "loss": 0.1517, "step": 1260 }, { "epoch": 3.9656518345042935, "grad_norm": 8.064416885375977, "learning_rate": 1e-05, "loss": 0.108, "step": 1270 }, { "epoch": 3.9968774395003903, "grad_norm": 3.094780206680298, "learning_rate": 1e-05, "loss": 0.1716, "step": 1280 }, { "epoch": 4.028103044496487, "grad_norm": 9.602354049682617, "learning_rate": 1e-05, "loss": 0.0586, "step": 1290 }, { "epoch": 4.059328649492584, "grad_norm": 17.06719207763672, "learning_rate": 1e-05, "loss": 0.0568, "step": 1300 }, { "epoch": 4.090554254488681, "grad_norm": 23.80466079711914, "learning_rate": 1e-05, "loss": 0.0135, "step": 1310 }, { "epoch": 4.121779859484778, "grad_norm": 1.7121708393096924, "learning_rate": 1e-05, "loss": 0.0382, "step": 1320 }, { "epoch": 4.1530054644808745, "grad_norm": 0.5317578315734863, "learning_rate": 1e-05, "loss": 0.086, "step": 1330 }, { "epoch": 4.184231069476971, "grad_norm": 46.14189147949219, "learning_rate": 1e-05, "loss": 0.2723, "step": 1340 }, { "epoch": 4.215456674473068, "grad_norm": 14.067253112792969, "learning_rate": 1e-05, "loss": 0.1464, "step": 1350 }, { "epoch": 4.246682279469165, "grad_norm": 5.362925052642822, "learning_rate": 1e-05, "loss": 0.0283, "step": 1360 }, { "epoch": 4.277907884465262, "grad_norm": 6.1237874031066895, "learning_rate": 1e-05, "loss": 0.0601, "step": 1370 }, { "epoch": 4.309133489461359, "grad_norm": 1.5201495885849, "learning_rate": 1e-05, "loss": 0.0139, "step": 1380 }, { "epoch": 4.3403590944574555, "grad_norm": 12.532272338867188, "learning_rate": 1e-05, "loss": 0.0499, "step": 1390 }, { "epoch": 4.371584699453552, "grad_norm": 6.465614318847656, "learning_rate": 1e-05, "loss": 0.1156, "step": 1400 }, { "epoch": 4.402810304449648, "grad_norm": 32.81221389770508, "learning_rate": 1e-05, "loss": 0.0678, "step": 1410 }, { "epoch": 4.434035909445745, "grad_norm": 0.24042364954948425, "learning_rate": 1e-05, "loss": 0.1699, "step": 1420 }, { "epoch": 4.465261514441842, "grad_norm": 50.17581558227539, "learning_rate": 1e-05, "loss": 0.09, "step": 1430 }, { "epoch": 4.496487119437939, "grad_norm": 3.710916519165039, "learning_rate": 1e-05, "loss": 0.1545, "step": 1440 }, { "epoch": 4.527712724434036, "grad_norm": 7.061243534088135, "learning_rate": 1e-05, "loss": 0.2035, "step": 1450 }, { "epoch": 4.558938329430132, "grad_norm": 13.808802604675293, "learning_rate": 1e-05, "loss": 0.0959, "step": 1460 }, { "epoch": 4.590163934426229, "grad_norm": 7.443483352661133, "learning_rate": 1e-05, "loss": 0.0549, "step": 1470 }, { "epoch": 4.621389539422326, "grad_norm": 1.2829999923706055, "learning_rate": 1e-05, "loss": 0.1526, "step": 1480 }, { "epoch": 4.652615144418423, "grad_norm": 26.241554260253906, "learning_rate": 1e-05, "loss": 0.0783, "step": 1490 }, { "epoch": 4.68384074941452, "grad_norm": 43.98433303833008, "learning_rate": 1e-05, "loss": 0.0907, "step": 1500 }, { "epoch": 4.715066354410617, "grad_norm": 1.828418254852295, "learning_rate": 1e-05, "loss": 0.1057, "step": 1510 }, { "epoch": 4.7462919594067134, "grad_norm": 19.284440994262695, "learning_rate": 1e-05, "loss": 0.0701, "step": 1520 }, { "epoch": 4.77751756440281, "grad_norm": 18.53413963317871, "learning_rate": 1e-05, "loss": 0.1294, "step": 1530 }, { "epoch": 4.808743169398907, "grad_norm": 2.0131237506866455, "learning_rate": 1e-05, "loss": 0.1589, "step": 1540 }, { "epoch": 4.839968774395004, "grad_norm": 7.335690021514893, "learning_rate": 1e-05, "loss": 0.1426, "step": 1550 }, { "epoch": 4.871194379391101, "grad_norm": 28.594770431518555, "learning_rate": 1e-05, "loss": 0.113, "step": 1560 }, { "epoch": 4.902419984387198, "grad_norm": 4.218417644500732, "learning_rate": 1e-05, "loss": 0.1795, "step": 1570 }, { "epoch": 4.9336455893832944, "grad_norm": 37.12601089477539, "learning_rate": 1e-05, "loss": 0.1044, "step": 1580 }, { "epoch": 4.964871194379391, "grad_norm": 28.900989532470703, "learning_rate": 1e-05, "loss": 0.1998, "step": 1590 }, { "epoch": 4.996096799375488, "grad_norm": 15.175968170166016, "learning_rate": 1e-05, "loss": 0.0844, "step": 1600 }, { "epoch": 4.996096799375488, "eval_accuracy": 0.8, "eval_loss": 1.69921875, "eval_runtime": 0.8704, "eval_samples_per_second": 11.49, "eval_steps_per_second": 1.149, "step": 1600 }, { "epoch": 5.027322404371585, "grad_norm": 96.47978973388672, "learning_rate": 1e-05, "loss": 0.1599, "step": 1610 }, { "epoch": 5.058548009367682, "grad_norm": 5.848822116851807, "learning_rate": 1e-05, "loss": 0.0671, "step": 1620 }, { "epoch": 5.089773614363779, "grad_norm": 8.831692695617676, "learning_rate": 1e-05, "loss": 0.047, "step": 1630 }, { "epoch": 5.1209992193598755, "grad_norm": 0.2928885221481323, "learning_rate": 1e-05, "loss": 0.0895, "step": 1640 }, { "epoch": 5.152224824355972, "grad_norm": 4.588135242462158, "learning_rate": 1e-05, "loss": 0.0109, "step": 1650 }, { "epoch": 5.183450429352069, "grad_norm": 0.0034015802666544914, "learning_rate": 1e-05, "loss": 0.1328, "step": 1660 }, { "epoch": 5.214676034348165, "grad_norm": 2.1403472423553467, "learning_rate": 1e-05, "loss": 0.1678, "step": 1670 }, { "epoch": 5.245901639344262, "grad_norm": 38.722293853759766, "learning_rate": 1e-05, "loss": 0.0738, "step": 1680 }, { "epoch": 5.277127244340359, "grad_norm": 24.931602478027344, "learning_rate": 1e-05, "loss": 0.0893, "step": 1690 }, { "epoch": 5.308352849336456, "grad_norm": 8.807583808898926, "learning_rate": 1e-05, "loss": 0.0387, "step": 1700 }, { "epoch": 5.339578454332552, "grad_norm": 56.61589431762695, "learning_rate": 1e-05, "loss": 0.1091, "step": 1710 }, { "epoch": 5.370804059328649, "grad_norm": 3.9017961025238037, "learning_rate": 1e-05, "loss": 0.1022, "step": 1720 }, { "epoch": 5.402029664324746, "grad_norm": 13.145605087280273, "learning_rate": 1e-05, "loss": 0.0593, "step": 1730 }, { "epoch": 5.433255269320843, "grad_norm": 2.734715223312378, "learning_rate": 1e-05, "loss": 0.0412, "step": 1740 }, { "epoch": 5.46448087431694, "grad_norm": 11.634307861328125, "learning_rate": 1e-05, "loss": 0.0745, "step": 1750 }, { "epoch": 5.495706479313037, "grad_norm": 32.81011962890625, "learning_rate": 1e-05, "loss": 0.0778, "step": 1760 }, { "epoch": 5.526932084309133, "grad_norm": 4.1930975914001465, "learning_rate": 1e-05, "loss": 0.1031, "step": 1770 }, { "epoch": 5.55815768930523, "grad_norm": 1.3936034440994263, "learning_rate": 1e-05, "loss": 0.241, "step": 1780 }, { "epoch": 5.589383294301327, "grad_norm": 31.164995193481445, "learning_rate": 1e-05, "loss": 0.0586, "step": 1790 }, { "epoch": 5.620608899297424, "grad_norm": 2.2932653427124023, "learning_rate": 1e-05, "loss": 0.0132, "step": 1800 }, { "epoch": 5.651834504293521, "grad_norm": 0.4385182857513428, "learning_rate": 1e-05, "loss": 0.1186, "step": 1810 }, { "epoch": 5.683060109289618, "grad_norm": 25.183168411254883, "learning_rate": 1e-05, "loss": 0.0207, "step": 1820 }, { "epoch": 5.714285714285714, "grad_norm": 8.401402473449707, "learning_rate": 1e-05, "loss": 0.0865, "step": 1830 }, { "epoch": 5.745511319281811, "grad_norm": 6.048158168792725, "learning_rate": 1e-05, "loss": 0.1033, "step": 1840 }, { "epoch": 5.776736924277908, "grad_norm": 10.991080284118652, "learning_rate": 1e-05, "loss": 0.1009, "step": 1850 }, { "epoch": 5.807962529274005, "grad_norm": 5.008920669555664, "learning_rate": 1e-05, "loss": 0.1051, "step": 1860 }, { "epoch": 5.839188134270102, "grad_norm": 59.1823616027832, "learning_rate": 1e-05, "loss": 0.1015, "step": 1870 }, { "epoch": 5.870413739266199, "grad_norm": 31.044307708740234, "learning_rate": 1e-05, "loss": 0.0661, "step": 1880 }, { "epoch": 5.901639344262295, "grad_norm": 53.48557662963867, "learning_rate": 1e-05, "loss": 0.0645, "step": 1890 }, { "epoch": 5.932864949258392, "grad_norm": 85.64656066894531, "learning_rate": 1e-05, "loss": 0.0618, "step": 1900 }, { "epoch": 5.964090554254488, "grad_norm": 55.22670364379883, "learning_rate": 1e-05, "loss": 0.1957, "step": 1910 }, { "epoch": 5.995316159250585, "grad_norm": 20.682653427124023, "learning_rate": 1e-05, "loss": 0.0895, "step": 1920 }, { "epoch": 6.026541764246682, "grad_norm": 20.45547103881836, "learning_rate": 1e-05, "loss": 0.0349, "step": 1930 }, { "epoch": 6.057767369242779, "grad_norm": 0.7434096336364746, "learning_rate": 1e-05, "loss": 0.0839, "step": 1940 }, { "epoch": 6.0889929742388755, "grad_norm": 3.747971534729004, "learning_rate": 1e-05, "loss": 0.0582, "step": 1950 }, { "epoch": 6.120218579234972, "grad_norm": 12.133618354797363, "learning_rate": 1e-05, "loss": 0.1125, "step": 1960 }, { "epoch": 6.151444184231069, "grad_norm": 0.936773955821991, "learning_rate": 1e-05, "loss": 0.0583, "step": 1970 }, { "epoch": 6.182669789227166, "grad_norm": 50.38084411621094, "learning_rate": 1e-05, "loss": 0.0407, "step": 1980 }, { "epoch": 6.213895394223263, "grad_norm": 26.78063201904297, "learning_rate": 1e-05, "loss": 0.0461, "step": 1990 }, { "epoch": 6.24512099921936, "grad_norm": 0.004929454065859318, "learning_rate": 1e-05, "loss": 0.0454, "step": 2000 }, { "epoch": 6.24512099921936, "eval_accuracy": 0.8, "eval_loss": 0.7734375, "eval_runtime": 0.8692, "eval_samples_per_second": 11.505, "eval_steps_per_second": 1.151, "step": 2000 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8332508576095928e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }