{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.24512099921936,
  "eval_steps": 400,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00312256049960968,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 1.8678,
      "step": 1
    },
    {
      "epoch": 0.0312256049960968,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 1.7236,
      "step": 10
    },
    {
      "epoch": 0.0624512099921936,
      "grad_norm": 37.27561569213867,
      "learning_rate": 8e-08,
      "loss": 1.805,
      "step": 20
    },
    {
      "epoch": 0.0936768149882904,
      "grad_norm": 41.067867279052734,
      "learning_rate": 4.800000000000001e-07,
      "loss": 1.6883,
      "step": 30
    },
    {
      "epoch": 0.1249024199843872,
      "grad_norm": 43.672298431396484,
      "learning_rate": 8.400000000000001e-07,
      "loss": 1.6964,
      "step": 40
    },
    {
      "epoch": 0.156128024980484,
      "grad_norm": 43.07832717895508,
      "learning_rate": 1.2400000000000002e-06,
      "loss": 1.6138,
      "step": 50
    },
    {
      "epoch": 0.1873536299765808,
      "grad_norm": 37.47705841064453,
      "learning_rate": 1.6400000000000002e-06,
      "loss": 1.5515,
      "step": 60
    },
    {
      "epoch": 0.2185792349726776,
      "grad_norm": 28.83339500427246,
      "learning_rate": 2.04e-06,
      "loss": 1.3408,
      "step": 70
    },
    {
      "epoch": 0.2498048399687744,
      "grad_norm": 31.222503662109375,
      "learning_rate": 2.4400000000000004e-06,
      "loss": 1.2731,
      "step": 80
    },
    {
      "epoch": 0.2810304449648712,
      "grad_norm": 23.76290512084961,
      "learning_rate": 2.84e-06,
      "loss": 1.2666,
      "step": 90
    },
    {
      "epoch": 0.312256049960968,
      "grad_norm": 23.913143157958984,
      "learning_rate": 3.2400000000000003e-06,
      "loss": 1.1393,
      "step": 100
    },
    {
      "epoch": 0.3434816549570648,
      "grad_norm": 24.92310905456543,
      "learning_rate": 3.6400000000000003e-06,
      "loss": 1.1529,
      "step": 110
    },
    {
      "epoch": 0.3747072599531616,
      "grad_norm": 20.76234245300293,
      "learning_rate": 4.04e-06,
      "loss": 1.0776,
      "step": 120
    },
    {
      "epoch": 0.4059328649492584,
      "grad_norm": 30.90992546081543,
      "learning_rate": 4.440000000000001e-06,
      "loss": 1.0028,
      "step": 130
    },
    {
      "epoch": 0.4371584699453552,
      "grad_norm": 30.7198429107666,
      "learning_rate": 4.84e-06,
      "loss": 0.9807,
      "step": 140
    },
    {
      "epoch": 0.468384074941452,
      "grad_norm": 22.76320457458496,
      "learning_rate": 5.240000000000001e-06,
      "loss": 0.992,
      "step": 150
    },
    {
      "epoch": 0.4996096799375488,
      "grad_norm": 24.735822677612305,
      "learning_rate": 5.64e-06,
      "loss": 0.8421,
      "step": 160
    },
    {
      "epoch": 0.5308352849336456,
      "grad_norm": 27.185937881469727,
      "learning_rate": 6.040000000000001e-06,
      "loss": 1.012,
      "step": 170
    },
    {
      "epoch": 0.5620608899297423,
      "grad_norm": 16.42388916015625,
      "learning_rate": 6.440000000000001e-06,
      "loss": 0.7315,
      "step": 180
    },
    {
      "epoch": 0.5932864949258392,
      "grad_norm": 25.17578887939453,
      "learning_rate": 6.8400000000000014e-06,
      "loss": 0.6995,
      "step": 190
    },
    {
      "epoch": 0.624512099921936,
      "grad_norm": 19.550037384033203,
      "learning_rate": 7.24e-06,
      "loss": 0.8551,
      "step": 200
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 22.346853256225586,
      "learning_rate": 7.640000000000001e-06,
      "loss": 0.726,
      "step": 210
    },
    {
      "epoch": 0.6869633099141296,
      "grad_norm": 31.998685836791992,
      "learning_rate": 8.040000000000001e-06,
      "loss": 0.8553,
      "step": 220
    },
    {
      "epoch": 0.7181889149102264,
      "grad_norm": 23.751340866088867,
      "learning_rate": 8.44e-06,
      "loss": 0.7651,
      "step": 230
    },
    {
      "epoch": 0.7494145199063232,
      "grad_norm": 33.09165954589844,
      "learning_rate": 8.8e-06,
      "loss": 0.8395,
      "step": 240
    },
    {
      "epoch": 0.78064012490242,
      "grad_norm": 35.236629486083984,
      "learning_rate": 9.200000000000002e-06,
      "loss": 0.7972,
      "step": 250
    },
    {
      "epoch": 0.8118657298985168,
      "grad_norm": 32.98189926147461,
      "learning_rate": 9.600000000000001e-06,
      "loss": 0.7247,
      "step": 260
    },
    {
      "epoch": 0.8430913348946136,
      "grad_norm": 26.376121520996094,
      "learning_rate": 1e-05,
      "loss": 0.7572,
      "step": 270
    },
    {
      "epoch": 0.8743169398907104,
      "grad_norm": 40.748741149902344,
      "learning_rate": 1e-05,
      "loss": 0.6943,
      "step": 280
    },
    {
      "epoch": 0.9055425448868072,
      "grad_norm": 55.08994674682617,
      "learning_rate": 1e-05,
      "loss": 0.8476,
      "step": 290
    },
    {
      "epoch": 0.936768149882904,
      "grad_norm": 40.200077056884766,
      "learning_rate": 1e-05,
      "loss": 0.7114,
      "step": 300
    },
    {
      "epoch": 0.9679937548790007,
      "grad_norm": 24.698932647705078,
      "learning_rate": 1e-05,
      "loss": 0.7889,
      "step": 310
    },
    {
      "epoch": 0.9992193598750976,
      "grad_norm": 20.618940353393555,
      "learning_rate": 1e-05,
      "loss": 0.7606,
      "step": 320
    },
    {
      "epoch": 1.0304449648711944,
      "grad_norm": 24.90777587890625,
      "learning_rate": 1e-05,
      "loss": 0.4925,
      "step": 330
    },
    {
      "epoch": 1.0616705698672912,
      "grad_norm": 28.75925636291504,
      "learning_rate": 1e-05,
      "loss": 0.4349,
      "step": 340
    },
    {
      "epoch": 1.092896174863388,
      "grad_norm": 306.2433166503906,
      "learning_rate": 1e-05,
      "loss": 0.4855,
      "step": 350
    },
    {
      "epoch": 1.1241217798594847,
      "grad_norm": 30.801406860351562,
      "learning_rate": 1e-05,
      "loss": 0.4829,
      "step": 360
    },
    {
      "epoch": 1.1553473848555815,
      "grad_norm": 20.874588012695312,
      "learning_rate": 1e-05,
      "loss": 0.4967,
      "step": 370
    },
    {
      "epoch": 1.1865729898516784,
      "grad_norm": 15.966379165649414,
      "learning_rate": 1e-05,
      "loss": 0.4283,
      "step": 380
    },
    {
      "epoch": 1.2177985948477752,
      "grad_norm": 82.65829467773438,
      "learning_rate": 1e-05,
      "loss": 0.4268,
      "step": 390
    },
    {
      "epoch": 1.249024199843872,
      "grad_norm": 32.251461029052734,
      "learning_rate": 1e-05,
      "loss": 0.5603,
      "step": 400
    },
    {
      "epoch": 1.249024199843872,
      "eval_accuracy": 0.7,
      "eval_loss": 0.378662109375,
      "eval_runtime": 0.8734,
      "eval_samples_per_second": 11.449,
      "eval_steps_per_second": 1.145,
      "step": 400
    },
    {
      "epoch": 1.2802498048399689,
      "grad_norm": 16.248600006103516,
      "learning_rate": 1e-05,
      "loss": 0.4496,
      "step": 410
    },
    {
      "epoch": 1.3114754098360657,
      "grad_norm": 26.644573211669922,
      "learning_rate": 1e-05,
      "loss": 0.45,
      "step": 420
    },
    {
      "epoch": 1.3427010148321623,
      "grad_norm": 31.046363830566406,
      "learning_rate": 1e-05,
      "loss": 0.4094,
      "step": 430
    },
    {
      "epoch": 1.3739266198282591,
      "grad_norm": 25.93197250366211,
      "learning_rate": 1e-05,
      "loss": 0.3649,
      "step": 440
    },
    {
      "epoch": 1.405152224824356,
      "grad_norm": 19.997283935546875,
      "learning_rate": 1e-05,
      "loss": 0.5174,
      "step": 450
    },
    {
      "epoch": 1.4363778298204528,
      "grad_norm": 20.04343032836914,
      "learning_rate": 1e-05,
      "loss": 0.4514,
      "step": 460
    },
    {
      "epoch": 1.4676034348165496,
      "grad_norm": 18.52043914794922,
      "learning_rate": 1e-05,
      "loss": 0.3747,
      "step": 470
    },
    {
      "epoch": 1.4988290398126463,
      "grad_norm": 74.7401123046875,
      "learning_rate": 1e-05,
      "loss": 0.4383,
      "step": 480
    },
    {
      "epoch": 1.530054644808743,
      "grad_norm": 114.52285766601562,
      "learning_rate": 1e-05,
      "loss": 0.461,
      "step": 490
    },
    {
      "epoch": 1.56128024980484,
      "grad_norm": 122.9369125366211,
      "learning_rate": 1e-05,
      "loss": 0.6252,
      "step": 500
    },
    {
      "epoch": 1.5925058548009368,
      "grad_norm": 44.502681732177734,
      "learning_rate": 1e-05,
      "loss": 0.7419,
      "step": 510
    },
    {
      "epoch": 1.6237314597970336,
      "grad_norm": 48.50262451171875,
      "learning_rate": 1e-05,
      "loss": 0.6756,
      "step": 520
    },
    {
      "epoch": 1.6549570647931304,
      "grad_norm": 39.29521942138672,
      "learning_rate": 1e-05,
      "loss": 0.6941,
      "step": 530
    },
    {
      "epoch": 1.6861826697892273,
      "grad_norm": 33.0960807800293,
      "learning_rate": 1e-05,
      "loss": 0.6813,
      "step": 540
    },
    {
      "epoch": 1.717408274785324,
      "grad_norm": 25.355117797851562,
      "learning_rate": 1e-05,
      "loss": 0.7615,
      "step": 550
    },
    {
      "epoch": 1.748633879781421,
      "grad_norm": 20.417200088500977,
      "learning_rate": 1e-05,
      "loss": 0.6087,
      "step": 560
    },
    {
      "epoch": 1.7798594847775175,
      "grad_norm": 33.266746520996094,
      "learning_rate": 1e-05,
      "loss": 0.7996,
      "step": 570
    },
    {
      "epoch": 1.8110850897736144,
      "grad_norm": 13.53630542755127,
      "learning_rate": 1e-05,
      "loss": 0.6292,
      "step": 580
    },
    {
      "epoch": 1.8423106947697112,
      "grad_norm": 39.0125732421875,
      "learning_rate": 1e-05,
      "loss": 0.591,
      "step": 590
    },
    {
      "epoch": 1.8735362997658078,
      "grad_norm": 24.019407272338867,
      "learning_rate": 1e-05,
      "loss": 0.6722,
      "step": 600
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 27.3595027923584,
      "learning_rate": 1e-05,
      "loss": 0.5955,
      "step": 610
    },
    {
      "epoch": 1.9359875097580015,
      "grad_norm": 22.498308181762695,
      "learning_rate": 1e-05,
      "loss": 0.5076,
      "step": 620
    },
    {
      "epoch": 1.9672131147540983,
      "grad_norm": 18.389278411865234,
      "learning_rate": 1e-05,
      "loss": 0.6773,
      "step": 630
    },
    {
      "epoch": 1.9984387197501952,
      "grad_norm": 17.433815002441406,
      "learning_rate": 1e-05,
      "loss": 0.5944,
      "step": 640
    },
    {
      "epoch": 2.029664324746292,
      "grad_norm": 11.7727632522583,
      "learning_rate": 1e-05,
      "loss": 0.1184,
      "step": 650
    },
    {
      "epoch": 2.060889929742389,
      "grad_norm": 44.985408782958984,
      "learning_rate": 1e-05,
      "loss": 0.4219,
      "step": 660
    },
    {
      "epoch": 2.0921155347384857,
      "grad_norm": 27.04376220703125,
      "learning_rate": 1e-05,
      "loss": 0.1695,
      "step": 670
    },
    {
      "epoch": 2.1233411397345825,
      "grad_norm": 29.073190689086914,
      "learning_rate": 1e-05,
      "loss": 0.2694,
      "step": 680
    },
    {
      "epoch": 2.1545667447306793,
      "grad_norm": 30.895280838012695,
      "learning_rate": 1e-05,
      "loss": 0.2046,
      "step": 690
    },
    {
      "epoch": 2.185792349726776,
      "grad_norm": 10.022652626037598,
      "learning_rate": 1e-05,
      "loss": 0.1136,
      "step": 700
    },
    {
      "epoch": 2.2170179547228726,
      "grad_norm": 26.809078216552734,
      "learning_rate": 1e-05,
      "loss": 0.1925,
      "step": 710
    },
    {
      "epoch": 2.2482435597189694,
      "grad_norm": 36.76298141479492,
      "learning_rate": 1e-05,
      "loss": 0.2269,
      "step": 720
    },
    {
      "epoch": 2.279469164715066,
      "grad_norm": 15.884474754333496,
      "learning_rate": 1e-05,
      "loss": 0.2236,
      "step": 730
    },
    {
      "epoch": 2.310694769711163,
      "grad_norm": 48.100120544433594,
      "learning_rate": 1e-05,
      "loss": 0.2063,
      "step": 740
    },
    {
      "epoch": 2.34192037470726,
      "grad_norm": 7.69113302230835,
      "learning_rate": 1e-05,
      "loss": 0.1649,
      "step": 750
    },
    {
      "epoch": 2.3731459797033567,
      "grad_norm": 37.846527099609375,
      "learning_rate": 1e-05,
      "loss": 0.1523,
      "step": 760
    },
    {
      "epoch": 2.4043715846994536,
      "grad_norm": 17.19913101196289,
      "learning_rate": 1e-05,
      "loss": 0.2338,
      "step": 770
    },
    {
      "epoch": 2.4355971896955504,
      "grad_norm": 42.62053298950195,
      "learning_rate": 1e-05,
      "loss": 0.4299,
      "step": 780
    },
    {
      "epoch": 2.4668227946916472,
      "grad_norm": 14.81313705444336,
      "learning_rate": 1e-05,
      "loss": 0.2679,
      "step": 790
    },
    {
      "epoch": 2.498048399687744,
      "grad_norm": 16.247289657592773,
      "learning_rate": 1e-05,
      "loss": 0.2645,
      "step": 800
    },
    {
      "epoch": 2.498048399687744,
      "eval_accuracy": 0.7,
      "eval_loss": 0.490234375,
      "eval_runtime": 0.8679,
      "eval_samples_per_second": 11.522,
      "eval_steps_per_second": 1.152,
      "step": 800
    },
    {
      "epoch": 2.529274004683841,
      "grad_norm": 26.519615173339844,
      "learning_rate": 1e-05,
      "loss": 0.2979,
      "step": 810
    },
    {
      "epoch": 2.5604996096799377,
      "grad_norm": 35.26914596557617,
      "learning_rate": 1e-05,
      "loss": 0.2336,
      "step": 820
    },
    {
      "epoch": 2.5917252146760346,
      "grad_norm": 21.243257522583008,
      "learning_rate": 1e-05,
      "loss": 0.2344,
      "step": 830
    },
    {
      "epoch": 2.6229508196721314,
      "grad_norm": 59.89961624145508,
      "learning_rate": 1e-05,
      "loss": 0.2617,
      "step": 840
    },
    {
      "epoch": 2.654176424668228,
      "grad_norm": 19.667827606201172,
      "learning_rate": 1e-05,
      "loss": 0.197,
      "step": 850
    },
    {
      "epoch": 2.6854020296643246,
      "grad_norm": 27.412151336669922,
      "learning_rate": 1e-05,
      "loss": 0.1607,
      "step": 860
    },
    {
      "epoch": 2.7166276346604215,
      "grad_norm": 10.426700592041016,
      "learning_rate": 1e-05,
      "loss": 0.2341,
      "step": 870
    },
    {
      "epoch": 2.7478532396565183,
      "grad_norm": 25.850656509399414,
      "learning_rate": 1e-05,
      "loss": 0.1947,
      "step": 880
    },
    {
      "epoch": 2.779078844652615,
      "grad_norm": 33.998863220214844,
      "learning_rate": 1e-05,
      "loss": 0.2047,
      "step": 890
    },
    {
      "epoch": 2.810304449648712,
      "grad_norm": 17.702449798583984,
      "learning_rate": 1e-05,
      "loss": 0.238,
      "step": 900
    },
    {
      "epoch": 2.841530054644809,
      "grad_norm": 3.9858572483062744,
      "learning_rate": 1e-05,
      "loss": 0.2327,
      "step": 910
    },
    {
      "epoch": 2.8727556596409056,
      "grad_norm": 35.145668029785156,
      "learning_rate": 1e-05,
      "loss": 0.1995,
      "step": 920
    },
    {
      "epoch": 2.9039812646370025,
      "grad_norm": 46.61024856567383,
      "learning_rate": 1e-05,
      "loss": 0.1658,
      "step": 930
    },
    {
      "epoch": 2.9352068696330993,
      "grad_norm": 23.774057388305664,
      "learning_rate": 1e-05,
      "loss": 0.2819,
      "step": 940
    },
    {
      "epoch": 2.9664324746291957,
      "grad_norm": 15.349525451660156,
      "learning_rate": 1e-05,
      "loss": 0.1376,
      "step": 950
    },
    {
      "epoch": 2.9976580796252925,
      "grad_norm": 13.426594734191895,
      "learning_rate": 1e-05,
      "loss": 0.2882,
      "step": 960
    },
    {
      "epoch": 3.0288836846213893,
      "grad_norm": 6.281402587890625,
      "learning_rate": 1e-05,
      "loss": 0.0894,
      "step": 970
    },
    {
      "epoch": 3.060109289617486,
      "grad_norm": 2.655089855194092,
      "learning_rate": 1e-05,
      "loss": 0.0745,
      "step": 980
    },
    {
      "epoch": 3.091334894613583,
      "grad_norm": 3.948760986328125,
      "learning_rate": 1e-05,
      "loss": 0.07,
      "step": 990
    },
    {
      "epoch": 3.12256049960968,
      "grad_norm": 20.85759735107422,
      "learning_rate": 1e-05,
      "loss": 0.0706,
      "step": 1000
    },
    {
      "epoch": 3.1537861046057767,
      "grad_norm": 17.535884857177734,
      "learning_rate": 1e-05,
      "loss": 0.2045,
      "step": 1010
    },
    {
      "epoch": 3.1850117096018735,
      "grad_norm": 21.014545440673828,
      "learning_rate": 1e-05,
      "loss": 0.1267,
      "step": 1020
    },
    {
      "epoch": 3.2162373145979704,
      "grad_norm": 6.366164207458496,
      "learning_rate": 1e-05,
      "loss": 0.1616,
      "step": 1030
    },
    {
      "epoch": 3.247462919594067,
      "grad_norm": 20.15192222595215,
      "learning_rate": 1e-05,
      "loss": 0.0979,
      "step": 1040
    },
    {
      "epoch": 3.278688524590164,
      "grad_norm": 0.4769607186317444,
      "learning_rate": 1e-05,
      "loss": 0.1403,
      "step": 1050
    },
    {
      "epoch": 3.309914129586261,
      "grad_norm": 9.628069877624512,
      "learning_rate": 1e-05,
      "loss": 0.0501,
      "step": 1060
    },
    {
      "epoch": 3.3411397345823577,
      "grad_norm": 52.10974884033203,
      "learning_rate": 1e-05,
      "loss": 0.1405,
      "step": 1070
    },
    {
      "epoch": 3.3723653395784545,
      "grad_norm": 23.110986709594727,
      "learning_rate": 1e-05,
      "loss": 0.1177,
      "step": 1080
    },
    {
      "epoch": 3.4035909445745514,
      "grad_norm": 19.135101318359375,
      "learning_rate": 1e-05,
      "loss": 0.1152,
      "step": 1090
    },
    {
      "epoch": 3.4348165495706477,
      "grad_norm": 10.451769828796387,
      "learning_rate": 1e-05,
      "loss": 0.1005,
      "step": 1100
    },
    {
      "epoch": 3.4660421545667446,
      "grad_norm": 15.533573150634766,
      "learning_rate": 1e-05,
      "loss": 0.0585,
      "step": 1110
    },
    {
      "epoch": 3.4972677595628414,
      "grad_norm": 8.309584617614746,
      "learning_rate": 1e-05,
      "loss": 0.1311,
      "step": 1120
    },
    {
      "epoch": 3.5284933645589383,
      "grad_norm": 3.8508894443511963,
      "learning_rate": 1e-05,
      "loss": 0.0971,
      "step": 1130
    },
    {
      "epoch": 3.559718969555035,
      "grad_norm": 16.79774284362793,
      "learning_rate": 1e-05,
      "loss": 0.1476,
      "step": 1140
    },
    {
      "epoch": 3.590944574551132,
      "grad_norm": 1.4701294898986816,
      "learning_rate": 1e-05,
      "loss": 0.1692,
      "step": 1150
    },
    {
      "epoch": 3.6221701795472288,
      "grad_norm": 13.413945198059082,
      "learning_rate": 1e-05,
      "loss": 0.1472,
      "step": 1160
    },
    {
      "epoch": 3.6533957845433256,
      "grad_norm": 27.413959503173828,
      "learning_rate": 1e-05,
      "loss": 0.1762,
      "step": 1170
    },
    {
      "epoch": 3.6846213895394224,
      "grad_norm": 32.048553466796875,
      "learning_rate": 1e-05,
      "loss": 0.1562,
      "step": 1180
    },
    {
      "epoch": 3.7158469945355193,
      "grad_norm": 31.58294677734375,
      "learning_rate": 1e-05,
      "loss": 0.1347,
      "step": 1190
    },
    {
      "epoch": 3.747072599531616,
      "grad_norm": 17.824254989624023,
      "learning_rate": 1e-05,
      "loss": 0.1158,
      "step": 1200
    },
    {
      "epoch": 3.747072599531616,
      "eval_accuracy": 0.6,
      "eval_loss": 1.248046875,
      "eval_runtime": 0.8648,
      "eval_samples_per_second": 11.563,
      "eval_steps_per_second": 1.156,
      "step": 1200
    },
    {
      "epoch": 3.7782982045277125,
      "grad_norm": 46.47492599487305,
      "learning_rate": 1e-05,
      "loss": 0.1508,
      "step": 1210
    },
    {
      "epoch": 3.8095238095238093,
      "grad_norm": 13.830499649047852,
      "learning_rate": 1e-05,
      "loss": 0.0936,
      "step": 1220
    },
    {
      "epoch": 3.840749414519906,
      "grad_norm": 19.533958435058594,
      "learning_rate": 1e-05,
      "loss": 0.063,
      "step": 1230
    },
    {
      "epoch": 3.871975019516003,
      "grad_norm": 43.4871940612793,
      "learning_rate": 1e-05,
      "loss": 0.1794,
      "step": 1240
    },
    {
      "epoch": 3.9032006245121,
      "grad_norm": 17.626535415649414,
      "learning_rate": 1e-05,
      "loss": 0.1324,
      "step": 1250
    },
    {
      "epoch": 3.9344262295081966,
      "grad_norm": 18.589401245117188,
      "learning_rate": 1e-05,
      "loss": 0.1517,
      "step": 1260
    },
    {
      "epoch": 3.9656518345042935,
      "grad_norm": 8.064416885375977,
      "learning_rate": 1e-05,
      "loss": 0.108,
      "step": 1270
    },
    {
      "epoch": 3.9968774395003903,
      "grad_norm": 3.094780206680298,
      "learning_rate": 1e-05,
      "loss": 0.1716,
      "step": 1280
    },
    {
      "epoch": 4.028103044496487,
      "grad_norm": 9.602354049682617,
      "learning_rate": 1e-05,
      "loss": 0.0586,
      "step": 1290
    },
    {
      "epoch": 4.059328649492584,
      "grad_norm": 17.06719207763672,
      "learning_rate": 1e-05,
      "loss": 0.0568,
      "step": 1300
    },
    {
      "epoch": 4.090554254488681,
      "grad_norm": 23.80466079711914,
      "learning_rate": 1e-05,
      "loss": 0.0135,
      "step": 1310
    },
    {
      "epoch": 4.121779859484778,
      "grad_norm": 1.7121708393096924,
      "learning_rate": 1e-05,
      "loss": 0.0382,
      "step": 1320
    },
    {
      "epoch": 4.1530054644808745,
      "grad_norm": 0.5317578315734863,
      "learning_rate": 1e-05,
      "loss": 0.086,
      "step": 1330
    },
    {
      "epoch": 4.184231069476971,
      "grad_norm": 46.14189147949219,
      "learning_rate": 1e-05,
      "loss": 0.2723,
      "step": 1340
    },
    {
      "epoch": 4.215456674473068,
      "grad_norm": 14.067253112792969,
      "learning_rate": 1e-05,
      "loss": 0.1464,
      "step": 1350
    },
    {
      "epoch": 4.246682279469165,
      "grad_norm": 5.362925052642822,
      "learning_rate": 1e-05,
      "loss": 0.0283,
      "step": 1360
    },
    {
      "epoch": 4.277907884465262,
      "grad_norm": 6.1237874031066895,
      "learning_rate": 1e-05,
      "loss": 0.0601,
      "step": 1370
    },
    {
      "epoch": 4.309133489461359,
      "grad_norm": 1.5201495885849,
      "learning_rate": 1e-05,
      "loss": 0.0139,
      "step": 1380
    },
    {
      "epoch": 4.3403590944574555,
      "grad_norm": 12.532272338867188,
      "learning_rate": 1e-05,
      "loss": 0.0499,
      "step": 1390
    },
    {
      "epoch": 4.371584699453552,
      "grad_norm": 6.465614318847656,
      "learning_rate": 1e-05,
      "loss": 0.1156,
      "step": 1400
    },
    {
      "epoch": 4.402810304449648,
      "grad_norm": 32.81221389770508,
      "learning_rate": 1e-05,
      "loss": 0.0678,
      "step": 1410
    },
    {
      "epoch": 4.434035909445745,
      "grad_norm": 0.24042364954948425,
      "learning_rate": 1e-05,
      "loss": 0.1699,
      "step": 1420
    },
    {
      "epoch": 4.465261514441842,
      "grad_norm": 50.17581558227539,
      "learning_rate": 1e-05,
      "loss": 0.09,
      "step": 1430
    },
    {
      "epoch": 4.496487119437939,
      "grad_norm": 3.710916519165039,
      "learning_rate": 1e-05,
      "loss": 0.1545,
      "step": 1440
    },
    {
      "epoch": 4.527712724434036,
      "grad_norm": 7.061243534088135,
      "learning_rate": 1e-05,
      "loss": 0.2035,
      "step": 1450
    },
    {
      "epoch": 4.558938329430132,
      "grad_norm": 13.808802604675293,
      "learning_rate": 1e-05,
      "loss": 0.0959,
      "step": 1460
    },
    {
      "epoch": 4.590163934426229,
      "grad_norm": 7.443483352661133,
      "learning_rate": 1e-05,
      "loss": 0.0549,
      "step": 1470
    },
    {
      "epoch": 4.621389539422326,
      "grad_norm": 1.2829999923706055,
      "learning_rate": 1e-05,
      "loss": 0.1526,
      "step": 1480
    },
    {
      "epoch": 4.652615144418423,
      "grad_norm": 26.241554260253906,
      "learning_rate": 1e-05,
      "loss": 0.0783,
      "step": 1490
    },
    {
      "epoch": 4.68384074941452,
      "grad_norm": 43.98433303833008,
      "learning_rate": 1e-05,
      "loss": 0.0907,
      "step": 1500
    },
    {
      "epoch": 4.715066354410617,
      "grad_norm": 1.828418254852295,
      "learning_rate": 1e-05,
      "loss": 0.1057,
      "step": 1510
    },
    {
      "epoch": 4.7462919594067134,
      "grad_norm": 19.284440994262695,
      "learning_rate": 1e-05,
      "loss": 0.0701,
      "step": 1520
    },
    {
      "epoch": 4.77751756440281,
      "grad_norm": 18.53413963317871,
      "learning_rate": 1e-05,
      "loss": 0.1294,
      "step": 1530
    },
    {
      "epoch": 4.808743169398907,
      "grad_norm": 2.0131237506866455,
      "learning_rate": 1e-05,
      "loss": 0.1589,
      "step": 1540
    },
    {
      "epoch": 4.839968774395004,
      "grad_norm": 7.335690021514893,
      "learning_rate": 1e-05,
      "loss": 0.1426,
      "step": 1550
    },
    {
      "epoch": 4.871194379391101,
      "grad_norm": 28.594770431518555,
      "learning_rate": 1e-05,
      "loss": 0.113,
      "step": 1560
    },
    {
      "epoch": 4.902419984387198,
      "grad_norm": 4.218417644500732,
      "learning_rate": 1e-05,
      "loss": 0.1795,
      "step": 1570
    },
    {
      "epoch": 4.9336455893832944,
      "grad_norm": 37.12601089477539,
      "learning_rate": 1e-05,
      "loss": 0.1044,
      "step": 1580
    },
    {
      "epoch": 4.964871194379391,
      "grad_norm": 28.900989532470703,
      "learning_rate": 1e-05,
      "loss": 0.1998,
      "step": 1590
    },
    {
      "epoch": 4.996096799375488,
      "grad_norm": 15.175968170166016,
      "learning_rate": 1e-05,
      "loss": 0.0844,
      "step": 1600
    },
    {
      "epoch": 4.996096799375488,
      "eval_accuracy": 0.8,
      "eval_loss": 1.69921875,
      "eval_runtime": 0.8704,
      "eval_samples_per_second": 11.49,
      "eval_steps_per_second": 1.149,
      "step": 1600
    },
    {
      "epoch": 5.027322404371585,
      "grad_norm": 96.47978973388672,
      "learning_rate": 1e-05,
      "loss": 0.1599,
      "step": 1610
    },
    {
      "epoch": 5.058548009367682,
      "grad_norm": 5.848822116851807,
      "learning_rate": 1e-05,
      "loss": 0.0671,
      "step": 1620
    },
    {
      "epoch": 5.089773614363779,
      "grad_norm": 8.831692695617676,
      "learning_rate": 1e-05,
      "loss": 0.047,
      "step": 1630
    },
    {
      "epoch": 5.1209992193598755,
      "grad_norm": 0.2928885221481323,
      "learning_rate": 1e-05,
      "loss": 0.0895,
      "step": 1640
    },
    {
      "epoch": 5.152224824355972,
      "grad_norm": 4.588135242462158,
      "learning_rate": 1e-05,
      "loss": 0.0109,
      "step": 1650
    },
    {
      "epoch": 5.183450429352069,
      "grad_norm": 0.0034015802666544914,
      "learning_rate": 1e-05,
      "loss": 0.1328,
      "step": 1660
    },
    {
      "epoch": 5.214676034348165,
      "grad_norm": 2.1403472423553467,
      "learning_rate": 1e-05,
      "loss": 0.1678,
      "step": 1670
    },
    {
      "epoch": 5.245901639344262,
      "grad_norm": 38.722293853759766,
      "learning_rate": 1e-05,
      "loss": 0.0738,
      "step": 1680
    },
    {
      "epoch": 5.277127244340359,
      "grad_norm": 24.931602478027344,
      "learning_rate": 1e-05,
      "loss": 0.0893,
      "step": 1690
    },
    {
      "epoch": 5.308352849336456,
      "grad_norm": 8.807583808898926,
      "learning_rate": 1e-05,
      "loss": 0.0387,
      "step": 1700
    },
    {
      "epoch": 5.339578454332552,
      "grad_norm": 56.61589431762695,
      "learning_rate": 1e-05,
      "loss": 0.1091,
      "step": 1710
    },
    {
      "epoch": 5.370804059328649,
      "grad_norm": 3.9017961025238037,
      "learning_rate": 1e-05,
      "loss": 0.1022,
      "step": 1720
    },
    {
      "epoch": 5.402029664324746,
      "grad_norm": 13.145605087280273,
      "learning_rate": 1e-05,
      "loss": 0.0593,
      "step": 1730
    },
    {
      "epoch": 5.433255269320843,
      "grad_norm": 2.734715223312378,
      "learning_rate": 1e-05,
      "loss": 0.0412,
      "step": 1740
    },
    {
      "epoch": 5.46448087431694,
      "grad_norm": 11.634307861328125,
      "learning_rate": 1e-05,
      "loss": 0.0745,
      "step": 1750
    },
    {
      "epoch": 5.495706479313037,
      "grad_norm": 32.81011962890625,
      "learning_rate": 1e-05,
      "loss": 0.0778,
      "step": 1760
    },
    {
      "epoch": 5.526932084309133,
      "grad_norm": 4.1930975914001465,
      "learning_rate": 1e-05,
      "loss": 0.1031,
      "step": 1770
    },
    {
      "epoch": 5.55815768930523,
      "grad_norm": 1.3936034440994263,
      "learning_rate": 1e-05,
      "loss": 0.241,
      "step": 1780
    },
    {
      "epoch": 5.589383294301327,
      "grad_norm": 31.164995193481445,
      "learning_rate": 1e-05,
      "loss": 0.0586,
      "step": 1790
    },
    {
      "epoch": 5.620608899297424,
      "grad_norm": 2.2932653427124023,
      "learning_rate": 1e-05,
      "loss": 0.0132,
      "step": 1800
    },
    {
      "epoch": 5.651834504293521,
      "grad_norm": 0.4385182857513428,
      "learning_rate": 1e-05,
      "loss": 0.1186,
      "step": 1810
    },
    {
      "epoch": 5.683060109289618,
      "grad_norm": 25.183168411254883,
      "learning_rate": 1e-05,
      "loss": 0.0207,
      "step": 1820
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 8.401402473449707,
      "learning_rate": 1e-05,
      "loss": 0.0865,
      "step": 1830
    },
    {
      "epoch": 5.745511319281811,
      "grad_norm": 6.048158168792725,
      "learning_rate": 1e-05,
      "loss": 0.1033,
      "step": 1840
    },
    {
      "epoch": 5.776736924277908,
      "grad_norm": 10.991080284118652,
      "learning_rate": 1e-05,
      "loss": 0.1009,
      "step": 1850
    },
    {
      "epoch": 5.807962529274005,
      "grad_norm": 5.008920669555664,
      "learning_rate": 1e-05,
      "loss": 0.1051,
      "step": 1860
    },
    {
      "epoch": 5.839188134270102,
      "grad_norm": 59.1823616027832,
      "learning_rate": 1e-05,
      "loss": 0.1015,
      "step": 1870
    },
    {
      "epoch": 5.870413739266199,
      "grad_norm": 31.044307708740234,
      "learning_rate": 1e-05,
      "loss": 0.0661,
      "step": 1880
    },
    {
      "epoch": 5.901639344262295,
      "grad_norm": 53.48557662963867,
      "learning_rate": 1e-05,
      "loss": 0.0645,
      "step": 1890
    },
    {
      "epoch": 5.932864949258392,
      "grad_norm": 85.64656066894531,
      "learning_rate": 1e-05,
      "loss": 0.0618,
      "step": 1900
    },
    {
      "epoch": 5.964090554254488,
      "grad_norm": 55.22670364379883,
      "learning_rate": 1e-05,
      "loss": 0.1957,
      "step": 1910
    },
    {
      "epoch": 5.995316159250585,
      "grad_norm": 20.682653427124023,
      "learning_rate": 1e-05,
      "loss": 0.0895,
      "step": 1920
    },
    {
      "epoch": 6.026541764246682,
      "grad_norm": 20.45547103881836,
      "learning_rate": 1e-05,
      "loss": 0.0349,
      "step": 1930
    },
    {
      "epoch": 6.057767369242779,
      "grad_norm": 0.7434096336364746,
      "learning_rate": 1e-05,
      "loss": 0.0839,
      "step": 1940
    },
    {
      "epoch": 6.0889929742388755,
      "grad_norm": 3.747971534729004,
      "learning_rate": 1e-05,
      "loss": 0.0582,
      "step": 1950
    },
    {
      "epoch": 6.120218579234972,
      "grad_norm": 12.133618354797363,
      "learning_rate": 1e-05,
      "loss": 0.1125,
      "step": 1960
    },
    {
      "epoch": 6.151444184231069,
      "grad_norm": 0.936773955821991,
      "learning_rate": 1e-05,
      "loss": 0.0583,
      "step": 1970
    },
    {
      "epoch": 6.182669789227166,
      "grad_norm": 50.38084411621094,
      "learning_rate": 1e-05,
      "loss": 0.0407,
      "step": 1980
    },
    {
      "epoch": 6.213895394223263,
      "grad_norm": 26.78063201904297,
      "learning_rate": 1e-05,
      "loss": 0.0461,
      "step": 1990
    },
    {
      "epoch": 6.24512099921936,
      "grad_norm": 0.004929454065859318,
      "learning_rate": 1e-05,
      "loss": 0.0454,
      "step": 2000
    },
    {
      "epoch": 6.24512099921936,
      "eval_accuracy": 0.8,
      "eval_loss": 0.7734375,
      "eval_runtime": 0.8692,
      "eval_samples_per_second": 11.505,
      "eval_steps_per_second": 1.151,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 2500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 400,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8332508576095928e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}