|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 3959, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002525890376357666, |
|
"grad_norm": 4348443.0, |
|
"learning_rate": 4.987370548118212e-05, |
|
"loss": 11.2101, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005051780752715332, |
|
"grad_norm": 6754872.5, |
|
"learning_rate": 4.974741096236424e-05, |
|
"loss": 9.3824, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.007577671129072998, |
|
"grad_norm": 5965014.0, |
|
"learning_rate": 4.962111644354635e-05, |
|
"loss": 7.8958, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.010103561505430665, |
|
"grad_norm": 2165972.25, |
|
"learning_rate": 4.949482192472847e-05, |
|
"loss": 7.421, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01262945188178833, |
|
"grad_norm": 3444951.25, |
|
"learning_rate": 4.936852740591059e-05, |
|
"loss": 7.2076, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.015155342258145996, |
|
"grad_norm": 1671924.125, |
|
"learning_rate": 4.92422328870927e-05, |
|
"loss": 7.121, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.017681232634503663, |
|
"grad_norm": 1477403.75, |
|
"learning_rate": 4.911593836827482e-05, |
|
"loss": 6.8903, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02020712301086133, |
|
"grad_norm": 1541815.25, |
|
"learning_rate": 4.898964384945693e-05, |
|
"loss": 6.7837, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.022733013387218996, |
|
"grad_norm": 9298988.0, |
|
"learning_rate": 4.886334933063905e-05, |
|
"loss": 6.9096, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.02525890376357666, |
|
"grad_norm": 1347703.0, |
|
"learning_rate": 4.873705481182117e-05, |
|
"loss": 6.4829, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.027784794139934326, |
|
"grad_norm": 744675.0625, |
|
"learning_rate": 4.861076029300329e-05, |
|
"loss": 6.292, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.030310684516291993, |
|
"grad_norm": 1124820.25, |
|
"learning_rate": 4.84844657741854e-05, |
|
"loss": 5.9801, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.032836574892649656, |
|
"grad_norm": 926771.5, |
|
"learning_rate": 4.835817125536752e-05, |
|
"loss": 6.5963, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.035362465269007326, |
|
"grad_norm": 929847.5, |
|
"learning_rate": 4.823187673654964e-05, |
|
"loss": 6.235, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03788835564536499, |
|
"grad_norm": 2526786.75, |
|
"learning_rate": 4.810558221773176e-05, |
|
"loss": 5.9566, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04041424602172266, |
|
"grad_norm": 914504.5, |
|
"learning_rate": 4.797928769891387e-05, |
|
"loss": 5.9669, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04294013639808032, |
|
"grad_norm": 795588.8125, |
|
"learning_rate": 4.785299318009598e-05, |
|
"loss": 5.7371, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.04546602677443799, |
|
"grad_norm": 954969.25, |
|
"learning_rate": 4.77266986612781e-05, |
|
"loss": 5.5412, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.047991917150795656, |
|
"grad_norm": 964213.375, |
|
"learning_rate": 4.760040414246022e-05, |
|
"loss": 5.5661, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05051780752715332, |
|
"grad_norm": 1348911.125, |
|
"learning_rate": 4.747410962364234e-05, |
|
"loss": 5.7872, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05304369790351099, |
|
"grad_norm": 793627.5, |
|
"learning_rate": 4.7347815104824456e-05, |
|
"loss": 5.6837, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.05556958827986865, |
|
"grad_norm": 762008.625, |
|
"learning_rate": 4.722152058600657e-05, |
|
"loss": 5.3987, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.05809547865622632, |
|
"grad_norm": 724261.0625, |
|
"learning_rate": 4.709522606718869e-05, |
|
"loss": 5.4719, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.060621369032583985, |
|
"grad_norm": 1137774.75, |
|
"learning_rate": 4.6968931548370805e-05, |
|
"loss": 5.4004, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.06314725940894166, |
|
"grad_norm": 752281.3125, |
|
"learning_rate": 4.6842637029552924e-05, |
|
"loss": 5.3922, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06567314978529931, |
|
"grad_norm": 558181.375, |
|
"learning_rate": 4.6716342510735036e-05, |
|
"loss": 5.2856, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.06819904016165698, |
|
"grad_norm": 927075.5, |
|
"learning_rate": 4.6590047991917155e-05, |
|
"loss": 5.1218, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.07072493053801465, |
|
"grad_norm": 1678542.0, |
|
"learning_rate": 4.646375347309927e-05, |
|
"loss": 5.0974, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.07325082091437232, |
|
"grad_norm": 1064729.5, |
|
"learning_rate": 4.6337458954281386e-05, |
|
"loss": 5.2749, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.07577671129072998, |
|
"grad_norm": 1193670.375, |
|
"learning_rate": 4.6211164435463505e-05, |
|
"loss": 5.1134, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07830260166708765, |
|
"grad_norm": 979860.8125, |
|
"learning_rate": 4.6084869916645617e-05, |
|
"loss": 4.9763, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.08082849204344532, |
|
"grad_norm": 868298.8125, |
|
"learning_rate": 4.5958575397827735e-05, |
|
"loss": 5.0413, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.08335438241980297, |
|
"grad_norm": 1145414.625, |
|
"learning_rate": 4.5832280879009854e-05, |
|
"loss": 4.8234, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.08588027279616064, |
|
"grad_norm": 1516276.375, |
|
"learning_rate": 4.570598636019197e-05, |
|
"loss": 4.7487, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.08840616317251832, |
|
"grad_norm": 1310860.25, |
|
"learning_rate": 4.557969184137409e-05, |
|
"loss": 4.869, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09093205354887599, |
|
"grad_norm": 961273.9375, |
|
"learning_rate": 4.5453397322556204e-05, |
|
"loss": 5.2202, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.09345794392523364, |
|
"grad_norm": 888523.3125, |
|
"learning_rate": 4.532710280373832e-05, |
|
"loss": 5.1888, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.09598383430159131, |
|
"grad_norm": 1027426.9375, |
|
"learning_rate": 4.5200808284920434e-05, |
|
"loss": 4.8891, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.09850972467794898, |
|
"grad_norm": 1760436.75, |
|
"learning_rate": 4.507451376610255e-05, |
|
"loss": 4.7838, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.10103561505430664, |
|
"grad_norm": 835284.875, |
|
"learning_rate": 4.4948219247284665e-05, |
|
"loss": 4.9398, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.10356150543066431, |
|
"grad_norm": 1165127.125, |
|
"learning_rate": 4.4821924728466784e-05, |
|
"loss": 5.0785, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.10608739580702198, |
|
"grad_norm": 1061038.75, |
|
"learning_rate": 4.46956302096489e-05, |
|
"loss": 4.9144, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.10861328618337965, |
|
"grad_norm": 1058956.875, |
|
"learning_rate": 4.456933569083102e-05, |
|
"loss": 4.7658, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.1111391765597373, |
|
"grad_norm": 838523.1875, |
|
"learning_rate": 4.444304117201314e-05, |
|
"loss": 4.9675, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.11366506693609497, |
|
"grad_norm": 1297820.5, |
|
"learning_rate": 4.431674665319525e-05, |
|
"loss": 4.8902, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.11619095731245264, |
|
"grad_norm": 841045.25, |
|
"learning_rate": 4.419045213437737e-05, |
|
"loss": 4.7037, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.1187168476888103, |
|
"grad_norm": 1158120.75, |
|
"learning_rate": 4.406415761555949e-05, |
|
"loss": 4.7918, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.12124273806516797, |
|
"grad_norm": 646241.4375, |
|
"learning_rate": 4.39378630967416e-05, |
|
"loss": 4.7433, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.12376862844152564, |
|
"grad_norm": 641319.3125, |
|
"learning_rate": 4.381156857792372e-05, |
|
"loss": 4.608, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.1262945188178833, |
|
"grad_norm": 1063831.625, |
|
"learning_rate": 4.368527405910583e-05, |
|
"loss": 4.8131, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12882040919424098, |
|
"grad_norm": 1552814.75, |
|
"learning_rate": 4.355897954028795e-05, |
|
"loss": 4.6981, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.13134629957059862, |
|
"grad_norm": 915990.25, |
|
"learning_rate": 4.343268502147007e-05, |
|
"loss": 4.8095, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.1338721899469563, |
|
"grad_norm": 838948.625, |
|
"learning_rate": 4.330639050265219e-05, |
|
"loss": 4.8535, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.13639808032331396, |
|
"grad_norm": 697365.9375, |
|
"learning_rate": 4.31800959838343e-05, |
|
"loss": 5.0165, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.13892397069967163, |
|
"grad_norm": 853774.25, |
|
"learning_rate": 4.305380146501642e-05, |
|
"loss": 4.6141, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.1414498610760293, |
|
"grad_norm": 714720.875, |
|
"learning_rate": 4.292750694619854e-05, |
|
"loss": 4.6416, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.14397575145238697, |
|
"grad_norm": 953777.125, |
|
"learning_rate": 4.280121242738066e-05, |
|
"loss": 4.8611, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.14650164182874464, |
|
"grad_norm": 1033054.5, |
|
"learning_rate": 4.2674917908562776e-05, |
|
"loss": 4.2724, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1490275322051023, |
|
"grad_norm": 877196.1875, |
|
"learning_rate": 4.254862338974489e-05, |
|
"loss": 4.7521, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.15155342258145996, |
|
"grad_norm": 1127457.0, |
|
"learning_rate": 4.2422328870927e-05, |
|
"loss": 4.3, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.15407931295781763, |
|
"grad_norm": 627419.4375, |
|
"learning_rate": 4.229603435210912e-05, |
|
"loss": 4.5331, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.1566052033341753, |
|
"grad_norm": 691871.3125, |
|
"learning_rate": 4.216973983329124e-05, |
|
"loss": 4.5695, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.15913109371053297, |
|
"grad_norm": 637530.0625, |
|
"learning_rate": 4.2043445314473356e-05, |
|
"loss": 4.4994, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.16165698408689064, |
|
"grad_norm": 692771.5625, |
|
"learning_rate": 4.191715079565547e-05, |
|
"loss": 4.403, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.1641828744632483, |
|
"grad_norm": 738835.1875, |
|
"learning_rate": 4.179085627683759e-05, |
|
"loss": 4.5743, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.16670876483960595, |
|
"grad_norm": 1637430.25, |
|
"learning_rate": 4.1664561758019706e-05, |
|
"loss": 4.5377, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.16923465521596362, |
|
"grad_norm": 760962.875, |
|
"learning_rate": 4.1538267239201825e-05, |
|
"loss": 4.5168, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.1717605455923213, |
|
"grad_norm": 1142129.25, |
|
"learning_rate": 4.1411972720383937e-05, |
|
"loss": 4.2554, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.17428643596867896, |
|
"grad_norm": 614473.6875, |
|
"learning_rate": 4.1285678201566055e-05, |
|
"loss": 4.8585, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.17681232634503663, |
|
"grad_norm": 1348435.875, |
|
"learning_rate": 4.115938368274817e-05, |
|
"loss": 4.4252, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.1793382167213943, |
|
"grad_norm": 1183288.25, |
|
"learning_rate": 4.1033089163930286e-05, |
|
"loss": 4.4951, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.18186410709775197, |
|
"grad_norm": 579328.4375, |
|
"learning_rate": 4.0906794645112405e-05, |
|
"loss": 4.322, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.1843899974741096, |
|
"grad_norm": 1145040.25, |
|
"learning_rate": 4.078050012629452e-05, |
|
"loss": 4.1513, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.18691588785046728, |
|
"grad_norm": 1142186.875, |
|
"learning_rate": 4.0654205607476636e-05, |
|
"loss": 4.1724, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.18944177822682495, |
|
"grad_norm": 1073306.125, |
|
"learning_rate": 4.0527911088658754e-05, |
|
"loss": 4.2603, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.19196766860318262, |
|
"grad_norm": 856185.875, |
|
"learning_rate": 4.040161656984087e-05, |
|
"loss": 4.1743, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.1944935589795403, |
|
"grad_norm": 1259884.75, |
|
"learning_rate": 4.027532205102299e-05, |
|
"loss": 4.4978, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.19701944935589796, |
|
"grad_norm": 988211.375, |
|
"learning_rate": 4.0149027532205104e-05, |
|
"loss": 4.4756, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.19954533973225563, |
|
"grad_norm": 1129890.375, |
|
"learning_rate": 4.002273301338722e-05, |
|
"loss": 4.4766, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.20207123010861328, |
|
"grad_norm": 1127913.0, |
|
"learning_rate": 3.989643849456934e-05, |
|
"loss": 4.1696, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.20459712048497095, |
|
"grad_norm": 643195.0625, |
|
"learning_rate": 3.9770143975751453e-05, |
|
"loss": 4.2825, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.20712301086132862, |
|
"grad_norm": 585030.1875, |
|
"learning_rate": 3.9643849456933565e-05, |
|
"loss": 4.5449, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.20964890123768629, |
|
"grad_norm": 964589.9375, |
|
"learning_rate": 3.9517554938115684e-05, |
|
"loss": 4.2085, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.21217479161404396, |
|
"grad_norm": 1307139.375, |
|
"learning_rate": 3.93912604192978e-05, |
|
"loss": 4.2636, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.21470068199040163, |
|
"grad_norm": 1642451.125, |
|
"learning_rate": 3.926496590047992e-05, |
|
"loss": 4.1194, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.2172265723667593, |
|
"grad_norm": 767172.5625, |
|
"learning_rate": 3.913867138166204e-05, |
|
"loss": 3.9216, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.21975246274311694, |
|
"grad_norm": 1053997.0, |
|
"learning_rate": 3.901237686284415e-05, |
|
"loss": 4.4344, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.2222783531194746, |
|
"grad_norm": 1236466.5, |
|
"learning_rate": 3.888608234402627e-05, |
|
"loss": 4.5718, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.22480424349583228, |
|
"grad_norm": 1578047.5, |
|
"learning_rate": 3.875978782520839e-05, |
|
"loss": 4.2614, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.22733013387218995, |
|
"grad_norm": 1341923.75, |
|
"learning_rate": 3.863349330639051e-05, |
|
"loss": 4.2356, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.22985602424854762, |
|
"grad_norm": 1259278.75, |
|
"learning_rate": 3.850719878757262e-05, |
|
"loss": 4.1374, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.2323819146249053, |
|
"grad_norm": 1208085.125, |
|
"learning_rate": 3.838090426875473e-05, |
|
"loss": 4.3741, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.23490780500126293, |
|
"grad_norm": 713612.375, |
|
"learning_rate": 3.825460974993685e-05, |
|
"loss": 4.2538, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.2374336953776206, |
|
"grad_norm": 773853.25, |
|
"learning_rate": 3.812831523111897e-05, |
|
"loss": 4.0877, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.23995958575397827, |
|
"grad_norm": 1925775.125, |
|
"learning_rate": 3.800202071230109e-05, |
|
"loss": 3.8277, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.24248547613033594, |
|
"grad_norm": 814602.125, |
|
"learning_rate": 3.78757261934832e-05, |
|
"loss": 4.016, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.2450113665066936, |
|
"grad_norm": 1109952.0, |
|
"learning_rate": 3.774943167466532e-05, |
|
"loss": 3.9932, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.24753725688305128, |
|
"grad_norm": 744110.125, |
|
"learning_rate": 3.762313715584744e-05, |
|
"loss": 4.0229, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.2500631472594089, |
|
"grad_norm": 904255.75, |
|
"learning_rate": 3.749684263702956e-05, |
|
"loss": 4.4272, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.2525890376357666, |
|
"grad_norm": 829616.3125, |
|
"learning_rate": 3.7370548118211676e-05, |
|
"loss": 4.3034, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.25511492801212426, |
|
"grad_norm": 1236616.375, |
|
"learning_rate": 3.724425359939379e-05, |
|
"loss": 3.9168, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.25764081838848196, |
|
"grad_norm": 696752.75, |
|
"learning_rate": 3.711795908057591e-05, |
|
"loss": 4.2012, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.2601667087648396, |
|
"grad_norm": 680934.875, |
|
"learning_rate": 3.699166456175802e-05, |
|
"loss": 4.2596, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.26269259914119725, |
|
"grad_norm": 1334373.125, |
|
"learning_rate": 3.686537004294014e-05, |
|
"loss": 4.461, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.26521848951755495, |
|
"grad_norm": 1002765.875, |
|
"learning_rate": 3.6739075524122257e-05, |
|
"loss": 4.2859, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.2677443798939126, |
|
"grad_norm": 507567.5, |
|
"learning_rate": 3.661278100530437e-05, |
|
"loss": 3.8771, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.2702702702702703, |
|
"grad_norm": 968382.5, |
|
"learning_rate": 3.648648648648649e-05, |
|
"loss": 4.1681, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.2727961606466279, |
|
"grad_norm": 1306089.125, |
|
"learning_rate": 3.6360191967668606e-05, |
|
"loss": 4.2425, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.2753220510229856, |
|
"grad_norm": 1039757.5, |
|
"learning_rate": 3.6233897448850725e-05, |
|
"loss": 4.3264, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.27784794139934327, |
|
"grad_norm": 1464928.5, |
|
"learning_rate": 3.610760293003284e-05, |
|
"loss": 4.2827, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2803738317757009, |
|
"grad_norm": 824237.6875, |
|
"learning_rate": 3.5981308411214956e-05, |
|
"loss": 3.8312, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.2828997221520586, |
|
"grad_norm": 942513.125, |
|
"learning_rate": 3.5855013892397074e-05, |
|
"loss": 4.1363, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.28542561252841625, |
|
"grad_norm": 697307.1875, |
|
"learning_rate": 3.5728719373579186e-05, |
|
"loss": 3.9892, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.28795150290477395, |
|
"grad_norm": 1249596.5, |
|
"learning_rate": 3.5602424854761305e-05, |
|
"loss": 4.0763, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.2904773932811316, |
|
"grad_norm": 963218.0625, |
|
"learning_rate": 3.547613033594342e-05, |
|
"loss": 4.1946, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.2930032836574893, |
|
"grad_norm": 1025589.625, |
|
"learning_rate": 3.5349835817125536e-05, |
|
"loss": 4.1447, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.29552917403384693, |
|
"grad_norm": 696336.3125, |
|
"learning_rate": 3.5223541298307655e-05, |
|
"loss": 4.1982, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.2980550644102046, |
|
"grad_norm": 826398.875, |
|
"learning_rate": 3.5097246779489773e-05, |
|
"loss": 4.1935, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.30058095478656227, |
|
"grad_norm": 527454.375, |
|
"learning_rate": 3.497095226067189e-05, |
|
"loss": 3.8515, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.3031068451629199, |
|
"grad_norm": 1154428.125, |
|
"learning_rate": 3.4844657741854004e-05, |
|
"loss": 4.123, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3056327355392776, |
|
"grad_norm": 843105.25, |
|
"learning_rate": 3.471836322303612e-05, |
|
"loss": 4.0649, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.30815862591563525, |
|
"grad_norm": 677728.0, |
|
"learning_rate": 3.459206870421824e-05, |
|
"loss": 4.1858, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.31068451629199295, |
|
"grad_norm": 843633.8125, |
|
"learning_rate": 3.4465774185400354e-05, |
|
"loss": 3.9904, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.3132104066683506, |
|
"grad_norm": 1079652.625, |
|
"learning_rate": 3.433947966658247e-05, |
|
"loss": 4.1296, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.31573629704470824, |
|
"grad_norm": 1375458.875, |
|
"learning_rate": 3.4213185147764585e-05, |
|
"loss": 3.845, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.31826218742106593, |
|
"grad_norm": 1572799.75, |
|
"learning_rate": 3.40868906289467e-05, |
|
"loss": 3.8965, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.3207880777974236, |
|
"grad_norm": 626954.0625, |
|
"learning_rate": 3.396059611012882e-05, |
|
"loss": 4.2417, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.3233139681737813, |
|
"grad_norm": 778080.8125, |
|
"learning_rate": 3.383430159131094e-05, |
|
"loss": 4.0585, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.3258398585501389, |
|
"grad_norm": 1268475.375, |
|
"learning_rate": 3.370800707249305e-05, |
|
"loss": 3.5957, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.3283657489264966, |
|
"grad_norm": 983964.5, |
|
"learning_rate": 3.358171255367517e-05, |
|
"loss": 4.4113, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.33089163930285426, |
|
"grad_norm": 1136581.5, |
|
"learning_rate": 3.345541803485729e-05, |
|
"loss": 4.2545, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.3334175296792119, |
|
"grad_norm": 1313611.375, |
|
"learning_rate": 3.332912351603941e-05, |
|
"loss": 4.0466, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.3359434200555696, |
|
"grad_norm": 595656.0625, |
|
"learning_rate": 3.320282899722152e-05, |
|
"loss": 4.1121, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.33846931043192724, |
|
"grad_norm": 1134474.625, |
|
"learning_rate": 3.307653447840364e-05, |
|
"loss": 3.8602, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.34099520080828494, |
|
"grad_norm": 932185.5, |
|
"learning_rate": 3.295023995958575e-05, |
|
"loss": 4.0099, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.3435210911846426, |
|
"grad_norm": 992654.875, |
|
"learning_rate": 3.282394544076787e-05, |
|
"loss": 3.8907, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.3460469815610003, |
|
"grad_norm": 798724.1875, |
|
"learning_rate": 3.269765092194999e-05, |
|
"loss": 3.4421, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.3485728719373579, |
|
"grad_norm": 1000527.125, |
|
"learning_rate": 3.257135640313211e-05, |
|
"loss": 4.0111, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.35109876231371556, |
|
"grad_norm": 1207395.875, |
|
"learning_rate": 3.244506188431422e-05, |
|
"loss": 4.332, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.35362465269007326, |
|
"grad_norm": 736049.8125, |
|
"learning_rate": 3.231876736549634e-05, |
|
"loss": 4.0475, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3561505430664309, |
|
"grad_norm": 902288.75, |
|
"learning_rate": 3.219247284667846e-05, |
|
"loss": 3.7045, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.3586764334427886, |
|
"grad_norm": 1179507.375, |
|
"learning_rate": 3.2066178327860577e-05, |
|
"loss": 3.7423, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.36120232381914624, |
|
"grad_norm": 1319296.25, |
|
"learning_rate": 3.193988380904269e-05, |
|
"loss": 3.7602, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.36372821419550394, |
|
"grad_norm": 1040202.3125, |
|
"learning_rate": 3.181358929022481e-05, |
|
"loss": 3.871, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.3662541045718616, |
|
"grad_norm": 892262.9375, |
|
"learning_rate": 3.168729477140692e-05, |
|
"loss": 4.1026, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.3687799949482192, |
|
"grad_norm": 1401292.5, |
|
"learning_rate": 3.156100025258904e-05, |
|
"loss": 3.9609, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.3713058853245769, |
|
"grad_norm": 1242199.5, |
|
"learning_rate": 3.143470573377116e-05, |
|
"loss": 3.9826, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.37383177570093457, |
|
"grad_norm": 953926.4375, |
|
"learning_rate": 3.130841121495327e-05, |
|
"loss": 3.7857, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.37635766607729226, |
|
"grad_norm": 1005284.125, |
|
"learning_rate": 3.118211669613539e-05, |
|
"loss": 3.8462, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.3788835564536499, |
|
"grad_norm": 617377.0, |
|
"learning_rate": 3.1055822177317506e-05, |
|
"loss": 4.0298, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3814094468300076, |
|
"grad_norm": 1349709.0, |
|
"learning_rate": 3.0929527658499625e-05, |
|
"loss": 4.108, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.38393533720636525, |
|
"grad_norm": 863097.375, |
|
"learning_rate": 3.0803233139681744e-05, |
|
"loss": 3.7545, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.3864612275827229, |
|
"grad_norm": 1074415.75, |
|
"learning_rate": 3.0676938620863856e-05, |
|
"loss": 3.7492, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.3889871179590806, |
|
"grad_norm": 1269458.125, |
|
"learning_rate": 3.0550644102045975e-05, |
|
"loss": 4.0702, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.39151300833543823, |
|
"grad_norm": 623983.6875, |
|
"learning_rate": 3.042434958322809e-05, |
|
"loss": 3.7522, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.3940388987117959, |
|
"grad_norm": 847152.8125, |
|
"learning_rate": 3.029805506441021e-05, |
|
"loss": 3.9036, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.39656478908815357, |
|
"grad_norm": 1389634.5, |
|
"learning_rate": 3.017176054559232e-05, |
|
"loss": 3.7832, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.39909067946451127, |
|
"grad_norm": 983121.3125, |
|
"learning_rate": 3.004546602677444e-05, |
|
"loss": 3.6402, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.4016165698408689, |
|
"grad_norm": 1020898.8125, |
|
"learning_rate": 2.9919171507956555e-05, |
|
"loss": 3.6429, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.40414246021722655, |
|
"grad_norm": 847203.9375, |
|
"learning_rate": 2.9792876989138674e-05, |
|
"loss": 3.8984, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.40666835059358425, |
|
"grad_norm": 1133185.5, |
|
"learning_rate": 2.9666582470320793e-05, |
|
"loss": 3.7906, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.4091942409699419, |
|
"grad_norm": 1127204.625, |
|
"learning_rate": 2.9540287951502905e-05, |
|
"loss": 4.143, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.4117201313462996, |
|
"grad_norm": 1315488.625, |
|
"learning_rate": 2.9413993432685023e-05, |
|
"loss": 3.8023, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.41424602172265723, |
|
"grad_norm": 776120.5625, |
|
"learning_rate": 2.928769891386714e-05, |
|
"loss": 3.4144, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.41677191209901493, |
|
"grad_norm": 832920.9375, |
|
"learning_rate": 2.9161404395049257e-05, |
|
"loss": 3.6427, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.41929780247537257, |
|
"grad_norm": 1192992.25, |
|
"learning_rate": 2.9035109876231376e-05, |
|
"loss": 3.9277, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.4218236928517302, |
|
"grad_norm": 918210.4375, |
|
"learning_rate": 2.8908815357413488e-05, |
|
"loss": 3.6729, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.4243495832280879, |
|
"grad_norm": 924215.25, |
|
"learning_rate": 2.8782520838595607e-05, |
|
"loss": 3.7461, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.42687547360444555, |
|
"grad_norm": 715260.8125, |
|
"learning_rate": 2.8656226319777722e-05, |
|
"loss": 3.7783, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.42940136398080325, |
|
"grad_norm": 1466543.5, |
|
"learning_rate": 2.852993180095984e-05, |
|
"loss": 3.8766, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4319272543571609, |
|
"grad_norm": 669040.375, |
|
"learning_rate": 2.8403637282141953e-05, |
|
"loss": 3.9012, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.4344531447335186, |
|
"grad_norm": 2193973.75, |
|
"learning_rate": 2.8277342763324072e-05, |
|
"loss": 3.3848, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.43697903510987623, |
|
"grad_norm": 831080.625, |
|
"learning_rate": 2.815104824450619e-05, |
|
"loss": 3.6074, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.4395049254862339, |
|
"grad_norm": 960923.75, |
|
"learning_rate": 2.8024753725688306e-05, |
|
"loss": 3.6234, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.4420308158625916, |
|
"grad_norm": 1489447.5, |
|
"learning_rate": 2.7898459206870425e-05, |
|
"loss": 3.6699, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.4445567062389492, |
|
"grad_norm": 611681.375, |
|
"learning_rate": 2.7772164688052537e-05, |
|
"loss": 3.389, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.4470825966153069, |
|
"grad_norm": 2048307.875, |
|
"learning_rate": 2.7645870169234656e-05, |
|
"loss": 3.8727, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.44960848699166456, |
|
"grad_norm": 855010.75, |
|
"learning_rate": 2.7519575650416774e-05, |
|
"loss": 3.839, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.45213437736802226, |
|
"grad_norm": 954567.125, |
|
"learning_rate": 2.739328113159889e-05, |
|
"loss": 3.507, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.4546602677443799, |
|
"grad_norm": 1057372.25, |
|
"learning_rate": 2.726698661278101e-05, |
|
"loss": 4.0177, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.45718615812073754, |
|
"grad_norm": 1196149.375, |
|
"learning_rate": 2.714069209396312e-05, |
|
"loss": 3.6044, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.45971204849709524, |
|
"grad_norm": 664877.5, |
|
"learning_rate": 2.701439757514524e-05, |
|
"loss": 3.7145, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.4622379388734529, |
|
"grad_norm": 1227852.0, |
|
"learning_rate": 2.6888103056327358e-05, |
|
"loss": 3.7588, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.4647638292498106, |
|
"grad_norm": 1403871.0, |
|
"learning_rate": 2.6761808537509473e-05, |
|
"loss": 3.672, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.4672897196261682, |
|
"grad_norm": 1831221.875, |
|
"learning_rate": 2.663551401869159e-05, |
|
"loss": 3.8373, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.46981561000252586, |
|
"grad_norm": 1008128.75, |
|
"learning_rate": 2.6509219499873704e-05, |
|
"loss": 3.6977, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.47234150037888356, |
|
"grad_norm": 704558.8125, |
|
"learning_rate": 2.6382924981055823e-05, |
|
"loss": 3.9759, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.4748673907552412, |
|
"grad_norm": 1501172.875, |
|
"learning_rate": 2.6256630462237942e-05, |
|
"loss": 3.3989, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.4773932811315989, |
|
"grad_norm": 1022078.375, |
|
"learning_rate": 2.613033594342006e-05, |
|
"loss": 3.7731, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.47991917150795654, |
|
"grad_norm": 892236.75, |
|
"learning_rate": 2.6004041424602173e-05, |
|
"loss": 3.6186, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.48244506188431424, |
|
"grad_norm": 592257.875, |
|
"learning_rate": 2.5877746905784288e-05, |
|
"loss": 3.3699, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.4849709522606719, |
|
"grad_norm": 761213.9375, |
|
"learning_rate": 2.5751452386966407e-05, |
|
"loss": 3.6678, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.4874968426370295, |
|
"grad_norm": 1076466.0, |
|
"learning_rate": 2.5625157868148525e-05, |
|
"loss": 3.4251, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.4900227330133872, |
|
"grad_norm": 1370577.375, |
|
"learning_rate": 2.5498863349330644e-05, |
|
"loss": 3.6471, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.49254862338974487, |
|
"grad_norm": 682627.5625, |
|
"learning_rate": 2.5372568830512756e-05, |
|
"loss": 3.4545, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.49507451376610256, |
|
"grad_norm": 812808.125, |
|
"learning_rate": 2.524627431169487e-05, |
|
"loss": 3.64, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.4976004041424602, |
|
"grad_norm": 1190569.0, |
|
"learning_rate": 2.511997979287699e-05, |
|
"loss": 3.5669, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.5001262945188178, |
|
"grad_norm": 1053372.125, |
|
"learning_rate": 2.4993685274059106e-05, |
|
"loss": 3.561, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.5026521848951756, |
|
"grad_norm": 1154567.75, |
|
"learning_rate": 2.4867390755241225e-05, |
|
"loss": 3.7747, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5051780752715332, |
|
"grad_norm": 1105378.875, |
|
"learning_rate": 2.4741096236423343e-05, |
|
"loss": 3.5388, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5077039656478909, |
|
"grad_norm": 975523.9375, |
|
"learning_rate": 2.4614801717605455e-05, |
|
"loss": 3.7567, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.5102298560242485, |
|
"grad_norm": 1316863.5, |
|
"learning_rate": 2.4488507198787574e-05, |
|
"loss": 3.4911, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.5127557464006062, |
|
"grad_norm": 1235055.375, |
|
"learning_rate": 2.436221267996969e-05, |
|
"loss": 3.3298, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.5152816367769639, |
|
"grad_norm": 1190251.5, |
|
"learning_rate": 2.4235918161151808e-05, |
|
"loss": 3.7792, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.5178075271533216, |
|
"grad_norm": 1828954.875, |
|
"learning_rate": 2.4109623642333924e-05, |
|
"loss": 3.3407, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.5203334175296792, |
|
"grad_norm": 734125.1875, |
|
"learning_rate": 2.398332912351604e-05, |
|
"loss": 3.6435, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.5228593079060369, |
|
"grad_norm": 1114272.0, |
|
"learning_rate": 2.3857034604698158e-05, |
|
"loss": 3.603, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.5253851982823945, |
|
"grad_norm": 1464147.0, |
|
"learning_rate": 2.3730740085880273e-05, |
|
"loss": 3.6587, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.5279110886587522, |
|
"grad_norm": 2074921.0, |
|
"learning_rate": 2.3604445567062392e-05, |
|
"loss": 3.7505, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.5304369790351099, |
|
"grad_norm": 1063793.625, |
|
"learning_rate": 2.3478151048244507e-05, |
|
"loss": 3.8397, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5329628694114675, |
|
"grad_norm": 702127.6875, |
|
"learning_rate": 2.3351856529426626e-05, |
|
"loss": 3.4248, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.5354887597878252, |
|
"grad_norm": 1237508.5, |
|
"learning_rate": 2.3225562010608738e-05, |
|
"loss": 3.6119, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.5380146501641829, |
|
"grad_norm": 678885.625, |
|
"learning_rate": 2.3099267491790857e-05, |
|
"loss": 3.4523, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.5405405405405406, |
|
"grad_norm": 1506679.875, |
|
"learning_rate": 2.2972972972972976e-05, |
|
"loss": 3.4496, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.5430664309168982, |
|
"grad_norm": 1016625.6875, |
|
"learning_rate": 2.284667845415509e-05, |
|
"loss": 3.6607, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.5455923212932559, |
|
"grad_norm": 916826.3125, |
|
"learning_rate": 2.272038393533721e-05, |
|
"loss": 3.6147, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.5481182116696135, |
|
"grad_norm": 1150088.625, |
|
"learning_rate": 2.2594089416519322e-05, |
|
"loss": 3.3045, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.5506441020459713, |
|
"grad_norm": 1215418.875, |
|
"learning_rate": 2.246779489770144e-05, |
|
"loss": 3.6112, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.5531699924223289, |
|
"grad_norm": 610014.5, |
|
"learning_rate": 2.2341500378883556e-05, |
|
"loss": 3.5236, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.5556958827986865, |
|
"grad_norm": 834213.625, |
|
"learning_rate": 2.2215205860065675e-05, |
|
"loss": 3.4063, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5582217731750442, |
|
"grad_norm": 976218.8125, |
|
"learning_rate": 2.2088911341247793e-05, |
|
"loss": 3.311, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.5607476635514018, |
|
"grad_norm": 940372.0, |
|
"learning_rate": 2.196261682242991e-05, |
|
"loss": 3.6945, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.5632735539277596, |
|
"grad_norm": 903718.9375, |
|
"learning_rate": 2.1836322303612024e-05, |
|
"loss": 3.3955, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.5657994443041172, |
|
"grad_norm": 791363.5, |
|
"learning_rate": 2.171002778479414e-05, |
|
"loss": 3.5137, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.5683253346804749, |
|
"grad_norm": 958748.625, |
|
"learning_rate": 2.158373326597626e-05, |
|
"loss": 3.3765, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.5708512250568325, |
|
"grad_norm": 1431520.25, |
|
"learning_rate": 2.1457438747158374e-05, |
|
"loss": 3.1066, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.5733771154331903, |
|
"grad_norm": 793628.1875, |
|
"learning_rate": 2.1331144228340493e-05, |
|
"loss": 3.5486, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.5759030058095479, |
|
"grad_norm": 935731.3125, |
|
"learning_rate": 2.1204849709522608e-05, |
|
"loss": 3.427, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.5784288961859055, |
|
"grad_norm": 1024607.625, |
|
"learning_rate": 2.1078555190704723e-05, |
|
"loss": 3.2279, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.5809547865622632, |
|
"grad_norm": 1027219.3125, |
|
"learning_rate": 2.0952260671886842e-05, |
|
"loss": 3.7803, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5834806769386208, |
|
"grad_norm": 762017.8125, |
|
"learning_rate": 2.0825966153068957e-05, |
|
"loss": 3.7272, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.5860065673149786, |
|
"grad_norm": 1251829.125, |
|
"learning_rate": 2.0699671634251076e-05, |
|
"loss": 3.6896, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.5885324576913362, |
|
"grad_norm": 841272.5, |
|
"learning_rate": 2.057337711543319e-05, |
|
"loss": 3.3452, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.5910583480676939, |
|
"grad_norm": 1126517.625, |
|
"learning_rate": 2.0447082596615307e-05, |
|
"loss": 3.386, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.5935842384440515, |
|
"grad_norm": 982217.3125, |
|
"learning_rate": 2.0320788077797426e-05, |
|
"loss": 3.7347, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.5961101288204091, |
|
"grad_norm": 644312.75, |
|
"learning_rate": 2.019449355897954e-05, |
|
"loss": 3.1686, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.5986360191967669, |
|
"grad_norm": 1135732.625, |
|
"learning_rate": 2.006819904016166e-05, |
|
"loss": 3.6648, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.6011619095731245, |
|
"grad_norm": 886122.0, |
|
"learning_rate": 1.9941904521343775e-05, |
|
"loss": 3.3114, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.6036877999494822, |
|
"grad_norm": 1676023.875, |
|
"learning_rate": 1.981561000252589e-05, |
|
"loss": 3.8232, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.6062136903258398, |
|
"grad_norm": 1056139.0, |
|
"learning_rate": 1.9689315483708006e-05, |
|
"loss": 3.3829, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6087395807021975, |
|
"grad_norm": 1303276.625, |
|
"learning_rate": 1.9563020964890125e-05, |
|
"loss": 3.7356, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.6112654710785552, |
|
"grad_norm": 1168222.5, |
|
"learning_rate": 1.9436726446072244e-05, |
|
"loss": 3.4672, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.6137913614549129, |
|
"grad_norm": 974863.5, |
|
"learning_rate": 1.931043192725436e-05, |
|
"loss": 3.7479, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.6163172518312705, |
|
"grad_norm": 727352.375, |
|
"learning_rate": 1.9184137408436474e-05, |
|
"loss": 3.4183, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.6188431422076281, |
|
"grad_norm": 1103530.0, |
|
"learning_rate": 1.905784288961859e-05, |
|
"loss": 3.5818, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.6213690325839859, |
|
"grad_norm": 866723.3125, |
|
"learning_rate": 1.893154837080071e-05, |
|
"loss": 3.4123, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.6238949229603435, |
|
"grad_norm": 1022545.8125, |
|
"learning_rate": 1.8805253851982824e-05, |
|
"loss": 3.1739, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.6264208133367012, |
|
"grad_norm": 1114618.75, |
|
"learning_rate": 1.8678959333164943e-05, |
|
"loss": 3.6795, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.6289467037130588, |
|
"grad_norm": 1178612.875, |
|
"learning_rate": 1.8552664814347058e-05, |
|
"loss": 3.3061, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.6314725940894165, |
|
"grad_norm": 757466.0, |
|
"learning_rate": 1.8426370295529173e-05, |
|
"loss": 3.4692, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6339984844657742, |
|
"grad_norm": 932833.75, |
|
"learning_rate": 1.8300075776711292e-05, |
|
"loss": 3.3951, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.6365243748421319, |
|
"grad_norm": 811280.75, |
|
"learning_rate": 1.8173781257893408e-05, |
|
"loss": 3.42, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.6390502652184895, |
|
"grad_norm": 828370.875, |
|
"learning_rate": 1.8047486739075526e-05, |
|
"loss": 3.5068, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.6415761555948472, |
|
"grad_norm": 796540.6875, |
|
"learning_rate": 1.7921192220257642e-05, |
|
"loss": 3.4725, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.6441020459712048, |
|
"grad_norm": 1101170.0, |
|
"learning_rate": 1.7794897701439757e-05, |
|
"loss": 3.5315, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.6466279363475625, |
|
"grad_norm": 1084591.375, |
|
"learning_rate": 1.7668603182621876e-05, |
|
"loss": 3.3357, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.6491538267239202, |
|
"grad_norm": 846566.375, |
|
"learning_rate": 1.754230866380399e-05, |
|
"loss": 3.4352, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.6516797171002778, |
|
"grad_norm": 817152.9375, |
|
"learning_rate": 1.741601414498611e-05, |
|
"loss": 3.3913, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.6542056074766355, |
|
"grad_norm": 896931.875, |
|
"learning_rate": 1.7289719626168225e-05, |
|
"loss": 3.3725, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.6567314978529932, |
|
"grad_norm": 580791.0, |
|
"learning_rate": 1.716342510735034e-05, |
|
"loss": 3.3803, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6592573882293509, |
|
"grad_norm": 817709.375, |
|
"learning_rate": 1.7037130588532456e-05, |
|
"loss": 3.4564, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.6617832786057085, |
|
"grad_norm": 1013458.75, |
|
"learning_rate": 1.6910836069714575e-05, |
|
"loss": 3.4659, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.6643091689820662, |
|
"grad_norm": 855889.0625, |
|
"learning_rate": 1.6784541550896694e-05, |
|
"loss": 3.1951, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.6668350593584238, |
|
"grad_norm": 1114807.125, |
|
"learning_rate": 1.665824703207881e-05, |
|
"loss": 3.2604, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.6693609497347816, |
|
"grad_norm": 1472619.75, |
|
"learning_rate": 1.6531952513260925e-05, |
|
"loss": 3.2474, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.6718868401111392, |
|
"grad_norm": 933670.625, |
|
"learning_rate": 1.640565799444304e-05, |
|
"loss": 3.2033, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.6744127304874968, |
|
"grad_norm": 752061.375, |
|
"learning_rate": 1.627936347562516e-05, |
|
"loss": 3.5689, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.6769386208638545, |
|
"grad_norm": 1359021.375, |
|
"learning_rate": 1.6153068956807274e-05, |
|
"loss": 3.289, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.6794645112402121, |
|
"grad_norm": 755337.0625, |
|
"learning_rate": 1.6026774437989393e-05, |
|
"loss": 3.4877, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.6819904016165699, |
|
"grad_norm": 797617.3125, |
|
"learning_rate": 1.5900479919171508e-05, |
|
"loss": 3.4128, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.6845162919929275, |
|
"grad_norm": 754639.75, |
|
"learning_rate": 1.5774185400353624e-05, |
|
"loss": 3.4954, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.6870421823692852, |
|
"grad_norm": 853287.1875, |
|
"learning_rate": 1.5647890881535742e-05, |
|
"loss": 3.2564, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.6895680727456428, |
|
"grad_norm": 554583.1875, |
|
"learning_rate": 1.5521596362717858e-05, |
|
"loss": 3.2221, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.6920939631220006, |
|
"grad_norm": 722306.8125, |
|
"learning_rate": 1.5395301843899977e-05, |
|
"loss": 3.1547, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.6946198534983582, |
|
"grad_norm": 833465.875, |
|
"learning_rate": 1.5269007325082092e-05, |
|
"loss": 3.1672, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.6971457438747158, |
|
"grad_norm": 801353.3125, |
|
"learning_rate": 1.5142712806264209e-05, |
|
"loss": 3.4491, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.6996716342510735, |
|
"grad_norm": 621898.875, |
|
"learning_rate": 1.5016418287446326e-05, |
|
"loss": 3.4968, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.7021975246274311, |
|
"grad_norm": 926238.0625, |
|
"learning_rate": 1.4890123768628441e-05, |
|
"loss": 3.5613, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.7047234150037889, |
|
"grad_norm": 991762.4375, |
|
"learning_rate": 1.476382924981056e-05, |
|
"loss": 3.1534, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.7072493053801465, |
|
"grad_norm": 876887.3125, |
|
"learning_rate": 1.4637534730992674e-05, |
|
"loss": 3.1666, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.7097751957565042, |
|
"grad_norm": 748469.75, |
|
"learning_rate": 1.4511240212174793e-05, |
|
"loss": 3.3201, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.7123010861328618, |
|
"grad_norm": 913840.125, |
|
"learning_rate": 1.4384945693356908e-05, |
|
"loss": 3.3263, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.7148269765092194, |
|
"grad_norm": 776508.25, |
|
"learning_rate": 1.4258651174539025e-05, |
|
"loss": 3.3736, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.7173528668855772, |
|
"grad_norm": 1003093.6875, |
|
"learning_rate": 1.4132356655721144e-05, |
|
"loss": 3.3369, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.7198787572619348, |
|
"grad_norm": 480733.9375, |
|
"learning_rate": 1.400606213690326e-05, |
|
"loss": 3.4599, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.7224046476382925, |
|
"grad_norm": 921517.8125, |
|
"learning_rate": 1.3879767618085376e-05, |
|
"loss": 3.3327, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.7249305380146501, |
|
"grad_norm": 1106659.0, |
|
"learning_rate": 1.3753473099267492e-05, |
|
"loss": 3.4347, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.7274564283910079, |
|
"grad_norm": 506619.90625, |
|
"learning_rate": 1.3627178580449609e-05, |
|
"loss": 3.2675, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.7299823187673655, |
|
"grad_norm": 862991.0625, |
|
"learning_rate": 1.3500884061631724e-05, |
|
"loss": 3.2461, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.7325082091437232, |
|
"grad_norm": 835895.8125, |
|
"learning_rate": 1.3374589542813843e-05, |
|
"loss": 3.3976, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.7350340995200808, |
|
"grad_norm": 699905.9375, |
|
"learning_rate": 1.324829502399596e-05, |
|
"loss": 3.4889, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.7375599898964385, |
|
"grad_norm": 824201.75, |
|
"learning_rate": 1.3122000505178075e-05, |
|
"loss": 3.548, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.7400858802727962, |
|
"grad_norm": 871363.75, |
|
"learning_rate": 1.2995705986360193e-05, |
|
"loss": 3.5843, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.7426117706491538, |
|
"grad_norm": 627847.1875, |
|
"learning_rate": 1.2869411467542308e-05, |
|
"loss": 3.1553, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.7451376610255115, |
|
"grad_norm": 758722.4375, |
|
"learning_rate": 1.2743116948724427e-05, |
|
"loss": 3.4285, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.7476635514018691, |
|
"grad_norm": 851608.8125, |
|
"learning_rate": 1.2616822429906542e-05, |
|
"loss": 3.3232, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.7501894417782268, |
|
"grad_norm": 917633.8125, |
|
"learning_rate": 1.249052791108866e-05, |
|
"loss": 3.1254, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.7527153321545845, |
|
"grad_norm": 1110027.375, |
|
"learning_rate": 1.2364233392270776e-05, |
|
"loss": 3.4867, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.7552412225309422, |
|
"grad_norm": 817171.625, |
|
"learning_rate": 1.2237938873452892e-05, |
|
"loss": 3.4251, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.7577671129072998, |
|
"grad_norm": 1074504.25, |
|
"learning_rate": 1.211164435463501e-05, |
|
"loss": 3.4581, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7602930032836575, |
|
"grad_norm": 777395.4375, |
|
"learning_rate": 1.1985349835817126e-05, |
|
"loss": 3.3139, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.7628188936600152, |
|
"grad_norm": 657749.75, |
|
"learning_rate": 1.1859055316999243e-05, |
|
"loss": 3.2542, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.7653447840363728, |
|
"grad_norm": 536018.1875, |
|
"learning_rate": 1.173276079818136e-05, |
|
"loss": 3.4512, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.7678706744127305, |
|
"grad_norm": 1034483.5625, |
|
"learning_rate": 1.1606466279363475e-05, |
|
"loss": 3.3698, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.7703965647890881, |
|
"grad_norm": 1303588.5, |
|
"learning_rate": 1.1480171760545592e-05, |
|
"loss": 3.1435, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.7729224551654458, |
|
"grad_norm": 961475.75, |
|
"learning_rate": 1.135387724172771e-05, |
|
"loss": 3.1625, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.7754483455418035, |
|
"grad_norm": 811610.3125, |
|
"learning_rate": 1.1227582722909827e-05, |
|
"loss": 3.2445, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.7779742359181612, |
|
"grad_norm": 573133.25, |
|
"learning_rate": 1.1101288204091944e-05, |
|
"loss": 3.2385, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.7805001262945188, |
|
"grad_norm": 719160.0625, |
|
"learning_rate": 1.0974993685274059e-05, |
|
"loss": 3.4963, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.7830260166708765, |
|
"grad_norm": 625430.0625, |
|
"learning_rate": 1.0848699166456176e-05, |
|
"loss": 3.219, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.7855519070472341, |
|
"grad_norm": 847258.625, |
|
"learning_rate": 1.0722404647638293e-05, |
|
"loss": 3.3977, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.7880777974235919, |
|
"grad_norm": 909602.0625, |
|
"learning_rate": 1.0596110128820409e-05, |
|
"loss": 3.1271, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.7906036877999495, |
|
"grad_norm": 2303105.75, |
|
"learning_rate": 1.0469815610002526e-05, |
|
"loss": 3.1443, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.7931295781763071, |
|
"grad_norm": 627608.9375, |
|
"learning_rate": 1.0343521091184644e-05, |
|
"loss": 3.1124, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.7956554685526648, |
|
"grad_norm": 683938.75, |
|
"learning_rate": 1.021722657236676e-05, |
|
"loss": 3.5188, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.7981813589290225, |
|
"grad_norm": 742955.25, |
|
"learning_rate": 1.0090932053548877e-05, |
|
"loss": 3.3953, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.8007072493053802, |
|
"grad_norm": 871770.5, |
|
"learning_rate": 9.964637534730994e-06, |
|
"loss": 3.2334, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.8032331396817378, |
|
"grad_norm": 751897.5625, |
|
"learning_rate": 9.83834301591311e-06, |
|
"loss": 3.2724, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.8057590300580955, |
|
"grad_norm": 740156.9375, |
|
"learning_rate": 9.712048497095226e-06, |
|
"loss": 3.344, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.8082849204344531, |
|
"grad_norm": 634736.5, |
|
"learning_rate": 9.585753978277342e-06, |
|
"loss": 3.1028, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.8108108108108109, |
|
"grad_norm": 1092338.25, |
|
"learning_rate": 9.45945945945946e-06, |
|
"loss": 3.5916, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.8133367011871685, |
|
"grad_norm": 915638.875, |
|
"learning_rate": 9.333164940641578e-06, |
|
"loss": 3.126, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.8158625915635261, |
|
"grad_norm": 919722.3125, |
|
"learning_rate": 9.206870421823693e-06, |
|
"loss": 3.1032, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.8183884819398838, |
|
"grad_norm": 791748.625, |
|
"learning_rate": 9.08057590300581e-06, |
|
"loss": 3.4718, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.8209143723162414, |
|
"grad_norm": 607153.8125, |
|
"learning_rate": 8.954281384187927e-06, |
|
"loss": 2.989, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.8234402626925992, |
|
"grad_norm": 800489.625, |
|
"learning_rate": 8.827986865370043e-06, |
|
"loss": 3.1845, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.8259661530689568, |
|
"grad_norm": 748984.9375, |
|
"learning_rate": 8.70169234655216e-06, |
|
"loss": 3.2721, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.8284920434453145, |
|
"grad_norm": 686388.0625, |
|
"learning_rate": 8.575397827734277e-06, |
|
"loss": 3.178, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.8310179338216721, |
|
"grad_norm": 634646.5, |
|
"learning_rate": 8.449103308916394e-06, |
|
"loss": 3.1601, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.8335438241980299, |
|
"grad_norm": 973350.375, |
|
"learning_rate": 8.322808790098511e-06, |
|
"loss": 3.1181, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.8360697145743875, |
|
"grad_norm": 802978.1875, |
|
"learning_rate": 8.196514271280626e-06, |
|
"loss": 3.1694, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.8385956049507451, |
|
"grad_norm": 543352.25, |
|
"learning_rate": 8.070219752462743e-06, |
|
"loss": 3.0763, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.8411214953271028, |
|
"grad_norm": 965499.0625, |
|
"learning_rate": 7.94392523364486e-06, |
|
"loss": 3.3403, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.8436473857034604, |
|
"grad_norm": 1031962.0, |
|
"learning_rate": 7.817630714826977e-06, |
|
"loss": 3.1295, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.8461732760798182, |
|
"grad_norm": 900595.875, |
|
"learning_rate": 7.691336196009095e-06, |
|
"loss": 3.0841, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.8486991664561758, |
|
"grad_norm": 878680.5625, |
|
"learning_rate": 7.565041677191211e-06, |
|
"loss": 3.2706, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.8512250568325335, |
|
"grad_norm": 541207.75, |
|
"learning_rate": 7.438747158373327e-06, |
|
"loss": 3.2398, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.8537509472088911, |
|
"grad_norm": 689078.875, |
|
"learning_rate": 7.312452639555443e-06, |
|
"loss": 3.0991, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.8562768375852488, |
|
"grad_norm": 831574.1875, |
|
"learning_rate": 7.18615812073756e-06, |
|
"loss": 3.2631, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.8588027279616065, |
|
"grad_norm": 777584.0, |
|
"learning_rate": 7.0598636019196766e-06, |
|
"loss": 3.4074, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.8613286183379641, |
|
"grad_norm": 742726.625, |
|
"learning_rate": 6.9335690831017945e-06, |
|
"loss": 3.166, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.8638545087143218, |
|
"grad_norm": 580725.0625, |
|
"learning_rate": 6.807274564283911e-06, |
|
"loss": 3.1598, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.8663803990906794, |
|
"grad_norm": 588559.3125, |
|
"learning_rate": 6.680980045466027e-06, |
|
"loss": 3.0765, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.8689062894670372, |
|
"grad_norm": 512794.3125, |
|
"learning_rate": 6.554685526648144e-06, |
|
"loss": 3.3832, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.8714321798433948, |
|
"grad_norm": 876623.1875, |
|
"learning_rate": 6.42839100783026e-06, |
|
"loss": 3.4546, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.8739580702197525, |
|
"grad_norm": 716818.125, |
|
"learning_rate": 6.3020964890123765e-06, |
|
"loss": 3.2477, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.8764839605961101, |
|
"grad_norm": 1114704.75, |
|
"learning_rate": 6.175801970194494e-06, |
|
"loss": 3.1437, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.8790098509724678, |
|
"grad_norm": 639491.625, |
|
"learning_rate": 6.049507451376611e-06, |
|
"loss": 3.1617, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.8815357413488255, |
|
"grad_norm": 717235.0625, |
|
"learning_rate": 5.923212932558727e-06, |
|
"loss": 3.1024, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.8840616317251832, |
|
"grad_norm": 736368.0, |
|
"learning_rate": 5.796918413740844e-06, |
|
"loss": 3.3481, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.8865875221015408, |
|
"grad_norm": 527303.1875, |
|
"learning_rate": 5.670623894922961e-06, |
|
"loss": 3.0436, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.8891134124778984, |
|
"grad_norm": 1408582.375, |
|
"learning_rate": 5.544329376105077e-06, |
|
"loss": 3.3846, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.8916393028542561, |
|
"grad_norm": 557347.875, |
|
"learning_rate": 5.418034857287194e-06, |
|
"loss": 3.3017, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.8941651932306138, |
|
"grad_norm": 533418.25, |
|
"learning_rate": 5.2917403384693106e-06, |
|
"loss": 2.9057, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.8966910836069715, |
|
"grad_norm": 940764.0, |
|
"learning_rate": 5.165445819651428e-06, |
|
"loss": 2.9862, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.8992169739833291, |
|
"grad_norm": 753501.4375, |
|
"learning_rate": 5.039151300833544e-06, |
|
"loss": 3.0074, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.9017428643596868, |
|
"grad_norm": 641765.1875, |
|
"learning_rate": 4.912856782015661e-06, |
|
"loss": 3.2161, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.9042687547360445, |
|
"grad_norm": 846279.5625, |
|
"learning_rate": 4.786562263197777e-06, |
|
"loss": 3.1604, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.9067946451124022, |
|
"grad_norm": 623566.875, |
|
"learning_rate": 4.660267744379894e-06, |
|
"loss": 3.1331, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.9093205354887598, |
|
"grad_norm": 711430.625, |
|
"learning_rate": 4.533973225562011e-06, |
|
"loss": 3.2304, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.9118464258651174, |
|
"grad_norm": 656724.5, |
|
"learning_rate": 4.4076787067441276e-06, |
|
"loss": 3.4538, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.9143723162414751, |
|
"grad_norm": 595240.0, |
|
"learning_rate": 4.281384187926244e-06, |
|
"loss": 3.2896, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.9168982066178328, |
|
"grad_norm": 662666.375, |
|
"learning_rate": 4.155089669108361e-06, |
|
"loss": 3.0247, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.9194240969941905, |
|
"grad_norm": 671792.5625, |
|
"learning_rate": 4.028795150290478e-06, |
|
"loss": 3.2846, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.9219499873705481, |
|
"grad_norm": 946814.3125, |
|
"learning_rate": 3.902500631472594e-06, |
|
"loss": 3.2046, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.9244758777469058, |
|
"grad_norm": 795244.75, |
|
"learning_rate": 3.776206112654711e-06, |
|
"loss": 3.1295, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.9270017681232634, |
|
"grad_norm": 670945.125, |
|
"learning_rate": 3.649911593836828e-06, |
|
"loss": 3.1636, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.9295276584996212, |
|
"grad_norm": 752728.5625, |
|
"learning_rate": 3.5236170750189446e-06, |
|
"loss": 3.3796, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.9320535488759788, |
|
"grad_norm": 798886.1875, |
|
"learning_rate": 3.3973225562010608e-06, |
|
"loss": 3.101, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.9345794392523364, |
|
"grad_norm": 470130.46875, |
|
"learning_rate": 3.2710280373831774e-06, |
|
"loss": 3.3339, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.9371053296286941, |
|
"grad_norm": 437737.59375, |
|
"learning_rate": 3.1447335185652945e-06, |
|
"loss": 3.0232, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.9396312200050517, |
|
"grad_norm": 497686.5625, |
|
"learning_rate": 3.018438999747411e-06, |
|
"loss": 3.2697, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.9421571103814095, |
|
"grad_norm": 851300.5, |
|
"learning_rate": 2.892144480929528e-06, |
|
"loss": 3.4282, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.9446830007577671, |
|
"grad_norm": 715128.5, |
|
"learning_rate": 2.7658499621116445e-06, |
|
"loss": 3.1052, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.9472088911341248, |
|
"grad_norm": 904129.3125, |
|
"learning_rate": 2.639555443293761e-06, |
|
"loss": 3.1504, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.9497347815104824, |
|
"grad_norm": 897233.625, |
|
"learning_rate": 2.5132609244758778e-06, |
|
"loss": 3.2904, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.9522606718868402, |
|
"grad_norm": 722250.1875, |
|
"learning_rate": 2.3869664056579944e-06, |
|
"loss": 3.1392, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.9547865622631978, |
|
"grad_norm": 721083.9375, |
|
"learning_rate": 2.2606718868401115e-06, |
|
"loss": 3.137, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.9573124526395554, |
|
"grad_norm": 637189.25, |
|
"learning_rate": 2.1343773680222277e-06, |
|
"loss": 3.1711, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.9598383430159131, |
|
"grad_norm": 858492.0, |
|
"learning_rate": 2.008082849204345e-06, |
|
"loss": 3.2843, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.9623642333922707, |
|
"grad_norm": 714733.625, |
|
"learning_rate": 1.8817883303864613e-06, |
|
"loss": 3.305, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.9648901237686285, |
|
"grad_norm": 655039.75, |
|
"learning_rate": 1.7554938115685781e-06, |
|
"loss": 3.4754, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.9674160141449861, |
|
"grad_norm": 537881.875, |
|
"learning_rate": 1.6291992927506948e-06, |
|
"loss": 3.1539, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.9699419045213438, |
|
"grad_norm": 654013.3125, |
|
"learning_rate": 1.5029047739328114e-06, |
|
"loss": 3.3037, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.9724677948977014, |
|
"grad_norm": 654474.0, |
|
"learning_rate": 1.376610255114928e-06, |
|
"loss": 3.1471, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.974993685274059, |
|
"grad_norm": 925252.9375, |
|
"learning_rate": 1.2503157362970447e-06, |
|
"loss": 3.0344, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.9775195756504168, |
|
"grad_norm": 690294.6875, |
|
"learning_rate": 1.1240212174791616e-06, |
|
"loss": 3.0319, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.9800454660267744, |
|
"grad_norm": 654045.1875, |
|
"learning_rate": 9.977266986612783e-07, |
|
"loss": 3.2279, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.9825713564031321, |
|
"grad_norm": 593023.5, |
|
"learning_rate": 8.714321798433948e-07, |
|
"loss": 3.3599, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.9850972467794897, |
|
"grad_norm": 451262.59375, |
|
"learning_rate": 7.451376610255115e-07, |
|
"loss": 2.9683, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.9876231371558475, |
|
"grad_norm": 814919.1875, |
|
"learning_rate": 6.188431422076282e-07, |
|
"loss": 3.4731, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.9901490275322051, |
|
"grad_norm": 596075.0, |
|
"learning_rate": 4.925486233897449e-07, |
|
"loss": 3.0758, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.9926749179085628, |
|
"grad_norm": 436784.15625, |
|
"learning_rate": 3.6625410457186164e-07, |
|
"loss": 3.0476, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.9952008082849204, |
|
"grad_norm": 646142.9375, |
|
"learning_rate": 2.399595857539783e-07, |
|
"loss": 3.0169, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.997726698661278, |
|
"grad_norm": 558096.9375, |
|
"learning_rate": 1.1366506693609499e-07, |
|
"loss": 3.1317, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 3959, |
|
"total_flos": 2.3966994296890982e+17, |
|
"train_loss": 3.909515940200563, |
|
"train_runtime": 39767.0605, |
|
"train_samples_per_second": 0.398, |
|
"train_steps_per_second": 0.1 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3959, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 3959, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.3966994296890982e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|