{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001,
      "grad_norm": 1.7063226699829102,
      "learning_rate": 4.995e-05,
      "loss": 9.6305,
      "step": 10
    },
    {
      "epoch": 0.002,
      "grad_norm": 1.467505693435669,
      "learning_rate": 4.99e-05,
      "loss": 8.8474,
      "step": 20
    },
    {
      "epoch": 0.003,
      "grad_norm": 1.3338744640350342,
      "learning_rate": 4.9850000000000006e-05,
      "loss": 8.4272,
      "step": 30
    },
    {
      "epoch": 0.004,
      "grad_norm": 1.194218635559082,
      "learning_rate": 4.9800000000000004e-05,
      "loss": 7.9969,
      "step": 40
    },
    {
      "epoch": 0.005,
      "grad_norm": 0.9542586207389832,
      "learning_rate": 4.975e-05,
      "loss": 7.8018,
      "step": 50
    },
    {
      "epoch": 0.006,
      "grad_norm": 0.8312947154045105,
      "learning_rate": 4.97e-05,
      "loss": 7.5303,
      "step": 60
    },
    {
      "epoch": 0.007,
      "grad_norm": 0.6978892683982849,
      "learning_rate": 4.965e-05,
      "loss": 7.3733,
      "step": 70
    },
    {
      "epoch": 0.008,
      "grad_norm": 0.6895764470100403,
      "learning_rate": 4.96e-05,
      "loss": 7.2434,
      "step": 80
    },
    {
      "epoch": 0.009,
      "grad_norm": 0.5555976033210754,
      "learning_rate": 4.9550000000000005e-05,
      "loss": 7.0877,
      "step": 90
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.836391806602478,
      "learning_rate": 4.9500000000000004e-05,
      "loss": 7.0338,
      "step": 100
    },
    {
      "epoch": 0.011,
      "grad_norm": 0.782464861869812,
      "learning_rate": 4.945e-05,
      "loss": 6.878,
      "step": 110
    },
    {
      "epoch": 0.012,
      "grad_norm": 1.3705933094024658,
      "learning_rate": 4.94e-05,
      "loss": 6.5874,
      "step": 120
    },
    {
      "epoch": 0.013,
      "grad_norm": 0.7560775876045227,
      "learning_rate": 4.935e-05,
      "loss": 6.4978,
      "step": 130
    },
    {
      "epoch": 0.014,
      "grad_norm": 1.3238508701324463,
      "learning_rate": 4.93e-05,
      "loss": 6.3998,
      "step": 140
    },
    {
      "epoch": 0.015,
      "grad_norm": 0.7834548950195312,
      "learning_rate": 4.9250000000000004e-05,
      "loss": 6.2838,
      "step": 150
    },
    {
      "epoch": 0.016,
      "grad_norm": 0.762347400188446,
      "learning_rate": 4.92e-05,
      "loss": 6.0387,
      "step": 160
    },
    {
      "epoch": 0.017,
      "grad_norm": 0.7799501419067383,
      "learning_rate": 4.915e-05,
      "loss": 6.0241,
      "step": 170
    },
    {
      "epoch": 0.018,
      "grad_norm": 0.7948866486549377,
      "learning_rate": 4.91e-05,
      "loss": 5.8776,
      "step": 180
    },
    {
      "epoch": 0.019,
      "grad_norm": 0.9890483021736145,
      "learning_rate": 4.905e-05,
      "loss": 5.747,
      "step": 190
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.9131263494491577,
      "learning_rate": 4.9e-05,
      "loss": 5.644,
      "step": 200
    },
    {
      "epoch": 0.021,
      "grad_norm": 1.7073436975479126,
      "learning_rate": 4.8950000000000004e-05,
      "loss": 5.778,
      "step": 210
    },
    {
      "epoch": 0.022,
      "grad_norm": 0.8059922456741333,
      "learning_rate": 4.89e-05,
      "loss": 5.4755,
      "step": 220
    },
    {
      "epoch": 0.023,
      "grad_norm": 1.2500686645507812,
      "learning_rate": 4.885e-05,
      "loss": 5.3769,
      "step": 230
    },
    {
      "epoch": 0.024,
      "grad_norm": 1.3848680257797241,
      "learning_rate": 4.88e-05,
      "loss": 5.2105,
      "step": 240
    },
    {
      "epoch": 0.025,
      "grad_norm": 1.2381746768951416,
      "learning_rate": 4.875e-05,
      "loss": 5.1444,
      "step": 250
    },
    {
      "epoch": 0.026,
      "grad_norm": 2.7005224227905273,
      "learning_rate": 4.87e-05,
      "loss": 5.1608,
      "step": 260
    },
    {
      "epoch": 0.027,
      "grad_norm": 1.1472671031951904,
      "learning_rate": 4.8650000000000003e-05,
      "loss": 4.9456,
      "step": 270
    },
    {
      "epoch": 0.028,
      "grad_norm": 1.9849270582199097,
      "learning_rate": 4.86e-05,
      "loss": 4.8466,
      "step": 280
    },
    {
      "epoch": 0.029,
      "grad_norm": 1.857001781463623,
      "learning_rate": 4.855e-05,
      "loss": 4.7323,
      "step": 290
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.6731220483779907,
      "learning_rate": 4.85e-05,
      "loss": 4.5786,
      "step": 300
    },
    {
      "epoch": 0.031,
      "grad_norm": 1.7968906164169312,
      "learning_rate": 4.845e-05,
      "loss": 4.4588,
      "step": 310
    },
    {
      "epoch": 0.032,
      "grad_norm": 1.7908226251602173,
      "learning_rate": 4.8400000000000004e-05,
      "loss": 4.3645,
      "step": 320
    },
    {
      "epoch": 0.033,
      "grad_norm": 2.538881540298462,
      "learning_rate": 4.835e-05,
      "loss": 4.1489,
      "step": 330
    },
    {
      "epoch": 0.034,
      "grad_norm": 2.306257486343384,
      "learning_rate": 4.83e-05,
      "loss": 3.9798,
      "step": 340
    },
    {
      "epoch": 0.035,
      "grad_norm": 2.1730940341949463,
      "learning_rate": 4.825e-05,
      "loss": 4.0231,
      "step": 350
    },
    {
      "epoch": 0.036,
      "grad_norm": 2.4211463928222656,
      "learning_rate": 4.82e-05,
      "loss": 3.8495,
      "step": 360
    },
    {
      "epoch": 0.037,
      "grad_norm": 2.3698794841766357,
      "learning_rate": 4.815e-05,
      "loss": 3.6977,
      "step": 370
    },
    {
      "epoch": 0.038,
      "grad_norm": 2.147799491882324,
      "learning_rate": 4.8100000000000004e-05,
      "loss": 3.8008,
      "step": 380
    },
    {
      "epoch": 0.039,
      "grad_norm": 2.3577606678009033,
      "learning_rate": 4.805e-05,
      "loss": 3.6983,
      "step": 390
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.065912961959839,
      "learning_rate": 4.8e-05,
      "loss": 3.5738,
      "step": 400
    },
    {
      "epoch": 0.041,
      "grad_norm": 2.930288314819336,
      "learning_rate": 4.795e-05,
      "loss": 3.5117,
      "step": 410
    },
    {
      "epoch": 0.042,
      "grad_norm": 2.3703155517578125,
      "learning_rate": 4.79e-05,
      "loss": 3.2483,
      "step": 420
    },
    {
      "epoch": 0.043,
      "grad_norm": 2.6050736904144287,
      "learning_rate": 4.785e-05,
      "loss": 3.2342,
      "step": 430
    },
    {
      "epoch": 0.044,
      "grad_norm": 2.0790674686431885,
      "learning_rate": 4.78e-05,
      "loss": 3.1452,
      "step": 440
    },
    {
      "epoch": 0.045,
      "grad_norm": 2.2497427463531494,
      "learning_rate": 4.775e-05,
      "loss": 3.0316,
      "step": 450
    },
    {
      "epoch": 0.046,
      "grad_norm": 2.507902145385742,
      "learning_rate": 4.77e-05,
      "loss": 2.8938,
      "step": 460
    },
    {
      "epoch": 0.047,
      "grad_norm": 2.517744541168213,
      "learning_rate": 4.765e-05,
      "loss": 2.8137,
      "step": 470
    },
    {
      "epoch": 0.048,
      "grad_norm": 3.9981460571289062,
      "learning_rate": 4.76e-05,
      "loss": 2.9864,
      "step": 480
    },
    {
      "epoch": 0.049,
      "grad_norm": 2.265026569366455,
      "learning_rate": 4.755e-05,
      "loss": 2.7839,
      "step": 490
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.257293701171875,
      "learning_rate": 4.75e-05,
      "loss": 2.6834,
      "step": 500
    },
    {
      "epoch": 0.051,
      "grad_norm": 2.6932270526885986,
      "learning_rate": 4.745e-05,
      "loss": 2.5755,
      "step": 510
    },
    {
      "epoch": 0.052,
      "grad_norm": 1.7177081108093262,
      "learning_rate": 4.74e-05,
      "loss": 2.425,
      "step": 520
    },
    {
      "epoch": 0.053,
      "grad_norm": 2.2452073097229004,
      "learning_rate": 4.735e-05,
      "loss": 2.5261,
      "step": 530
    },
    {
      "epoch": 0.054,
      "grad_norm": 2.2109947204589844,
      "learning_rate": 4.73e-05,
      "loss": 2.3825,
      "step": 540
    },
    {
      "epoch": 0.055,
      "grad_norm": 2.574531078338623,
      "learning_rate": 4.7249999999999997e-05,
      "loss": 2.3087,
      "step": 550
    },
    {
      "epoch": 0.056,
      "grad_norm": 2.3631017208099365,
      "learning_rate": 4.72e-05,
      "loss": 2.3099,
      "step": 560
    },
    {
      "epoch": 0.057,
      "grad_norm": 2.3809709548950195,
      "learning_rate": 4.715e-05,
      "loss": 2.3001,
      "step": 570
    },
    {
      "epoch": 0.058,
      "grad_norm": 2.0683534145355225,
      "learning_rate": 4.71e-05,
      "loss": 2.0813,
      "step": 580
    },
    {
      "epoch": 0.059,
      "grad_norm": 2.5471837520599365,
      "learning_rate": 4.705e-05,
      "loss": 2.0378,
      "step": 590
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.585564374923706,
      "learning_rate": 4.7e-05,
      "loss": 2.2062,
      "step": 600
    },
    {
      "epoch": 0.061,
      "grad_norm": 2.062100648880005,
      "learning_rate": 4.695e-05,
      "loss": 1.9914,
      "step": 610
    },
    {
      "epoch": 0.062,
      "grad_norm": 2.1019210815429688,
      "learning_rate": 4.69e-05,
      "loss": 1.9635,
      "step": 620
    },
    {
      "epoch": 0.063,
      "grad_norm": 2.630436658859253,
      "learning_rate": 4.685000000000001e-05,
      "loss": 1.9123,
      "step": 630
    },
    {
      "epoch": 0.064,
      "grad_norm": 2.1028494834899902,
      "learning_rate": 4.6800000000000006e-05,
      "loss": 1.7583,
      "step": 640
    },
    {
      "epoch": 0.065,
      "grad_norm": 2.392193078994751,
      "learning_rate": 4.6750000000000005e-05,
      "loss": 1.7532,
      "step": 650
    },
    {
      "epoch": 0.066,
      "grad_norm": 2.004413366317749,
      "learning_rate": 4.6700000000000003e-05,
      "loss": 1.6978,
      "step": 660
    },
    {
      "epoch": 0.067,
      "grad_norm": 2.210513114929199,
      "learning_rate": 4.665e-05,
      "loss": 1.6311,
      "step": 670
    },
    {
      "epoch": 0.068,
      "grad_norm": 1.8464936017990112,
      "learning_rate": 4.660000000000001e-05,
      "loss": 1.5507,
      "step": 680
    },
    {
      "epoch": 0.069,
      "grad_norm": 2.0246541500091553,
      "learning_rate": 4.655000000000001e-05,
      "loss": 1.5637,
      "step": 690
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.199751138687134,
      "learning_rate": 4.6500000000000005e-05,
      "loss": 1.5603,
      "step": 700
    },
    {
      "epoch": 0.071,
      "grad_norm": 2.2002196311950684,
      "learning_rate": 4.6450000000000004e-05,
      "loss": 1.4558,
      "step": 710
    },
    {
      "epoch": 0.072,
      "grad_norm": 1.7826759815216064,
      "learning_rate": 4.64e-05,
      "loss": 1.4309,
      "step": 720
    },
    {
      "epoch": 0.073,
      "grad_norm": 1.760297417640686,
      "learning_rate": 4.635e-05,
      "loss": 1.3531,
      "step": 730
    },
    {
      "epoch": 0.074,
      "grad_norm": 2.0505475997924805,
      "learning_rate": 4.630000000000001e-05,
      "loss": 1.3641,
      "step": 740
    },
    {
      "epoch": 0.075,
      "grad_norm": 2.1375396251678467,
      "learning_rate": 4.6250000000000006e-05,
      "loss": 1.3259,
      "step": 750
    },
    {
      "epoch": 0.076,
      "grad_norm": 1.8252328634262085,
      "learning_rate": 4.6200000000000005e-05,
      "loss": 1.2026,
      "step": 760
    },
    {
      "epoch": 0.077,
      "grad_norm": 1.8945906162261963,
      "learning_rate": 4.6150000000000004e-05,
      "loss": 1.2878,
      "step": 770
    },
    {
      "epoch": 0.078,
      "grad_norm": 1.7990881204605103,
      "learning_rate": 4.61e-05,
      "loss": 1.1853,
      "step": 780
    },
    {
      "epoch": 0.079,
      "grad_norm": 1.4897470474243164,
      "learning_rate": 4.605e-05,
      "loss": 1.1279,
      "step": 790
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.2804617881774902,
      "learning_rate": 4.600000000000001e-05,
      "loss": 1.0804,
      "step": 800
    },
    {
      "epoch": 0.081,
      "grad_norm": 1.4800664186477661,
      "learning_rate": 4.5950000000000006e-05,
      "loss": 1.0361,
      "step": 810
    },
    {
      "epoch": 0.082,
      "grad_norm": 1.3526049852371216,
      "learning_rate": 4.5900000000000004e-05,
      "loss": 1.0585,
      "step": 820
    },
    {
      "epoch": 0.083,
      "grad_norm": 1.534173607826233,
      "learning_rate": 4.585e-05,
      "loss": 1.0206,
      "step": 830
    },
    {
      "epoch": 0.084,
      "grad_norm": 1.4844435453414917,
      "learning_rate": 4.58e-05,
      "loss": 0.9758,
      "step": 840
    },
    {
      "epoch": 0.085,
      "grad_norm": 1.533679485321045,
      "learning_rate": 4.575e-05,
      "loss": 0.9168,
      "step": 850
    },
    {
      "epoch": 0.086,
      "grad_norm": 1.456162691116333,
      "learning_rate": 4.5700000000000006e-05,
      "loss": 0.8913,
      "step": 860
    },
    {
      "epoch": 0.087,
      "grad_norm": 1.7335631847381592,
      "learning_rate": 4.5650000000000005e-05,
      "loss": 0.9154,
      "step": 870
    },
    {
      "epoch": 0.088,
      "grad_norm": 1.3331761360168457,
      "learning_rate": 4.5600000000000004e-05,
      "loss": 0.8483,
      "step": 880
    },
    {
      "epoch": 0.089,
      "grad_norm": 1.6703053712844849,
      "learning_rate": 4.555e-05,
      "loss": 0.8116,
      "step": 890
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.275975227355957,
      "learning_rate": 4.55e-05,
      "loss": 0.7869,
      "step": 900
    },
    {
      "epoch": 0.091,
      "grad_norm": 1.3800309896469116,
      "learning_rate": 4.545000000000001e-05,
      "loss": 0.7637,
      "step": 910
    },
    {
      "epoch": 0.092,
      "grad_norm": 1.9472386837005615,
      "learning_rate": 4.5400000000000006e-05,
      "loss": 0.7212,
      "step": 920
    },
    {
      "epoch": 0.093,
      "grad_norm": 1.3451333045959473,
      "learning_rate": 4.5350000000000005e-05,
      "loss": 0.6829,
      "step": 930
    },
    {
      "epoch": 0.094,
      "grad_norm": 1.5209784507751465,
      "learning_rate": 4.53e-05,
      "loss": 0.729,
      "step": 940
    },
    {
      "epoch": 0.095,
      "grad_norm": 1.3944469690322876,
      "learning_rate": 4.525e-05,
      "loss": 0.6732,
      "step": 950
    },
    {
      "epoch": 0.096,
      "grad_norm": 1.2177132368087769,
      "learning_rate": 4.52e-05,
      "loss": 0.6188,
      "step": 960
    },
    {
      "epoch": 0.097,
      "grad_norm": 1.5988528728485107,
      "learning_rate": 4.5150000000000006e-05,
      "loss": 0.6622,
      "step": 970
    },
    {
      "epoch": 0.098,
      "grad_norm": 1.3636531829833984,
      "learning_rate": 4.5100000000000005e-05,
      "loss": 0.5792,
      "step": 980
    },
    {
      "epoch": 0.099,
      "grad_norm": 1.377453088760376,
      "learning_rate": 4.5050000000000004e-05,
      "loss": 0.6062,
      "step": 990
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.295713186264038,
      "learning_rate": 4.5e-05,
      "loss": 0.5709,
      "step": 1000
    },
    {
      "epoch": 0.101,
      "grad_norm": 1.35196852684021,
      "learning_rate": 4.495e-05,
      "loss": 0.5521,
      "step": 1010
    },
    {
      "epoch": 0.102,
      "grad_norm": 1.0617187023162842,
      "learning_rate": 4.49e-05,
      "loss": 0.5147,
      "step": 1020
    },
    {
      "epoch": 0.103,
      "grad_norm": 1.3035167455673218,
      "learning_rate": 4.4850000000000006e-05,
      "loss": 0.5081,
      "step": 1030
    },
    {
      "epoch": 0.104,
      "grad_norm": 1.2835568189620972,
      "learning_rate": 4.4800000000000005e-05,
      "loss": 0.5,
      "step": 1040
    },
    {
      "epoch": 0.105,
      "grad_norm": 1.0403038263320923,
      "learning_rate": 4.4750000000000004e-05,
      "loss": 0.4825,
      "step": 1050
    },
    {
      "epoch": 0.106,
      "grad_norm": 0.9538235068321228,
      "learning_rate": 4.47e-05,
      "loss": 0.4316,
      "step": 1060
    },
    {
      "epoch": 0.107,
      "grad_norm": 1.4246289730072021,
      "learning_rate": 4.465e-05,
      "loss": 0.4304,
      "step": 1070
    },
    {
      "epoch": 0.108,
      "grad_norm": 1.1217833757400513,
      "learning_rate": 4.46e-05,
      "loss": 0.4397,
      "step": 1080
    },
    {
      "epoch": 0.109,
      "grad_norm": 1.0411335229873657,
      "learning_rate": 4.4550000000000005e-05,
      "loss": 0.4057,
      "step": 1090
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.8498069643974304,
      "learning_rate": 4.4500000000000004e-05,
      "loss": 0.3933,
      "step": 1100
    },
    {
      "epoch": 0.111,
      "grad_norm": 1.1270406246185303,
      "learning_rate": 4.445e-05,
      "loss": 0.366,
      "step": 1110
    },
    {
      "epoch": 0.112,
      "grad_norm": 1.189041256904602,
      "learning_rate": 4.44e-05,
      "loss": 0.3407,
      "step": 1120
    },
    {
      "epoch": 0.113,
      "grad_norm": 0.9837467670440674,
      "learning_rate": 4.435e-05,
      "loss": 0.3511,
      "step": 1130
    },
    {
      "epoch": 0.114,
      "grad_norm": 1.0432955026626587,
      "learning_rate": 4.43e-05,
      "loss": 0.3381,
      "step": 1140
    },
    {
      "epoch": 0.115,
      "grad_norm": 0.9529951810836792,
      "learning_rate": 4.4250000000000005e-05,
      "loss": 0.3189,
      "step": 1150
    },
    {
      "epoch": 0.116,
      "grad_norm": 1.008836030960083,
      "learning_rate": 4.4200000000000004e-05,
      "loss": 0.3077,
      "step": 1160
    },
    {
      "epoch": 0.117,
      "grad_norm": 1.0005086660385132,
      "learning_rate": 4.415e-05,
      "loss": 0.3001,
      "step": 1170
    },
    {
      "epoch": 0.118,
      "grad_norm": 1.1065175533294678,
      "learning_rate": 4.41e-05,
      "loss": 0.28,
      "step": 1180
    },
    {
      "epoch": 0.119,
      "grad_norm": 0.6701949834823608,
      "learning_rate": 4.405e-05,
      "loss": 0.2692,
      "step": 1190
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.7154658436775208,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 0.2663,
      "step": 1200
    },
    {
      "epoch": 0.121,
      "grad_norm": 0.6997113823890686,
      "learning_rate": 4.3950000000000004e-05,
      "loss": 0.2595,
      "step": 1210
    },
    {
      "epoch": 0.122,
      "grad_norm": 0.9047608971595764,
      "learning_rate": 4.39e-05,
      "loss": 0.2558,
      "step": 1220
    },
    {
      "epoch": 0.123,
      "grad_norm": 0.8508415222167969,
      "learning_rate": 4.385e-05,
      "loss": 0.2459,
      "step": 1230
    },
    {
      "epoch": 0.124,
      "grad_norm": 0.6505220532417297,
      "learning_rate": 4.38e-05,
      "loss": 0.2236,
      "step": 1240
    },
    {
      "epoch": 0.125,
      "grad_norm": 0.5360460877418518,
      "learning_rate": 4.375e-05,
      "loss": 0.2189,
      "step": 1250
    },
    {
      "epoch": 0.126,
      "grad_norm": 0.560817539691925,
      "learning_rate": 4.3700000000000005e-05,
      "loss": 0.2166,
      "step": 1260
    },
    {
      "epoch": 0.127,
      "grad_norm": 0.7089666128158569,
      "learning_rate": 4.3650000000000004e-05,
      "loss": 0.2026,
      "step": 1270
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.5265817046165466,
      "learning_rate": 4.36e-05,
      "loss": 0.197,
      "step": 1280
    },
    {
      "epoch": 0.129,
      "grad_norm": 0.6629377007484436,
      "learning_rate": 4.355e-05,
      "loss": 0.1934,
      "step": 1290
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.0730735063552856,
      "learning_rate": 4.35e-05,
      "loss": 0.1807,
      "step": 1300
    },
    {
      "epoch": 0.131,
      "grad_norm": 0.6990699172019958,
      "learning_rate": 4.345e-05,
      "loss": 0.1845,
      "step": 1310
    },
    {
      "epoch": 0.132,
      "grad_norm": 0.5047340393066406,
      "learning_rate": 4.3400000000000005e-05,
      "loss": 0.1725,
      "step": 1320
    },
    {
      "epoch": 0.133,
      "grad_norm": 0.6830994486808777,
      "learning_rate": 4.335e-05,
      "loss": 0.1687,
      "step": 1330
    },
    {
      "epoch": 0.134,
      "grad_norm": 0.5861710906028748,
      "learning_rate": 4.33e-05,
      "loss": 0.1671,
      "step": 1340
    },
    {
      "epoch": 0.135,
      "grad_norm": 0.43594300746917725,
      "learning_rate": 4.325e-05,
      "loss": 0.1467,
      "step": 1350
    },
    {
      "epoch": 0.136,
      "grad_norm": 0.44587692618370056,
      "learning_rate": 4.32e-05,
      "loss": 0.1509,
      "step": 1360
    },
    {
      "epoch": 0.137,
      "grad_norm": 0.5523977875709534,
      "learning_rate": 4.315e-05,
      "loss": 0.1434,
      "step": 1370
    },
    {
      "epoch": 0.138,
      "grad_norm": 0.6139170527458191,
      "learning_rate": 4.3100000000000004e-05,
      "loss": 0.1433,
      "step": 1380
    },
    {
      "epoch": 0.139,
      "grad_norm": 0.6169497966766357,
      "learning_rate": 4.305e-05,
      "loss": 0.1365,
      "step": 1390
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.49120134115219116,
      "learning_rate": 4.3e-05,
      "loss": 0.1287,
      "step": 1400
    },
    {
      "epoch": 0.141,
      "grad_norm": 0.451753169298172,
      "learning_rate": 4.295e-05,
      "loss": 0.1142,
      "step": 1410
    },
    {
      "epoch": 0.142,
      "grad_norm": 0.5429627895355225,
      "learning_rate": 4.29e-05,
      "loss": 0.134,
      "step": 1420
    },
    {
      "epoch": 0.143,
      "grad_norm": 0.7613041400909424,
      "learning_rate": 4.285e-05,
      "loss": 0.1391,
      "step": 1430
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.4953358471393585,
      "learning_rate": 4.2800000000000004e-05,
      "loss": 0.1197,
      "step": 1440
    },
    {
      "epoch": 0.145,
      "grad_norm": 0.3657626509666443,
      "learning_rate": 4.275e-05,
      "loss": 0.1071,
      "step": 1450
    },
    {
      "epoch": 0.146,
      "grad_norm": 0.44240206480026245,
      "learning_rate": 4.27e-05,
      "loss": 0.1111,
      "step": 1460
    },
    {
      "epoch": 0.147,
      "grad_norm": 0.5007165670394897,
      "learning_rate": 4.265e-05,
      "loss": 0.1056,
      "step": 1470
    },
    {
      "epoch": 0.148,
      "grad_norm": 0.4580256938934326,
      "learning_rate": 4.26e-05,
      "loss": 0.1049,
      "step": 1480
    },
    {
      "epoch": 0.149,
      "grad_norm": 0.4970822036266327,
      "learning_rate": 4.2550000000000004e-05,
      "loss": 0.1032,
      "step": 1490
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.4138182997703552,
      "learning_rate": 4.25e-05,
      "loss": 0.0961,
      "step": 1500
    },
    {
      "epoch": 0.151,
      "grad_norm": 0.4013712406158447,
      "learning_rate": 4.245e-05,
      "loss": 0.0949,
      "step": 1510
    },
    {
      "epoch": 0.152,
      "grad_norm": 0.3868940770626068,
      "learning_rate": 4.24e-05,
      "loss": 0.0837,
      "step": 1520
    },
    {
      "epoch": 0.153,
      "grad_norm": 0.3113015294075012,
      "learning_rate": 4.235e-05,
      "loss": 0.0909,
      "step": 1530
    },
    {
      "epoch": 0.154,
      "grad_norm": 0.3569623529911041,
      "learning_rate": 4.23e-05,
      "loss": 0.0908,
      "step": 1540
    },
    {
      "epoch": 0.155,
      "grad_norm": 0.3841746151447296,
      "learning_rate": 4.2250000000000004e-05,
      "loss": 0.0806,
      "step": 1550
    },
    {
      "epoch": 0.156,
      "grad_norm": 0.6565550565719604,
      "learning_rate": 4.22e-05,
      "loss": 0.075,
      "step": 1560
    },
    {
      "epoch": 0.157,
      "grad_norm": 0.4816874563694,
      "learning_rate": 4.215e-05,
      "loss": 0.0858,
      "step": 1570
    },
    {
      "epoch": 0.158,
      "grad_norm": 0.30408933758735657,
      "learning_rate": 4.21e-05,
      "loss": 0.0704,
      "step": 1580
    },
    {
      "epoch": 0.159,
      "grad_norm": 0.43388792872428894,
      "learning_rate": 4.205e-05,
      "loss": 0.0671,
      "step": 1590
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.33304253220558167,
      "learning_rate": 4.2e-05,
      "loss": 0.07,
      "step": 1600
    },
    {
      "epoch": 0.161,
      "grad_norm": 0.4260387420654297,
      "learning_rate": 4.195e-05,
      "loss": 0.0691,
      "step": 1610
    },
    {
      "epoch": 0.162,
      "grad_norm": 0.37930798530578613,
      "learning_rate": 4.19e-05,
      "loss": 0.0715,
      "step": 1620
    },
    {
      "epoch": 0.163,
      "grad_norm": 0.3198983669281006,
      "learning_rate": 4.185e-05,
      "loss": 0.0651,
      "step": 1630
    },
    {
      "epoch": 0.164,
      "grad_norm": 0.3510359823703766,
      "learning_rate": 4.18e-05,
      "loss": 0.058,
      "step": 1640
    },
    {
      "epoch": 0.165,
      "grad_norm": 0.41047966480255127,
      "learning_rate": 4.175e-05,
      "loss": 0.065,
      "step": 1650
    },
    {
      "epoch": 0.166,
      "grad_norm": 0.3054174482822418,
      "learning_rate": 4.17e-05,
      "loss": 0.0564,
      "step": 1660
    },
    {
      "epoch": 0.167,
      "grad_norm": 0.29319772124290466,
      "learning_rate": 4.165e-05,
      "loss": 0.0599,
      "step": 1670
    },
    {
      "epoch": 0.168,
      "grad_norm": 0.257354736328125,
      "learning_rate": 4.16e-05,
      "loss": 0.0536,
      "step": 1680
    },
    {
      "epoch": 0.169,
      "grad_norm": 0.25215694308280945,
      "learning_rate": 4.155e-05,
      "loss": 0.0587,
      "step": 1690
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.4573931097984314,
      "learning_rate": 4.15e-05,
      "loss": 0.0524,
      "step": 1700
    },
    {
      "epoch": 0.171,
      "grad_norm": 0.3514876663684845,
      "learning_rate": 4.145e-05,
      "loss": 0.0551,
      "step": 1710
    },
    {
      "epoch": 0.172,
      "grad_norm": 0.3239930272102356,
      "learning_rate": 4.14e-05,
      "loss": 0.0499,
      "step": 1720
    },
    {
      "epoch": 0.173,
      "grad_norm": 0.20213039219379425,
      "learning_rate": 4.135e-05,
      "loss": 0.0521,
      "step": 1730
    },
    {
      "epoch": 0.174,
      "grad_norm": 0.21831783652305603,
      "learning_rate": 4.13e-05,
      "loss": 0.0469,
      "step": 1740
    },
    {
      "epoch": 0.175,
      "grad_norm": 0.2585163712501526,
      "learning_rate": 4.125e-05,
      "loss": 0.0469,
      "step": 1750
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.21717113256454468,
      "learning_rate": 4.12e-05,
      "loss": 0.0455,
      "step": 1760
    },
    {
      "epoch": 0.177,
      "grad_norm": 0.27248838543891907,
      "learning_rate": 4.115e-05,
      "loss": 0.046,
      "step": 1770
    },
    {
      "epoch": 0.178,
      "grad_norm": 0.2503461241722107,
      "learning_rate": 4.11e-05,
      "loss": 0.0447,
      "step": 1780
    },
    {
      "epoch": 0.179,
      "grad_norm": 0.27404382824897766,
      "learning_rate": 4.105e-05,
      "loss": 0.0437,
      "step": 1790
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.23549066483974457,
      "learning_rate": 4.1e-05,
      "loss": 0.0423,
      "step": 1800
    },
    {
      "epoch": 0.181,
      "grad_norm": 0.19369937479496002,
      "learning_rate": 4.095e-05,
      "loss": 0.0408,
      "step": 1810
    },
    {
      "epoch": 0.182,
      "grad_norm": 0.20560242235660553,
      "learning_rate": 4.09e-05,
      "loss": 0.0379,
      "step": 1820
    },
    {
      "epoch": 0.183,
      "grad_norm": 0.34989863634109497,
      "learning_rate": 4.085e-05,
      "loss": 0.0364,
      "step": 1830
    },
    {
      "epoch": 0.184,
      "grad_norm": 0.2310326248407364,
      "learning_rate": 4.08e-05,
      "loss": 0.0385,
      "step": 1840
    },
    {
      "epoch": 0.185,
      "grad_norm": 0.21055462956428528,
      "learning_rate": 4.075e-05,
      "loss": 0.0351,
      "step": 1850
    },
    {
      "epoch": 0.186,
      "grad_norm": 0.3251895308494568,
      "learning_rate": 4.07e-05,
      "loss": 0.0381,
      "step": 1860
    },
    {
      "epoch": 0.187,
      "grad_norm": 0.2887445390224457,
      "learning_rate": 4.065e-05,
      "loss": 0.0341,
      "step": 1870
    },
    {
      "epoch": 0.188,
      "grad_norm": 0.15948843955993652,
      "learning_rate": 4.0600000000000004e-05,
      "loss": 0.0313,
      "step": 1880
    },
    {
      "epoch": 0.189,
      "grad_norm": 0.2413359135389328,
      "learning_rate": 4.055e-05,
      "loss": 0.0338,
      "step": 1890
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.2132706493139267,
      "learning_rate": 4.05e-05,
      "loss": 0.0339,
      "step": 1900
    },
    {
      "epoch": 0.191,
      "grad_norm": 0.17968431115150452,
      "learning_rate": 4.045000000000001e-05,
      "loss": 0.0317,
      "step": 1910
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.15828929841518402,
      "learning_rate": 4.0400000000000006e-05,
      "loss": 0.0302,
      "step": 1920
    },
    {
      "epoch": 0.193,
      "grad_norm": 0.18106874823570251,
      "learning_rate": 4.0350000000000005e-05,
      "loss": 0.0331,
      "step": 1930
    },
    {
      "epoch": 0.194,
      "grad_norm": 0.34827324748039246,
      "learning_rate": 4.0300000000000004e-05,
      "loss": 0.032,
      "step": 1940
    },
    {
      "epoch": 0.195,
      "grad_norm": 0.21621111035346985,
      "learning_rate": 4.025e-05,
      "loss": 0.0317,
      "step": 1950
    },
    {
      "epoch": 0.196,
      "grad_norm": 0.2159423679113388,
      "learning_rate": 4.02e-05,
      "loss": 0.0296,
      "step": 1960
    },
    {
      "epoch": 0.197,
      "grad_norm": 0.17750391364097595,
      "learning_rate": 4.015000000000001e-05,
      "loss": 0.0297,
      "step": 1970
    },
    {
      "epoch": 0.198,
      "grad_norm": 0.13952311873435974,
      "learning_rate": 4.0100000000000006e-05,
      "loss": 0.0279,
      "step": 1980
    },
    {
      "epoch": 0.199,
      "grad_norm": 0.19622887670993805,
      "learning_rate": 4.0050000000000004e-05,
      "loss": 0.0278,
      "step": 1990
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.14959514141082764,
      "learning_rate": 4e-05,
      "loss": 0.0251,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 10000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9223372036854775807,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.3924406673408e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}