|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.4, |
|
"eval_steps": 2000, |
|
"global_step": 8000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 20.5, |
|
"learning_rate": 2.9999999999999997e-06, |
|
"loss": 1.7805, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 19.625, |
|
"learning_rate": 5.999999999999999e-06, |
|
"loss": 1.8292, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 16.875, |
|
"learning_rate": 8.999999999999999e-06, |
|
"loss": 2.0677, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 17.875, |
|
"learning_rate": 1.1999999999999999e-05, |
|
"loss": 1.6027, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 21.625, |
|
"learning_rate": 1.4999999999999999e-05, |
|
"loss": 1.7499, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 21.125, |
|
"learning_rate": 1.7999999999999997e-05, |
|
"loss": 2.0222, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 22.75, |
|
"learning_rate": 2.1e-05, |
|
"loss": 1.4821, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 25.75, |
|
"learning_rate": 2.3999999999999997e-05, |
|
"loss": 1.7637, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 17.25, |
|
"learning_rate": 2.6999999999999996e-05, |
|
"loss": 1.6784, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 11.9375, |
|
"learning_rate": 2.9999999999999997e-05, |
|
"loss": 1.6543, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 25.875, |
|
"learning_rate": 3.2999999999999996e-05, |
|
"loss": 1.7161, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 38.5, |
|
"learning_rate": 3.5999999999999994e-05, |
|
"loss": 1.8143, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 19.375, |
|
"learning_rate": 3.9e-05, |
|
"loss": 1.5947, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 16.5, |
|
"learning_rate": 4.2e-05, |
|
"loss": 2.1477, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 18.875, |
|
"learning_rate": 4.4999999999999996e-05, |
|
"loss": 1.8095, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 17.875, |
|
"learning_rate": 4.7999999999999994e-05, |
|
"loss": 1.7693, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 5.1e-05, |
|
"loss": 1.9624, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 11.4375, |
|
"learning_rate": 5.399999999999999e-05, |
|
"loss": 1.7321, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 18.125, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 1.7955, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 15.9375, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 1.7011, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 6.299999999999999e-05, |
|
"loss": 1.5647, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 12.6875, |
|
"learning_rate": 6.599999999999999e-05, |
|
"loss": 1.8383, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 16.125, |
|
"learning_rate": 6.9e-05, |
|
"loss": 1.9608, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 12.5, |
|
"learning_rate": 7.199999999999999e-05, |
|
"loss": 1.4068, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 11.0, |
|
"learning_rate": 7.5e-05, |
|
"loss": 1.9903, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 7.8e-05, |
|
"loss": 2.0868, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 14.8125, |
|
"learning_rate": 8.1e-05, |
|
"loss": 2.0174, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 8.4e-05, |
|
"loss": 2.0084, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 10.375, |
|
"learning_rate": 8.699999999999999e-05, |
|
"loss": 1.9688, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 10.625, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 1.9387, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 10.875, |
|
"learning_rate": 9.3e-05, |
|
"loss": 2.1617, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 10.5625, |
|
"learning_rate": 9.599999999999999e-05, |
|
"loss": 1.9997, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 9.9e-05, |
|
"loss": 1.5052, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 10.4375, |
|
"learning_rate": 0.000102, |
|
"loss": 2.1021, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 0.00010499999999999999, |
|
"loss": 1.8825, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 13.625, |
|
"learning_rate": 0.00010799999999999998, |
|
"loss": 1.9024, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 18.125, |
|
"learning_rate": 0.00011099999999999999, |
|
"loss": 2.2218, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 12.6875, |
|
"learning_rate": 0.00011399999999999999, |
|
"loss": 1.9759, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 10.5625, |
|
"learning_rate": 0.000117, |
|
"loss": 1.9719, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 0.00011999999999999999, |
|
"loss": 1.7527, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 8.5, |
|
"learning_rate": 0.00012299999999999998, |
|
"loss": 2.3111, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 7.9375, |
|
"learning_rate": 0.00012599999999999997, |
|
"loss": 2.185, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 8.375, |
|
"learning_rate": 0.000129, |
|
"loss": 2.0823, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 9.5, |
|
"learning_rate": 0.00013199999999999998, |
|
"loss": 2.155, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 0.000135, |
|
"loss": 2.1027, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 7.75, |
|
"learning_rate": 0.000138, |
|
"loss": 2.3035, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 7.875, |
|
"learning_rate": 0.00014099999999999998, |
|
"loss": 1.9003, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 9.125, |
|
"learning_rate": 0.00014399999999999998, |
|
"loss": 2.4984, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 8.0, |
|
"learning_rate": 0.000147, |
|
"loss": 2.1406, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 8.625, |
|
"learning_rate": 0.00015, |
|
"loss": 2.0724, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 0.00014984210526315788, |
|
"loss": 2.2684, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 0.00014968421052631578, |
|
"loss": 2.0504, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 8.125, |
|
"learning_rate": 0.00014952631578947368, |
|
"loss": 2.3278, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 0.00014936842105263157, |
|
"loss": 2.1526, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.90625, |
|
"learning_rate": 0.00014921052631578947, |
|
"loss": 2.3046, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.625, |
|
"learning_rate": 0.00014905263157894737, |
|
"loss": 2.1977, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 6.25, |
|
"learning_rate": 0.00014889473684210526, |
|
"loss": 2.0804, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.65625, |
|
"learning_rate": 0.00014873684210526313, |
|
"loss": 1.9854, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 0.00014857894736842103, |
|
"loss": 2.319, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.53125, |
|
"learning_rate": 0.00014842105263157893, |
|
"loss": 2.5894, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.625, |
|
"learning_rate": 0.00014826315789473682, |
|
"loss": 1.7258, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 0.00014810526315789472, |
|
"loss": 2.1868, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.125, |
|
"learning_rate": 0.00014794736842105262, |
|
"loss": 2.2646, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 0.0001477894736842105, |
|
"loss": 2.0835, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 0.0001476315789473684, |
|
"loss": 2.1409, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 0.0001474736842105263, |
|
"loss": 2.0042, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 0.0001473157894736842, |
|
"loss": 2.3385, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 0.0001471578947368421, |
|
"loss": 2.0918, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 0.000147, |
|
"loss": 1.9163, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 0.0001468421052631579, |
|
"loss": 2.2878, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 0.0001466842105263158, |
|
"loss": 2.0185, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 0.00014652631578947366, |
|
"loss": 2.1358, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 0.00014636842105263158, |
|
"loss": 2.4189, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 0.00014621052631578945, |
|
"loss": 1.9183, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 0.00014605263157894735, |
|
"loss": 2.2969, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.625, |
|
"learning_rate": 0.00014589473684210524, |
|
"loss": 2.2832, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.625, |
|
"learning_rate": 0.00014573684210526314, |
|
"loss": 2.1994, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 0.00014557894736842104, |
|
"loss": 2.0053, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 7.65625, |
|
"learning_rate": 0.00014542105263157893, |
|
"loss": 2.2771, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 0.00014526315789473683, |
|
"loss": 2.1748, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 0.00014510526315789473, |
|
"loss": 1.9644, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 0.00014494736842105262, |
|
"loss": 1.6369, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 0.00014478947368421052, |
|
"loss": 2.2513, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 0.00014463157894736842, |
|
"loss": 2.0433, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 0.00014447368421052631, |
|
"loss": 1.8816, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.75, |
|
"learning_rate": 0.00014431578947368418, |
|
"loss": 2.0584, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 0.0001441578947368421, |
|
"loss": 1.9437, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.625, |
|
"learning_rate": 0.00014399999999999998, |
|
"loss": 2.1637, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 0.00014384210526315787, |
|
"loss": 1.6931, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 0.00014368421052631577, |
|
"loss": 2.1308, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.75, |
|
"learning_rate": 0.00014352631578947367, |
|
"loss": 1.9802, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 0.00014336842105263156, |
|
"loss": 1.8052, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 0.00014321052631578946, |
|
"loss": 2.2819, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.25, |
|
"learning_rate": 0.00014305263157894736, |
|
"loss": 1.9152, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 0.00014289473684210525, |
|
"loss": 1.6635, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 0.00014273684210526315, |
|
"loss": 1.8765, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 0.00014257894736842105, |
|
"loss": 2.5049, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 0.00014242105263157894, |
|
"loss": 1.8173, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 0.00014226315789473684, |
|
"loss": 1.8379, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 7.40625, |
|
"learning_rate": 0.0001421052631578947, |
|
"loss": 2.0426, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 7.25, |
|
"learning_rate": 0.00014194736842105263, |
|
"loss": 2.1772, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 0.0001417894736842105, |
|
"loss": 1.947, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.125, |
|
"learning_rate": 0.00014163157894736843, |
|
"loss": 2.2714, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.125, |
|
"learning_rate": 0.0001414736842105263, |
|
"loss": 1.8783, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 0.0001413157894736842, |
|
"loss": 2.409, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 0.0001411578947368421, |
|
"loss": 2.0452, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.75, |
|
"learning_rate": 0.00014099999999999998, |
|
"loss": 1.9155, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 0.00014084210526315788, |
|
"loss": 1.9305, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 0.00014068421052631578, |
|
"loss": 2.0101, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 0.00014052631578947367, |
|
"loss": 2.1938, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 0.00014036842105263157, |
|
"loss": 1.8036, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 0.00014021052631578947, |
|
"loss": 1.8334, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 0.00014005263157894736, |
|
"loss": 1.7881, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 0.00013989473684210523, |
|
"loss": 1.9495, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 0.00013973684210526316, |
|
"loss": 2.2284, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 0.00013957894736842103, |
|
"loss": 2.3546, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 0.00013942105263157895, |
|
"loss": 2.1805, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 0.00013926315789473682, |
|
"loss": 2.1401, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 0.00013910526315789474, |
|
"loss": 1.9132, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 0.00013894736842105261, |
|
"loss": 1.9242, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 0.0001387894736842105, |
|
"loss": 2.3808, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.0, |
|
"learning_rate": 0.0001386315789473684, |
|
"loss": 1.8303, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 0.0001384736842105263, |
|
"loss": 2.1689, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 0.0001383157894736842, |
|
"loss": 2.1055, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 0.0001381578947368421, |
|
"loss": 2.1595, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 0.000138, |
|
"loss": 2.2264, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.25, |
|
"learning_rate": 0.0001378421052631579, |
|
"loss": 2.0981, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 0.00013768421052631576, |
|
"loss": 2.0902, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 6.125, |
|
"learning_rate": 0.00013752631578947368, |
|
"loss": 1.9157, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 0.00013736842105263155, |
|
"loss": 1.7149, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 0.00013721052631578948, |
|
"loss": 2.0319, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 0.00013705263157894735, |
|
"loss": 1.9478, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.0, |
|
"learning_rate": 0.00013689473684210527, |
|
"loss": 1.7866, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 0.00013673684210526314, |
|
"loss": 2.0339, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.125, |
|
"learning_rate": 0.00013657894736842104, |
|
"loss": 1.7241, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.25, |
|
"learning_rate": 0.00013642105263157893, |
|
"loss": 1.8559, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 0.00013626315789473683, |
|
"loss": 1.847, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 0.00013610526315789473, |
|
"loss": 1.8128, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 0.00013594736842105262, |
|
"loss": 2.0117, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 0.00013578947368421052, |
|
"loss": 2.319, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 10.5, |
|
"learning_rate": 0.00013563157894736842, |
|
"loss": 2.0572, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 0.0001354736842105263, |
|
"loss": 1.9633, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 0.0001353157894736842, |
|
"loss": 1.9709, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 0.00013515789473684208, |
|
"loss": 1.9295, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.125, |
|
"learning_rate": 0.000135, |
|
"loss": 1.7119, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 0.00013484210526315787, |
|
"loss": 1.8842, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 0.0001346842105263158, |
|
"loss": 1.8175, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 7.28125, |
|
"learning_rate": 0.00013452631578947366, |
|
"loss": 2.3633, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.875, |
|
"learning_rate": 0.0001343684210526316, |
|
"loss": 2.2177, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 0.00013421052631578946, |
|
"loss": 1.9366, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 0.00013405263157894735, |
|
"loss": 1.9658, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 0.00013389473684210525, |
|
"loss": 1.9321, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 0.00013373684210526315, |
|
"loss": 1.8851, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 0.00013357894736842104, |
|
"loss": 2.3404, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 0.00013342105263157894, |
|
"loss": 1.9765, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 0.00013326315789473684, |
|
"loss": 2.2503, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 0.00013310526315789473, |
|
"loss": 1.9346, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.75, |
|
"learning_rate": 0.0001329473684210526, |
|
"loss": 2.0895, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 0.00013278947368421053, |
|
"loss": 1.956, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 0.0001326315789473684, |
|
"loss": 1.8528, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 0.00013247368421052632, |
|
"loss": 1.9809, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 0.0001323157894736842, |
|
"loss": 1.8755, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 0.0001321578947368421, |
|
"loss": 1.9421, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 0.00013199999999999998, |
|
"loss": 1.8252, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 0.00013184210526315788, |
|
"loss": 2.0638, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 0.00013168421052631578, |
|
"loss": 1.9106, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 0.00013152631578947367, |
|
"loss": 1.7636, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 0.00013136842105263157, |
|
"loss": 1.7, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 0.00013121052631578947, |
|
"loss": 1.9783, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.75, |
|
"learning_rate": 0.00013105263157894736, |
|
"loss": 1.8911, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 0.00013089473684210526, |
|
"loss": 1.6132, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 0.00013073684210526313, |
|
"loss": 1.825, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 0.00013057894736842105, |
|
"loss": 1.8831, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 0.00013042105263157892, |
|
"loss": 2.0333, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 0.00013026315789473685, |
|
"loss": 1.5719, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 0.00013010526315789472, |
|
"loss": 1.6725, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 6.625, |
|
"learning_rate": 0.00012994736842105264, |
|
"loss": 1.5617, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 0.0001297894736842105, |
|
"loss": 1.969, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 0.0001296315789473684, |
|
"loss": 1.9816, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 0.0001294736842105263, |
|
"loss": 1.8061, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 0.0001293157894736842, |
|
"loss": 1.796, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 0.0001291578947368421, |
|
"loss": 1.9946, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.0, |
|
"learning_rate": 0.000129, |
|
"loss": 1.9254, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 0.0001288421052631579, |
|
"loss": 1.9615, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.25, |
|
"learning_rate": 0.00012868421052631578, |
|
"loss": 1.9225, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 0.00012852631578947365, |
|
"loss": 1.7678, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 0.00012836842105263158, |
|
"loss": 2.0424, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.125, |
|
"learning_rate": 0.00012821052631578945, |
|
"loss": 1.6981, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 0.00012805263157894737, |
|
"loss": 2.013, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 0.00012789473684210524, |
|
"loss": 1.7872, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 0.00012773684210526316, |
|
"loss": 1.7634, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 0.00012757894736842103, |
|
"loss": 1.4218, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 5.125, |
|
"learning_rate": 0.00012742105263157893, |
|
"loss": 1.9503, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 0.00012726315789473683, |
|
"loss": 1.6645, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 0.00012710526315789472, |
|
"loss": 1.6245, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 0.00012694736842105262, |
|
"loss": 1.934, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 0.00012678947368421052, |
|
"loss": 1.696, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 0.0001266315789473684, |
|
"loss": 1.5946, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 0.0001264736842105263, |
|
"loss": 1.6951, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 0.00012631578947368418, |
|
"loss": 2.1469, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 1.864747405052185, |
|
"eval_runtime": 30.7527, |
|
"eval_samples_per_second": 32.517, |
|
"eval_steps_per_second": 32.517, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 0.0001261578947368421, |
|
"loss": 1.7456, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 0.00012599999999999997, |
|
"loss": 1.9554, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 0.0001258421052631579, |
|
"loss": 1.8303, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.375, |
|
"learning_rate": 0.00012568421052631577, |
|
"loss": 1.7321, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 0.0001255263157894737, |
|
"loss": 2.0467, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.625, |
|
"learning_rate": 0.00012536842105263156, |
|
"loss": 1.5569, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 0.00012521052631578946, |
|
"loss": 1.6945, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.25, |
|
"learning_rate": 0.00012505263157894735, |
|
"loss": 1.9183, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 0.00012489473684210525, |
|
"loss": 1.8488, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 0.00012473684210526315, |
|
"loss": 1.7148, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.625, |
|
"learning_rate": 0.00012457894736842104, |
|
"loss": 1.9033, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 0.00012442105263157894, |
|
"loss": 1.4302, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 0.00012426315789473684, |
|
"loss": 1.7065, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 0.00012410526315789473, |
|
"loss": 1.9168, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 0.00012394736842105263, |
|
"loss": 1.993, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.875, |
|
"learning_rate": 0.0001237894736842105, |
|
"loss": 1.7291, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 0.00012363157894736842, |
|
"loss": 1.6801, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 0.0001234736842105263, |
|
"loss": 1.4205, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 0.00012331578947368421, |
|
"loss": 1.8141, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.75, |
|
"learning_rate": 0.00012315789473684208, |
|
"loss": 1.9035, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 0.00012299999999999998, |
|
"loss": 1.846, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 0.00012284210526315788, |
|
"loss": 1.9915, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 0.00012268421052631577, |
|
"loss": 1.9927, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 0.00012252631578947367, |
|
"loss": 1.5576, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 0.00012236842105263157, |
|
"loss": 1.7842, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 0.00012221052631578946, |
|
"loss": 1.6442, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.625, |
|
"learning_rate": 0.00012205263157894736, |
|
"loss": 2.0708, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 0.00012189473684210524, |
|
"loss": 1.7107, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.0, |
|
"learning_rate": 0.00012173684210526315, |
|
"loss": 1.6121, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 0.00012157894736842104, |
|
"loss": 1.7322, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 0.00012142105263157895, |
|
"loss": 1.9742, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 0.00012126315789473683, |
|
"loss": 2.2318, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 0.00012110526315789473, |
|
"loss": 2.1988, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 0.00012094736842105262, |
|
"loss": 1.7186, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.375, |
|
"learning_rate": 0.0001207894736842105, |
|
"loss": 1.7444, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.875, |
|
"learning_rate": 0.00012063157894736842, |
|
"loss": 1.6583, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 0.0001204736842105263, |
|
"loss": 1.6891, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.875, |
|
"learning_rate": 0.00012031578947368421, |
|
"loss": 1.5208, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 0.00012015789473684209, |
|
"loss": 1.9399, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.0, |
|
"learning_rate": 0.00011999999999999999, |
|
"loss": 1.9137, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 0.00011984210526315789, |
|
"loss": 1.8328, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 0.00011968421052631577, |
|
"loss": 1.7356, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.875, |
|
"learning_rate": 0.00011952631578947368, |
|
"loss": 2.0099, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 0.00011936842105263156, |
|
"loss": 1.8541, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 0.00011921052631578947, |
|
"loss": 1.6947, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.0, |
|
"learning_rate": 0.00011905263157894736, |
|
"loss": 1.7433, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 0.00011889473684210527, |
|
"loss": 1.6962, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 0.00011873684210526315, |
|
"loss": 2.0064, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 0.00011857894736842103, |
|
"loss": 1.8146, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.25, |
|
"learning_rate": 0.00011842105263157894, |
|
"loss": 1.6956, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.5, |
|
"learning_rate": 0.00011826315789473683, |
|
"loss": 1.7244, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 0.00011810526315789474, |
|
"loss": 1.7706, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 6.375, |
|
"learning_rate": 0.00011794736842105262, |
|
"loss": 1.7227, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 0.00011778947368421053, |
|
"loss": 1.8638, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 0.00011763157894736841, |
|
"loss": 1.8712, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 0.0001174736842105263, |
|
"loss": 1.8399, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 0.0001173157894736842, |
|
"loss": 1.7568, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 0.00011715789473684209, |
|
"loss": 1.4781, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 5.5, |
|
"learning_rate": 0.000117, |
|
"loss": 1.6916, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 0.00011684210526315788, |
|
"loss": 1.8282, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 0.00011668421052631579, |
|
"loss": 1.7929, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.75, |
|
"learning_rate": 0.00011652631578947367, |
|
"loss": 1.6475, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 0.00011636842105263156, |
|
"loss": 1.6647, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 0.00011621052631578947, |
|
"loss": 1.5357, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 5.125, |
|
"learning_rate": 0.00011605263157894735, |
|
"loss": 1.8162, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 6.125, |
|
"learning_rate": 0.00011589473684210526, |
|
"loss": 1.7852, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 0.00011573684210526314, |
|
"loss": 1.4552, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 0.00011557894736842105, |
|
"loss": 1.7193, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 0.00011542105263157894, |
|
"loss": 1.6664, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.0, |
|
"learning_rate": 0.00011526315789473682, |
|
"loss": 1.7187, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 0.00011510526315789473, |
|
"loss": 1.7427, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 0.00011494736842105261, |
|
"loss": 1.9004, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 0.00011478947368421052, |
|
"loss": 2.0091, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 0.0001146315789473684, |
|
"loss": 1.7475, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 0.00011447368421052632, |
|
"loss": 1.4338, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.375, |
|
"learning_rate": 0.0001143157894736842, |
|
"loss": 1.7836, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 0.00011415789473684208, |
|
"loss": 1.6579, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 0.00011399999999999999, |
|
"loss": 1.9117, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.5, |
|
"learning_rate": 0.00011384210526315788, |
|
"loss": 1.9963, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 0.00011368421052631579, |
|
"loss": 1.2637, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 0.00011352631578947367, |
|
"loss": 1.7052, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.65625, |
|
"learning_rate": 0.00011336842105263158, |
|
"loss": 1.3585, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 0.00011321052631578946, |
|
"loss": 2.0309, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 0.00011305263157894735, |
|
"loss": 1.527, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 0.00011289473684210526, |
|
"loss": 1.7948, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 0.00011273684210526314, |
|
"loss": 1.8825, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 0.00011257894736842105, |
|
"loss": 1.7341, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 0.00011242105263157893, |
|
"loss": 1.9529, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 0.00011226315789473684, |
|
"loss": 1.6345, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 6.5, |
|
"learning_rate": 0.00011210526315789472, |
|
"loss": 1.6036, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.125, |
|
"learning_rate": 0.00011194736842105264, |
|
"loss": 1.7851, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.625, |
|
"learning_rate": 0.00011178947368421052, |
|
"loss": 1.9272, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 0.0001116315789473684, |
|
"loss": 1.9839, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 0.00011147368421052631, |
|
"loss": 1.5799, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 0.0001113157894736842, |
|
"loss": 1.599, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 7.1875, |
|
"learning_rate": 0.0001111578947368421, |
|
"loss": 1.8219, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 0.00011099999999999999, |
|
"loss": 1.5486, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 0.0001108421052631579, |
|
"loss": 1.419, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 0.00011068421052631578, |
|
"loss": 1.7484, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 0.00011052631578947366, |
|
"loss": 1.7116, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 0.00011036842105263157, |
|
"loss": 1.3924, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.125, |
|
"learning_rate": 0.00011021052631578946, |
|
"loss": 1.4203, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.25, |
|
"learning_rate": 0.00011005263157894737, |
|
"loss": 2.0427, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 0.00010989473684210525, |
|
"loss": 1.7216, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 0.00010973684210526316, |
|
"loss": 1.9249, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 0.00010957894736842104, |
|
"loss": 1.9464, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 0.00010942105263157893, |
|
"loss": 1.9662, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.5, |
|
"learning_rate": 0.00010926315789473684, |
|
"loss": 1.5929, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 0.00010910526315789472, |
|
"loss": 1.8971, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 0.00010894736842105263, |
|
"loss": 1.834, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 0.00010878947368421051, |
|
"loss": 1.8009, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 0.00010863157894736842, |
|
"loss": 1.7885, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.5, |
|
"learning_rate": 0.0001084736842105263, |
|
"loss": 1.3803, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 0.00010831578947368419, |
|
"loss": 1.6063, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 0.0001081578947368421, |
|
"loss": 1.6058, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 0.00010799999999999998, |
|
"loss": 1.7769, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 0.00010784210526315789, |
|
"loss": 1.8996, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.375, |
|
"learning_rate": 0.00010768421052631578, |
|
"loss": 1.5069, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 0.00010752631578947369, |
|
"loss": 1.4708, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 0.00010736842105263157, |
|
"loss": 1.7289, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.75, |
|
"learning_rate": 0.00010721052631578945, |
|
"loss": 1.509, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.375, |
|
"learning_rate": 0.00010705263157894736, |
|
"loss": 1.5922, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 0.00010689473684210525, |
|
"loss": 1.8839, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 0.00010673684210526316, |
|
"loss": 1.6906, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 0.00010657894736842104, |
|
"loss": 1.6712, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 0.00010642105263157895, |
|
"loss": 1.8901, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 0.00010626315789473683, |
|
"loss": 1.4667, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 0.00010610526315789471, |
|
"loss": 1.8619, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 0.00010594736842105262, |
|
"loss": 1.6954, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 0.00010578947368421051, |
|
"loss": 1.5175, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 0.00010563157894736842, |
|
"loss": 1.7939, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 0.0001054736842105263, |
|
"loss": 1.6237, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 0.00010531578947368421, |
|
"loss": 1.6149, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 0.0001051578947368421, |
|
"loss": 1.7133, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.5, |
|
"learning_rate": 0.00010499999999999999, |
|
"loss": 1.3957, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 0.00010484210526315789, |
|
"loss": 1.9061, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.25, |
|
"learning_rate": 0.00010468421052631577, |
|
"loss": 1.6927, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 0.00010452631578947368, |
|
"loss": 1.6389, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 0.00010436842105263156, |
|
"loss": 1.6523, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 0.00010421052631578947, |
|
"loss": 1.5598, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 0.00010405263157894736, |
|
"loss": 1.2804, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 0.00010389473684210525, |
|
"loss": 1.2989, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 0.00010373684210526315, |
|
"loss": 1.4779, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 0.00010357894736842103, |
|
"loss": 1.7402, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 0.00010342105263157894, |
|
"loss": 1.6356, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.625, |
|
"learning_rate": 0.00010326315789473683, |
|
"loss": 1.9104, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 8.75, |
|
"learning_rate": 0.00010310526315789474, |
|
"loss": 1.5426, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.375, |
|
"learning_rate": 0.00010294736842105262, |
|
"loss": 1.5519, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.125, |
|
"learning_rate": 0.00010278947368421052, |
|
"loss": 1.3926, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 0.00010263157894736841, |
|
"loss": 2.0501, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 0.0001024736842105263, |
|
"loss": 1.8807, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 0.0001023157894736842, |
|
"loss": 1.8071, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.0, |
|
"learning_rate": 0.00010215789473684209, |
|
"loss": 2.0167, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.5, |
|
"learning_rate": 0.000102, |
|
"loss": 1.8896, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 0.00010184210526315788, |
|
"loss": 1.8863, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 0.00010168421052631578, |
|
"loss": 1.7345, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.0, |
|
"learning_rate": 0.00010152631578947368, |
|
"loss": 1.4917, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 0.00010136842105263157, |
|
"loss": 1.6955, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 0.00010121052631578947, |
|
"loss": 2.1419, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 0.00010105263157894735, |
|
"loss": 1.7818, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 0.00010089473684210526, |
|
"loss": 1.8861, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.625, |
|
"learning_rate": 0.00010073684210526314, |
|
"loss": 1.8296, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.5, |
|
"learning_rate": 0.00010057894736842104, |
|
"loss": 1.4939, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 0.00010042105263157894, |
|
"loss": 1.6977, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 0.00010026315789473683, |
|
"loss": 1.6357, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.75, |
|
"learning_rate": 0.00010010526315789473, |
|
"loss": 1.4076, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 9.994736842105261e-05, |
|
"loss": 1.9151, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.25, |
|
"learning_rate": 9.978947368421052e-05, |
|
"loss": 1.5325, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 9.963157894736841e-05, |
|
"loss": 1.6342, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 9.94736842105263e-05, |
|
"loss": 1.3291, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 9.93157894736842e-05, |
|
"loss": 1.7364, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 9.91578947368421e-05, |
|
"loss": 1.4092, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 9.9e-05, |
|
"loss": 1.515, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.625, |
|
"learning_rate": 9.884210526315788e-05, |
|
"loss": 1.4621, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 9.868421052631579e-05, |
|
"loss": 1.7776, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 9.852631578947367e-05, |
|
"loss": 1.8841, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 9.836842105263157e-05, |
|
"loss": 2.119, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 9.821052631578946e-05, |
|
"loss": 1.4981, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 9.805263157894736e-05, |
|
"loss": 1.3002, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 9.789473684210526e-05, |
|
"loss": 1.5721, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 9.773684210526315e-05, |
|
"loss": 1.6575, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 9.757894736842105e-05, |
|
"loss": 1.9991, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 9.742105263157893e-05, |
|
"loss": 1.4024, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 9.726315789473683e-05, |
|
"loss": 1.7038, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 9.710526315789473e-05, |
|
"loss": 1.6067, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.875, |
|
"learning_rate": 9.694736842105262e-05, |
|
"loss": 1.6456, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 9.678947368421052e-05, |
|
"loss": 1.8511, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 9.663157894736842e-05, |
|
"loss": 1.7312, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 9.647368421052631e-05, |
|
"loss": 1.6937, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.125, |
|
"learning_rate": 9.63157894736842e-05, |
|
"loss": 1.2875, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 9.615789473684209e-05, |
|
"loss": 1.6645, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.875, |
|
"learning_rate": 9.599999999999999e-05, |
|
"loss": 1.8489, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 9.584210526315789e-05, |
|
"loss": 1.6001, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 9.568421052631578e-05, |
|
"loss": 1.546, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 9.552631578947368e-05, |
|
"loss": 1.2363, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.0, |
|
"learning_rate": 9.536842105263158e-05, |
|
"loss": 1.6655, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.0, |
|
"learning_rate": 9.521052631578946e-05, |
|
"loss": 1.5445, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.625, |
|
"learning_rate": 9.505263157894735e-05, |
|
"loss": 1.4026, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 9.489473684210525e-05, |
|
"loss": 1.6248, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 9.473684210526315e-05, |
|
"loss": 1.9961, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 1.6376639604568481, |
|
"eval_runtime": 30.5452, |
|
"eval_samples_per_second": 32.738, |
|
"eval_steps_per_second": 32.738, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 9.457894736842104e-05, |
|
"loss": 1.4331, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 9.442105263157894e-05, |
|
"loss": 1.5178, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 9.426315789473684e-05, |
|
"loss": 1.7301, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.75, |
|
"learning_rate": 9.410526315789473e-05, |
|
"loss": 1.5035, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 9.394736842105262e-05, |
|
"loss": 1.5749, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 9.378947368421051e-05, |
|
"loss": 1.7267, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.25, |
|
"learning_rate": 9.363157894736841e-05, |
|
"loss": 1.872, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 9.347368421052631e-05, |
|
"loss": 1.991, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.75, |
|
"learning_rate": 9.33157894736842e-05, |
|
"loss": 1.4165, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 9.31578947368421e-05, |
|
"loss": 1.5004, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 9.3e-05, |
|
"loss": 1.4191, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 7.0, |
|
"learning_rate": 9.284210526315788e-05, |
|
"loss": 1.7576, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 9.268421052631578e-05, |
|
"loss": 1.5489, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 9.252631578947367e-05, |
|
"loss": 1.8809, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 9.236842105263157e-05, |
|
"loss": 1.8825, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.0, |
|
"learning_rate": 9.221052631578947e-05, |
|
"loss": 1.5439, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 9.205263157894736e-05, |
|
"loss": 1.5064, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 9.189473684210526e-05, |
|
"loss": 1.756, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 6.0, |
|
"learning_rate": 9.173684210526314e-05, |
|
"loss": 1.683, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 6.125, |
|
"learning_rate": 9.157894736842104e-05, |
|
"loss": 1.6729, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 7.71875, |
|
"learning_rate": 9.142105263157894e-05, |
|
"loss": 1.6747, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 9.126315789473683e-05, |
|
"loss": 1.4337, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.625, |
|
"learning_rate": 9.110526315789473e-05, |
|
"loss": 1.6269, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.65625, |
|
"learning_rate": 9.094736842105263e-05, |
|
"loss": 1.7594, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.75, |
|
"learning_rate": 9.078947368421052e-05, |
|
"loss": 1.6758, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.875, |
|
"learning_rate": 9.06315789473684e-05, |
|
"loss": 1.6468, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.625, |
|
"learning_rate": 9.047368421052632e-05, |
|
"loss": 1.9438, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 9.03157894736842e-05, |
|
"loss": 1.3541, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 9.01578947368421e-05, |
|
"loss": 1.5609, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 1.846, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 8.984210526315789e-05, |
|
"loss": 1.6363, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.875, |
|
"learning_rate": 8.968421052631579e-05, |
|
"loss": 1.7652, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 8.952631578947367e-05, |
|
"loss": 1.3733, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.0, |
|
"learning_rate": 8.936842105263158e-05, |
|
"loss": 1.4821, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 8.921052631578946e-05, |
|
"loss": 1.6647, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 8.905263157894736e-05, |
|
"loss": 1.5257, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.75, |
|
"learning_rate": 8.889473684210525e-05, |
|
"loss": 1.4503, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 8.873684210526315e-05, |
|
"loss": 1.7228, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 8.857894736842105e-05, |
|
"loss": 1.4673, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 8.842105263157893e-05, |
|
"loss": 1.7307, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.25, |
|
"learning_rate": 8.826315789473684e-05, |
|
"loss": 1.6506, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.125, |
|
"learning_rate": 8.810526315789472e-05, |
|
"loss": 1.8352, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 8.794736842105262e-05, |
|
"loss": 1.8344, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 8.778947368421052e-05, |
|
"loss": 1.5421, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.875, |
|
"learning_rate": 8.763157894736841e-05, |
|
"loss": 1.7763, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 8.747368421052631e-05, |
|
"loss": 1.3265, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.5, |
|
"learning_rate": 8.731578947368421e-05, |
|
"loss": 1.4655, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 8.71578947368421e-05, |
|
"loss": 1.4654, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 10.75, |
|
"learning_rate": 8.699999999999999e-05, |
|
"loss": 1.4957, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 8.68421052631579e-05, |
|
"loss": 1.4691, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 8.668421052631578e-05, |
|
"loss": 1.4734, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 8.652631578947368e-05, |
|
"loss": 1.5839, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.25, |
|
"learning_rate": 8.636842105263157e-05, |
|
"loss": 1.4329, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 8.621052631578947e-05, |
|
"loss": 1.6005, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 8.605263157894737e-05, |
|
"loss": 1.6974, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.875, |
|
"learning_rate": 8.589473684210525e-05, |
|
"loss": 1.633, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 8.573684210526316e-05, |
|
"loss": 1.6251, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 8.557894736842104e-05, |
|
"loss": 1.3481, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 8.542105263157894e-05, |
|
"loss": 1.0885, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 8.526315789473684e-05, |
|
"loss": 1.6525, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 8.510526315789473e-05, |
|
"loss": 1.7211, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 8.494736842105263e-05, |
|
"loss": 2.0992, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 8.478947368421051e-05, |
|
"loss": 1.8482, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 8.463157894736842e-05, |
|
"loss": 1.2918, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 8.44736842105263e-05, |
|
"loss": 1.7271, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 8.43157894736842e-05, |
|
"loss": 1.6719, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 8.41578947368421e-05, |
|
"loss": 1.305, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.7411, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.0, |
|
"learning_rate": 8.384210526315789e-05, |
|
"loss": 1.5826, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 8.368421052631578e-05, |
|
"loss": 1.687, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 8.352631578947369e-05, |
|
"loss": 1.4675, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 8.336842105263157e-05, |
|
"loss": 1.6521, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 8.321052631578948e-05, |
|
"loss": 1.7566, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 8.305263157894736e-05, |
|
"loss": 1.6841, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 8.289473684210526e-05, |
|
"loss": 1.6672, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 8.273684210526315e-05, |
|
"loss": 1.5411, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 8.257894736842104e-05, |
|
"loss": 1.721, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 8.242105263157895e-05, |
|
"loss": 1.7061, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 8.226315789473683e-05, |
|
"loss": 1.5112, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.75, |
|
"learning_rate": 8.210526315789474e-05, |
|
"loss": 1.3537, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.125, |
|
"learning_rate": 8.194736842105262e-05, |
|
"loss": 1.3723, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 8.178947368421052e-05, |
|
"loss": 1.9272, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 13.3125, |
|
"learning_rate": 8.163157894736842e-05, |
|
"loss": 1.3583, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 10.25, |
|
"learning_rate": 8.14736842105263e-05, |
|
"loss": 1.7233, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 8.131578947368421e-05, |
|
"loss": 1.6048, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 8.11578947368421e-05, |
|
"loss": 1.7952, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 8.1e-05, |
|
"loss": 1.6838, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 8.084210526315789e-05, |
|
"loss": 1.9311, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 8.068421052631578e-05, |
|
"loss": 1.5036, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 8.052631578947368e-05, |
|
"loss": 1.55, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 8.036842105263156e-05, |
|
"loss": 1.5131, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 8.021052631578947e-05, |
|
"loss": 1.5671, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 8.005263157894736e-05, |
|
"loss": 1.6518, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 7.989473684210527e-05, |
|
"loss": 1.7936, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 7.973684210526315e-05, |
|
"loss": 1.5896, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 7.957894736842106e-05, |
|
"loss": 1.6038, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 7.942105263157894e-05, |
|
"loss": 1.4422, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.625, |
|
"learning_rate": 7.926315789473683e-05, |
|
"loss": 1.4781, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 7.910526315789474e-05, |
|
"loss": 1.554, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.25, |
|
"learning_rate": 7.894736842105262e-05, |
|
"loss": 1.4183, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 7.878947368421053e-05, |
|
"loss": 1.5773, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 7.863157894736841e-05, |
|
"loss": 1.7739, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 7.847368421052632e-05, |
|
"loss": 1.6497, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 7.83157894736842e-05, |
|
"loss": 1.6733, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 7.815789473684209e-05, |
|
"loss": 1.509, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 7.8e-05, |
|
"loss": 1.7451, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 7.784210526315788e-05, |
|
"loss": 1.1635, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 7.768421052631579e-05, |
|
"loss": 1.3425, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 7.752631578947367e-05, |
|
"loss": 1.6612, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 7.736842105263159e-05, |
|
"loss": 1.5136, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.375, |
|
"learning_rate": 7.721052631578947e-05, |
|
"loss": 1.4066, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 7.705263157894735e-05, |
|
"loss": 1.5721, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 5.75, |
|
"learning_rate": 7.689473684210526e-05, |
|
"loss": 1.7248, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 7.673684210526314e-05, |
|
"loss": 1.5728, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 7.657894736842105e-05, |
|
"loss": 1.6477, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 7.642105263157894e-05, |
|
"loss": 1.7276, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 7.626315789473685e-05, |
|
"loss": 1.5607, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 7.610526315789473e-05, |
|
"loss": 1.7054, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 7.594736842105261e-05, |
|
"loss": 1.5041, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 7.578947368421052e-05, |
|
"loss": 1.3939, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 7.563157894736841e-05, |
|
"loss": 1.3901, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 7.547368421052632e-05, |
|
"loss": 1.3635, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 5.5, |
|
"learning_rate": 7.53157894736842e-05, |
|
"loss": 1.6156, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 7.515789473684211e-05, |
|
"loss": 1.4224, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 7.5e-05, |
|
"loss": 1.5507, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.5, |
|
"learning_rate": 7.484210526315789e-05, |
|
"loss": 1.5214, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 7.468421052631579e-05, |
|
"loss": 1.4005, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 7.452631578947368e-05, |
|
"loss": 1.649, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.25, |
|
"learning_rate": 7.436842105263157e-05, |
|
"loss": 1.3222, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.875, |
|
"learning_rate": 7.421052631578946e-05, |
|
"loss": 1.4316, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 7.405263157894736e-05, |
|
"loss": 1.4701, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 7.389473684210526e-05, |
|
"loss": 1.8238, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.375, |
|
"learning_rate": 7.373684210526315e-05, |
|
"loss": 1.4409, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 5.5, |
|
"learning_rate": 7.357894736842105e-05, |
|
"loss": 1.4107, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.0, |
|
"learning_rate": 7.342105263157895e-05, |
|
"loss": 1.5908, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 7.326315789473683e-05, |
|
"loss": 1.1515, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 5.625, |
|
"learning_rate": 7.310526315789473e-05, |
|
"loss": 1.8387, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 7.34375, |
|
"learning_rate": 7.294736842105262e-05, |
|
"loss": 1.7581, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 7.278947368421052e-05, |
|
"loss": 1.8786, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 7.263157894736842e-05, |
|
"loss": 1.5486, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.375, |
|
"learning_rate": 7.247368421052631e-05, |
|
"loss": 1.6973, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 7.231578947368421e-05, |
|
"loss": 1.1965, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.375, |
|
"learning_rate": 7.215789473684209e-05, |
|
"loss": 1.4671, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 7.199999999999999e-05, |
|
"loss": 1.4568, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 7.184210526315788e-05, |
|
"loss": 1.4513, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 7.168421052631578e-05, |
|
"loss": 1.2652, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 5.125, |
|
"learning_rate": 7.152631578947368e-05, |
|
"loss": 1.3024, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 7.136842105263157e-05, |
|
"loss": 1.3969, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 7.121052631578947e-05, |
|
"loss": 1.5425, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 7.105263157894735e-05, |
|
"loss": 1.5634, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 7.089473684210525e-05, |
|
"loss": 1.3567, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.25, |
|
"learning_rate": 7.073684210526315e-05, |
|
"loss": 1.5746, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.875, |
|
"learning_rate": 7.057894736842104e-05, |
|
"loss": 1.6727, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 7.042105263157894e-05, |
|
"loss": 1.709, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 7.026315789473684e-05, |
|
"loss": 1.6377, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.75, |
|
"learning_rate": 7.010526315789473e-05, |
|
"loss": 1.368, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 11.0, |
|
"learning_rate": 6.994736842105262e-05, |
|
"loss": 1.1897, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 6.978947368421051e-05, |
|
"loss": 1.4087, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 6.963157894736841e-05, |
|
"loss": 1.4641, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 6.947368421052631e-05, |
|
"loss": 1.2938, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 6.93157894736842e-05, |
|
"loss": 1.2211, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 6.91578947368421e-05, |
|
"loss": 1.239, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 6.375, |
|
"learning_rate": 6.9e-05, |
|
"loss": 1.4557, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 6.884210526315788e-05, |
|
"loss": 1.5778, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 6.868421052631578e-05, |
|
"loss": 1.5556, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 6.852631578947367e-05, |
|
"loss": 1.2642, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 6.836842105263157e-05, |
|
"loss": 1.3846, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 6.821052631578947e-05, |
|
"loss": 1.6962, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 6.805263157894736e-05, |
|
"loss": 1.7847, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 6.789473684210526e-05, |
|
"loss": 1.618, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 6.773684210526316e-05, |
|
"loss": 1.2276, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.0, |
|
"learning_rate": 6.757894736842104e-05, |
|
"loss": 1.4253, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 6.742105263157894e-05, |
|
"loss": 1.5658, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 6.875, |
|
"learning_rate": 6.726315789473683e-05, |
|
"loss": 1.6033, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 6.710526315789473e-05, |
|
"loss": 1.3583, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 6.694736842105263e-05, |
|
"loss": 1.4659, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 6.678947368421052e-05, |
|
"loss": 1.7884, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 6.663157894736842e-05, |
|
"loss": 2.0443, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 6.64736842105263e-05, |
|
"loss": 1.7151, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 6.63157894736842e-05, |
|
"loss": 1.6927, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 6.61578947368421e-05, |
|
"loss": 1.4678, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 6.599999999999999e-05, |
|
"loss": 1.2995, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 6.584210526315789e-05, |
|
"loss": 1.503, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 6.568421052631578e-05, |
|
"loss": 1.3394, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 6.552631578947368e-05, |
|
"loss": 1.1829, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 6.536842105263156e-05, |
|
"loss": 1.1117, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 6.0, |
|
"learning_rate": 6.521052631578946e-05, |
|
"loss": 1.7312, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 6.505263157894736e-05, |
|
"loss": 1.459, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 6.489473684210525e-05, |
|
"loss": 1.179, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 6.473684210526315e-05, |
|
"loss": 1.4745, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 6.457894736842105e-05, |
|
"loss": 1.7963, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 6.442105263157894e-05, |
|
"loss": 1.23, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 6.426315789473683e-05, |
|
"loss": 1.3367, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 6.410526315789472e-05, |
|
"loss": 1.4904, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 4.375, |
|
"learning_rate": 6.394736842105262e-05, |
|
"loss": 1.2374, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 6.378947368421052e-05, |
|
"loss": 1.4946, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 6.363157894736841e-05, |
|
"loss": 1.4075, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 6.347368421052631e-05, |
|
"loss": 1.6144, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 6.33157894736842e-05, |
|
"loss": 1.2361, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 6.625, |
|
"learning_rate": 6.315789473684209e-05, |
|
"loss": 1.4764, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 1.4902899265289307, |
|
"eval_runtime": 30.9163, |
|
"eval_samples_per_second": 32.345, |
|
"eval_steps_per_second": 32.345, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 4.875, |
|
"learning_rate": 6.299999999999999e-05, |
|
"loss": 1.568, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 6.284210526315788e-05, |
|
"loss": 1.3994, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 6.268421052631578e-05, |
|
"loss": 1.5673, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 6.252631578947368e-05, |
|
"loss": 1.4264, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 6.236842105263157e-05, |
|
"loss": 1.6823, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 6.221052631578947e-05, |
|
"loss": 1.4544, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 6.205263157894737e-05, |
|
"loss": 1.4708, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.0, |
|
"learning_rate": 6.189473684210525e-05, |
|
"loss": 1.4171, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 6.173684210526315e-05, |
|
"loss": 1.8525, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 6.157894736842104e-05, |
|
"loss": 1.386, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 6.142105263157894e-05, |
|
"loss": 1.2856, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 6.126315789473684e-05, |
|
"loss": 1.4263, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 8.0, |
|
"learning_rate": 6.110526315789473e-05, |
|
"loss": 1.4666, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.875, |
|
"learning_rate": 6.094736842105262e-05, |
|
"loss": 1.3614, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 6.078947368421052e-05, |
|
"loss": 1.3097, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 6.0631578947368415e-05, |
|
"loss": 1.5042, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 6.047368421052631e-05, |
|
"loss": 1.0817, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 6.031578947368421e-05, |
|
"loss": 1.5722, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 6.0157894736842105e-05, |
|
"loss": 1.366, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 1.6359, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.125, |
|
"learning_rate": 5.9842105263157885e-05, |
|
"loss": 1.7341, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.625, |
|
"learning_rate": 5.968421052631578e-05, |
|
"loss": 1.7589, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 5.952631578947368e-05, |
|
"loss": 1.6357, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 7.25, |
|
"learning_rate": 5.9368421052631574e-05, |
|
"loss": 1.4806, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 5.921052631578947e-05, |
|
"loss": 1.5461, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 5.905263157894737e-05, |
|
"loss": 1.3267, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 5.8894736842105264e-05, |
|
"loss": 1.5908, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 5.873684210526315e-05, |
|
"loss": 1.5187, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.875, |
|
"learning_rate": 5.8578947368421044e-05, |
|
"loss": 1.5094, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 7.625, |
|
"learning_rate": 5.842105263157894e-05, |
|
"loss": 1.5578, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.65625, |
|
"learning_rate": 5.826315789473684e-05, |
|
"loss": 1.5853, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.25, |
|
"learning_rate": 5.8105263157894734e-05, |
|
"loss": 1.0628, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 5.794736842105263e-05, |
|
"loss": 1.6518, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.25, |
|
"learning_rate": 5.778947368421053e-05, |
|
"loss": 1.7742, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.625, |
|
"learning_rate": 5.763157894736841e-05, |
|
"loss": 1.3831, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 5.7473684210526307e-05, |
|
"loss": 1.2819, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 5.73157894736842e-05, |
|
"loss": 1.7622, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 5.71578947368421e-05, |
|
"loss": 1.3274, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 1.5017, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 5.684210526315789e-05, |
|
"loss": 1.5522, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 5.668421052631579e-05, |
|
"loss": 1.752, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 5.652631578947367e-05, |
|
"loss": 1.3857, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 5.636842105263157e-05, |
|
"loss": 1.4945, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 5.6210526315789466e-05, |
|
"loss": 1.5736, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 5.605263157894736e-05, |
|
"loss": 1.4432, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 9.625, |
|
"learning_rate": 5.589473684210526e-05, |
|
"loss": 1.499, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 5.5736842105263156e-05, |
|
"loss": 1.54, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 5.557894736842105e-05, |
|
"loss": 1.3533, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 5.542105263157895e-05, |
|
"loss": 1.335, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.875, |
|
"learning_rate": 5.526315789473683e-05, |
|
"loss": 1.498, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 5.510526315789473e-05, |
|
"loss": 1.37, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 5.4947368421052625e-05, |
|
"loss": 1.3711, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 5.478947368421052e-05, |
|
"loss": 1.3383, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 5.463157894736842e-05, |
|
"loss": 1.5205, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 5.4473684210526315e-05, |
|
"loss": 1.4338, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 5.431578947368421e-05, |
|
"loss": 1.4553, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 5.4157894736842095e-05, |
|
"loss": 1.5879, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 5.399999999999999e-05, |
|
"loss": 1.4582, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.0, |
|
"learning_rate": 5.384210526315789e-05, |
|
"loss": 1.3241, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 5.3684210526315784e-05, |
|
"loss": 1.7072, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 7.34375, |
|
"learning_rate": 5.352631578947368e-05, |
|
"loss": 1.5041, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 5.336842105263158e-05, |
|
"loss": 1.7467, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.375, |
|
"learning_rate": 5.3210526315789474e-05, |
|
"loss": 1.3781, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.0, |
|
"learning_rate": 5.305263157894736e-05, |
|
"loss": 1.3092, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 5.2894736842105254e-05, |
|
"loss": 1.2497, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 5.273684210526315e-05, |
|
"loss": 1.0076, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 5.257894736842105e-05, |
|
"loss": 1.4536, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 5.2421052631578944e-05, |
|
"loss": 1.3639, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 5.226315789473684e-05, |
|
"loss": 1.5545, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 5.210526315789474e-05, |
|
"loss": 1.2892, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 5.125, |
|
"learning_rate": 5.194736842105263e-05, |
|
"loss": 1.4664, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.875, |
|
"learning_rate": 5.1789473684210517e-05, |
|
"loss": 1.542, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 5.163157894736841e-05, |
|
"loss": 1.6154, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 5.147368421052631e-05, |
|
"loss": 1.289, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.75, |
|
"learning_rate": 5.1315789473684206e-05, |
|
"loss": 1.4106, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 5.625, |
|
"learning_rate": 5.11578947368421e-05, |
|
"loss": 1.4117, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.875, |
|
"learning_rate": 5.1e-05, |
|
"loss": 1.5164, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.25, |
|
"learning_rate": 5.084210526315789e-05, |
|
"loss": 1.3858, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 7.3125, |
|
"learning_rate": 5.0684210526315786e-05, |
|
"loss": 1.6531, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.125, |
|
"learning_rate": 5.0526315789473676e-05, |
|
"loss": 1.4197, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.625, |
|
"learning_rate": 5.036842105263157e-05, |
|
"loss": 1.6369, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 9.125, |
|
"learning_rate": 5.021052631578947e-05, |
|
"loss": 1.4371, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 5.25, |
|
"learning_rate": 5.0052631578947366e-05, |
|
"loss": 1.3158, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 5.25, |
|
"learning_rate": 4.989473684210526e-05, |
|
"loss": 1.421, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.75, |
|
"learning_rate": 4.973684210526315e-05, |
|
"loss": 1.3778, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 4.957894736842105e-05, |
|
"loss": 1.4383, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 4.942105263157894e-05, |
|
"loss": 1.4528, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 4.9263157894736835e-05, |
|
"loss": 1.4387, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.125, |
|
"learning_rate": 4.910526315789473e-05, |
|
"loss": 1.4136, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.375, |
|
"learning_rate": 4.894736842105263e-05, |
|
"loss": 1.5636, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 4.8789473684210525e-05, |
|
"loss": 1.3611, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 4.8631578947368415e-05, |
|
"loss": 1.6587, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 4.847368421052631e-05, |
|
"loss": 1.4203, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.0, |
|
"learning_rate": 4.831578947368421e-05, |
|
"loss": 1.3195, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 4.81578947368421e-05, |
|
"loss": 1.5523, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.0, |
|
"learning_rate": 4.7999999999999994e-05, |
|
"loss": 1.2051, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 5.625, |
|
"learning_rate": 4.784210526315789e-05, |
|
"loss": 1.3228, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 4.768421052631579e-05, |
|
"loss": 1.4439, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.875, |
|
"learning_rate": 4.752631578947368e-05, |
|
"loss": 1.2889, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 4.7368421052631574e-05, |
|
"loss": 1.6213, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 4.721052631578947e-05, |
|
"loss": 1.2401, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.125, |
|
"learning_rate": 4.705263157894737e-05, |
|
"loss": 1.6266, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 4.689473684210526e-05, |
|
"loss": 1.3183, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 4.6736842105263154e-05, |
|
"loss": 1.4612, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 4.657894736842105e-05, |
|
"loss": 1.3345, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 4.642105263157894e-05, |
|
"loss": 1.3447, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 4.626315789473684e-05, |
|
"loss": 1.4236, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 4.610526315789473e-05, |
|
"loss": 0.9894, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 4.594736842105263e-05, |
|
"loss": 1.6438, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.125, |
|
"learning_rate": 4.578947368421052e-05, |
|
"loss": 1.432, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 4.5631578947368416e-05, |
|
"loss": 1.4258, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 4.547368421052631e-05, |
|
"loss": 1.2104, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 4.53157894736842e-05, |
|
"loss": 1.5062, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 4.51578947368421e-05, |
|
"loss": 1.1665, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.4999999999999996e-05, |
|
"loss": 1.4222, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 4.484210526315789e-05, |
|
"loss": 1.2456, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 4.468421052631579e-05, |
|
"loss": 1.6563, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 4.452631578947368e-05, |
|
"loss": 1.4191, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 5.125, |
|
"learning_rate": 4.4368421052631576e-05, |
|
"loss": 1.4417, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 4.4210526315789466e-05, |
|
"loss": 1.2115, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 6.0, |
|
"learning_rate": 4.405263157894736e-05, |
|
"loss": 1.5196, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 5.375, |
|
"learning_rate": 4.389473684210526e-05, |
|
"loss": 1.4436, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 4.3736842105263155e-05, |
|
"loss": 1.3976, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 4.357894736842105e-05, |
|
"loss": 1.5432, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 4.342105263157895e-05, |
|
"loss": 1.2969, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 4.326315789473684e-05, |
|
"loss": 1.3246, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 4.3105263157894735e-05, |
|
"loss": 1.3552, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 4.2947368421052625e-05, |
|
"loss": 1.1, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 6.25, |
|
"learning_rate": 4.278947368421052e-05, |
|
"loss": 1.449, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 8.5, |
|
"learning_rate": 4.263157894736842e-05, |
|
"loss": 1.572, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 4.2473684210526315e-05, |
|
"loss": 1.2444, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 5.0, |
|
"learning_rate": 4.231578947368421e-05, |
|
"loss": 1.1988, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 4.21578947368421e-05, |
|
"loss": 1.6043, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 4.2e-05, |
|
"loss": 1.2237, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 4.184210526315789e-05, |
|
"loss": 1.6211, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 4.1684210526315784e-05, |
|
"loss": 1.5567, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 7.375, |
|
"learning_rate": 4.152631578947368e-05, |
|
"loss": 1.3958, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 4.136842105263158e-05, |
|
"loss": 1.3257, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.1210526315789474e-05, |
|
"loss": 1.3897, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 5.0, |
|
"learning_rate": 4.105263157894737e-05, |
|
"loss": 1.3825, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 4.75, |
|
"learning_rate": 4.089473684210526e-05, |
|
"loss": 1.7153, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 4.073684210526315e-05, |
|
"loss": 1.3951, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 7.34375, |
|
"learning_rate": 4.057894736842105e-05, |
|
"loss": 1.7072, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 4.0421052631578943e-05, |
|
"loss": 1.3874, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 4.026315789473684e-05, |
|
"loss": 1.3483, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 4.0105263157894737e-05, |
|
"loss": 1.3684, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 3.994736842105263e-05, |
|
"loss": 1.1862, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 3.978947368421053e-05, |
|
"loss": 1.8852, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 5.5, |
|
"learning_rate": 3.963157894736841e-05, |
|
"loss": 1.5302, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 3.947368421052631e-05, |
|
"loss": 1.2026, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.5, |
|
"learning_rate": 3.9315789473684206e-05, |
|
"loss": 1.3831, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 3.91578947368421e-05, |
|
"loss": 1.4271, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 4.5, |
|
"learning_rate": 3.9e-05, |
|
"loss": 1.2867, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 3.8842105263157896e-05, |
|
"loss": 0.9484, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 8.875, |
|
"learning_rate": 3.868421052631579e-05, |
|
"loss": 1.4929, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 3.8526315789473676e-05, |
|
"loss": 1.4549, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.75, |
|
"learning_rate": 3.836842105263157e-05, |
|
"loss": 1.1515, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 3.821052631578947e-05, |
|
"loss": 1.3083, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 3.8052631578947365e-05, |
|
"loss": 1.1697, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 3.789473684210526e-05, |
|
"loss": 1.4502, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 3.773684210526316e-05, |
|
"loss": 1.4979, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 3.7578947368421055e-05, |
|
"loss": 1.4675, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 7.1875, |
|
"learning_rate": 3.7421052631578945e-05, |
|
"loss": 1.3028, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 3.726315789473684e-05, |
|
"loss": 1.337, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 3.710526315789473e-05, |
|
"loss": 1.5052, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 3.694736842105263e-05, |
|
"loss": 1.4187, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 3.6789473684210525e-05, |
|
"loss": 1.2143, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 3.6631578947368414e-05, |
|
"loss": 1.4787, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 3.647368421052631e-05, |
|
"loss": 1.5673, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 3.631578947368421e-05, |
|
"loss": 1.2965, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 3.6157894736842104e-05, |
|
"loss": 1.3391, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 3.5999999999999994e-05, |
|
"loss": 0.8979, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 6.25, |
|
"learning_rate": 3.584210526315789e-05, |
|
"loss": 1.3974, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.568421052631579e-05, |
|
"loss": 1.562, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 3.552631578947368e-05, |
|
"loss": 1.4908, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 3.5368421052631574e-05, |
|
"loss": 1.5491, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 3.521052631578947e-05, |
|
"loss": 1.5424, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 5.0, |
|
"learning_rate": 3.505263157894737e-05, |
|
"loss": 1.7824, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 3.489473684210526e-05, |
|
"loss": 1.5703, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 3.4736842105263153e-05, |
|
"loss": 1.079, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 5.5, |
|
"learning_rate": 3.457894736842105e-05, |
|
"loss": 1.438, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 3.442105263157894e-05, |
|
"loss": 1.5293, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 3.4263157894736836e-05, |
|
"loss": 1.2791, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 3.410526315789473e-05, |
|
"loss": 1.2439, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 3.394736842105263e-05, |
|
"loss": 1.1694, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 3.378947368421052e-05, |
|
"loss": 1.5024, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 3.3631578947368416e-05, |
|
"loss": 1.2139, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 3.347368421052631e-05, |
|
"loss": 1.5477, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 5.625, |
|
"learning_rate": 3.331578947368421e-05, |
|
"loss": 1.4138, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 3.31578947368421e-05, |
|
"loss": 1.2124, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 3.2999999999999996e-05, |
|
"loss": 1.3678, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 3.284210526315789e-05, |
|
"loss": 1.1943, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 10.5625, |
|
"learning_rate": 3.268421052631578e-05, |
|
"loss": 1.3115, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 7.5, |
|
"learning_rate": 3.252631578947368e-05, |
|
"loss": 1.3972, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 3.2368421052631575e-05, |
|
"loss": 1.1899, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 8.125, |
|
"learning_rate": 3.221052631578947e-05, |
|
"loss": 1.439, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 3.205263157894736e-05, |
|
"loss": 1.55, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 6.625, |
|
"learning_rate": 3.189473684210526e-05, |
|
"loss": 1.6196, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 3.1736842105263155e-05, |
|
"loss": 1.1113, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 3.1578947368421045e-05, |
|
"loss": 1.3067, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 1.3947522640228271, |
|
"eval_runtime": 30.8209, |
|
"eval_samples_per_second": 32.445, |
|
"eval_steps_per_second": 32.445, |
|
"step": 8000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 2000, |
|
"total_flos": 6.455688167424e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|