{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 828,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.036231884057971016,
      "grad_norm": 10.94329153798689,
      "learning_rate": 5e-06,
      "loss": 1.0543,
      "step": 10
    },
    {
      "epoch": 0.07246376811594203,
      "grad_norm": 6.69746195176836,
      "learning_rate": 5e-06,
      "loss": 0.9358,
      "step": 20
    },
    {
      "epoch": 0.10869565217391304,
      "grad_norm": 1.4693988083101803,
      "learning_rate": 5e-06,
      "loss": 0.8947,
      "step": 30
    },
    {
      "epoch": 0.14492753623188406,
      "grad_norm": 1.1956268436230149,
      "learning_rate": 5e-06,
      "loss": 0.8568,
      "step": 40
    },
    {
      "epoch": 0.18115942028985507,
      "grad_norm": 1.161881545591458,
      "learning_rate": 5e-06,
      "loss": 0.8308,
      "step": 50
    },
    {
      "epoch": 0.21739130434782608,
      "grad_norm": 1.0465718565340598,
      "learning_rate": 5e-06,
      "loss": 0.8168,
      "step": 60
    },
    {
      "epoch": 0.2536231884057971,
      "grad_norm": 0.8500018086045097,
      "learning_rate": 5e-06,
      "loss": 0.8027,
      "step": 70
    },
    {
      "epoch": 0.2898550724637681,
      "grad_norm": 0.7729963066093143,
      "learning_rate": 5e-06,
      "loss": 0.7872,
      "step": 80
    },
    {
      "epoch": 0.32608695652173914,
      "grad_norm": 0.9101222071301408,
      "learning_rate": 5e-06,
      "loss": 0.7819,
      "step": 90
    },
    {
      "epoch": 0.36231884057971014,
      "grad_norm": 1.0288861721148082,
      "learning_rate": 5e-06,
      "loss": 0.7754,
      "step": 100
    },
    {
      "epoch": 0.39855072463768115,
      "grad_norm": 0.8778862523158388,
      "learning_rate": 5e-06,
      "loss": 0.7745,
      "step": 110
    },
    {
      "epoch": 0.43478260869565216,
      "grad_norm": 1.0273017034313925,
      "learning_rate": 5e-06,
      "loss": 0.7723,
      "step": 120
    },
    {
      "epoch": 0.47101449275362317,
      "grad_norm": 1.1202477015356398,
      "learning_rate": 5e-06,
      "loss": 0.7689,
      "step": 130
    },
    {
      "epoch": 0.5072463768115942,
      "grad_norm": 1.1947838008092264,
      "learning_rate": 5e-06,
      "loss": 0.7588,
      "step": 140
    },
    {
      "epoch": 0.5434782608695652,
      "grad_norm": 0.9801347289517877,
      "learning_rate": 5e-06,
      "loss": 0.7641,
      "step": 150
    },
    {
      "epoch": 0.5797101449275363,
      "grad_norm": 0.6049825201268222,
      "learning_rate": 5e-06,
      "loss": 0.7579,
      "step": 160
    },
    {
      "epoch": 0.6159420289855072,
      "grad_norm": 0.7687560634971528,
      "learning_rate": 5e-06,
      "loss": 0.7595,
      "step": 170
    },
    {
      "epoch": 0.6521739130434783,
      "grad_norm": 0.7372634515268928,
      "learning_rate": 5e-06,
      "loss": 0.7536,
      "step": 180
    },
    {
      "epoch": 0.6884057971014492,
      "grad_norm": 0.7700572406910111,
      "learning_rate": 5e-06,
      "loss": 0.7531,
      "step": 190
    },
    {
      "epoch": 0.7246376811594203,
      "grad_norm": 0.6355090382849566,
      "learning_rate": 5e-06,
      "loss": 0.7521,
      "step": 200
    },
    {
      "epoch": 0.7608695652173914,
      "grad_norm": 0.6549502688413438,
      "learning_rate": 5e-06,
      "loss": 0.7503,
      "step": 210
    },
    {
      "epoch": 0.7971014492753623,
      "grad_norm": 0.582664220792735,
      "learning_rate": 5e-06,
      "loss": 0.7551,
      "step": 220
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 0.6407186366739495,
      "learning_rate": 5e-06,
      "loss": 0.7459,
      "step": 230
    },
    {
      "epoch": 0.8695652173913043,
      "grad_norm": 0.6331891386534093,
      "learning_rate": 5e-06,
      "loss": 0.744,
      "step": 240
    },
    {
      "epoch": 0.9057971014492754,
      "grad_norm": 0.7299118855941761,
      "learning_rate": 5e-06,
      "loss": 0.7374,
      "step": 250
    },
    {
      "epoch": 0.9420289855072463,
      "grad_norm": 0.567880243301988,
      "learning_rate": 5e-06,
      "loss": 0.7458,
      "step": 260
    },
    {
      "epoch": 0.9782608695652174,
      "grad_norm": 0.6907081447217029,
      "learning_rate": 5e-06,
      "loss": 0.7358,
      "step": 270
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.7438245415687561,
      "eval_runtime": 26.677,
      "eval_samples_per_second": 278.03,
      "eval_steps_per_second": 1.087,
      "step": 276
    },
    {
      "epoch": 1.0144927536231885,
      "grad_norm": 0.8940503757064469,
      "learning_rate": 5e-06,
      "loss": 0.7253,
      "step": 280
    },
    {
      "epoch": 1.0507246376811594,
      "grad_norm": 0.7439488715557626,
      "learning_rate": 5e-06,
      "loss": 0.6908,
      "step": 290
    },
    {
      "epoch": 1.0869565217391304,
      "grad_norm": 0.6352130108582714,
      "learning_rate": 5e-06,
      "loss": 0.6918,
      "step": 300
    },
    {
      "epoch": 1.1231884057971016,
      "grad_norm": 0.6093227815341429,
      "learning_rate": 5e-06,
      "loss": 0.6951,
      "step": 310
    },
    {
      "epoch": 1.1594202898550725,
      "grad_norm": 0.648343758224349,
      "learning_rate": 5e-06,
      "loss": 0.6853,
      "step": 320
    },
    {
      "epoch": 1.1956521739130435,
      "grad_norm": 0.7170626934482996,
      "learning_rate": 5e-06,
      "loss": 0.6948,
      "step": 330
    },
    {
      "epoch": 1.2318840579710144,
      "grad_norm": 0.669568534520701,
      "learning_rate": 5e-06,
      "loss": 0.6845,
      "step": 340
    },
    {
      "epoch": 1.2681159420289856,
      "grad_norm": 0.7720612077716418,
      "learning_rate": 5e-06,
      "loss": 0.6873,
      "step": 350
    },
    {
      "epoch": 1.3043478260869565,
      "grad_norm": 0.6321115893300935,
      "learning_rate": 5e-06,
      "loss": 0.6889,
      "step": 360
    },
    {
      "epoch": 1.3405797101449275,
      "grad_norm": 0.6385246244453152,
      "learning_rate": 5e-06,
      "loss": 0.6886,
      "step": 370
    },
    {
      "epoch": 1.3768115942028984,
      "grad_norm": 0.6322551870212395,
      "learning_rate": 5e-06,
      "loss": 0.6902,
      "step": 380
    },
    {
      "epoch": 1.4130434782608696,
      "grad_norm": 0.682130131174512,
      "learning_rate": 5e-06,
      "loss": 0.6881,
      "step": 390
    },
    {
      "epoch": 1.4492753623188406,
      "grad_norm": 0.6287023165159331,
      "learning_rate": 5e-06,
      "loss": 0.6874,
      "step": 400
    },
    {
      "epoch": 1.4855072463768115,
      "grad_norm": 0.6249521988426261,
      "learning_rate": 5e-06,
      "loss": 0.6917,
      "step": 410
    },
    {
      "epoch": 1.5217391304347827,
      "grad_norm": 0.6110574768057114,
      "learning_rate": 5e-06,
      "loss": 0.6875,
      "step": 420
    },
    {
      "epoch": 1.5579710144927537,
      "grad_norm": 0.771458385172311,
      "learning_rate": 5e-06,
      "loss": 0.6905,
      "step": 430
    },
    {
      "epoch": 1.5942028985507246,
      "grad_norm": 0.8776023781615215,
      "learning_rate": 5e-06,
      "loss": 0.6934,
      "step": 440
    },
    {
      "epoch": 1.6304347826086958,
      "grad_norm": 0.7459525790966258,
      "learning_rate": 5e-06,
      "loss": 0.6927,
      "step": 450
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.579140354002018,
      "learning_rate": 5e-06,
      "loss": 0.6857,
      "step": 460
    },
    {
      "epoch": 1.7028985507246377,
      "grad_norm": 0.7077586289894291,
      "learning_rate": 5e-06,
      "loss": 0.6914,
      "step": 470
    },
    {
      "epoch": 1.7391304347826086,
      "grad_norm": 0.5827410978723677,
      "learning_rate": 5e-06,
      "loss": 0.6861,
      "step": 480
    },
    {
      "epoch": 1.7753623188405796,
      "grad_norm": 0.697371946305589,
      "learning_rate": 5e-06,
      "loss": 0.6894,
      "step": 490
    },
    {
      "epoch": 1.8115942028985508,
      "grad_norm": 0.6086228815522553,
      "learning_rate": 5e-06,
      "loss": 0.6905,
      "step": 500
    },
    {
      "epoch": 1.8478260869565217,
      "grad_norm": 0.5925797270404988,
      "learning_rate": 5e-06,
      "loss": 0.6875,
      "step": 510
    },
    {
      "epoch": 1.8840579710144927,
      "grad_norm": 0.5774122986770631,
      "learning_rate": 5e-06,
      "loss": 0.6864,
      "step": 520
    },
    {
      "epoch": 1.9202898550724639,
      "grad_norm": 0.7752937398691316,
      "learning_rate": 5e-06,
      "loss": 0.6848,
      "step": 530
    },
    {
      "epoch": 1.9565217391304348,
      "grad_norm": 0.9039716311151348,
      "learning_rate": 5e-06,
      "loss": 0.6851,
      "step": 540
    },
    {
      "epoch": 1.9927536231884058,
      "grad_norm": 0.685112724319237,
      "learning_rate": 5e-06,
      "loss": 0.685,
      "step": 550
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.7335925102233887,
      "eval_runtime": 26.8139,
      "eval_samples_per_second": 276.611,
      "eval_steps_per_second": 1.082,
      "step": 552
    },
    {
      "epoch": 2.028985507246377,
      "grad_norm": 0.7665524028493372,
      "learning_rate": 5e-06,
      "loss": 0.6385,
      "step": 560
    },
    {
      "epoch": 2.0652173913043477,
      "grad_norm": 0.6043892824700962,
      "learning_rate": 5e-06,
      "loss": 0.6304,
      "step": 570
    },
    {
      "epoch": 2.101449275362319,
      "grad_norm": 0.7259867108096663,
      "learning_rate": 5e-06,
      "loss": 0.6298,
      "step": 580
    },
    {
      "epoch": 2.13768115942029,
      "grad_norm": 0.7510541142479693,
      "learning_rate": 5e-06,
      "loss": 0.635,
      "step": 590
    },
    {
      "epoch": 2.1739130434782608,
      "grad_norm": 0.7187894788401495,
      "learning_rate": 5e-06,
      "loss": 0.6341,
      "step": 600
    },
    {
      "epoch": 2.210144927536232,
      "grad_norm": 0.6750049579183571,
      "learning_rate": 5e-06,
      "loss": 0.6381,
      "step": 610
    },
    {
      "epoch": 2.246376811594203,
      "grad_norm": 0.8490350516471592,
      "learning_rate": 5e-06,
      "loss": 0.6324,
      "step": 620
    },
    {
      "epoch": 2.282608695652174,
      "grad_norm": 0.7651747850436946,
      "learning_rate": 5e-06,
      "loss": 0.6359,
      "step": 630
    },
    {
      "epoch": 2.318840579710145,
      "grad_norm": 0.8525868267751531,
      "learning_rate": 5e-06,
      "loss": 0.6327,
      "step": 640
    },
    {
      "epoch": 2.355072463768116,
      "grad_norm": 0.9041979799840293,
      "learning_rate": 5e-06,
      "loss": 0.6335,
      "step": 650
    },
    {
      "epoch": 2.391304347826087,
      "grad_norm": 0.6631449903138956,
      "learning_rate": 5e-06,
      "loss": 0.6333,
      "step": 660
    },
    {
      "epoch": 2.427536231884058,
      "grad_norm": 0.6077513953951755,
      "learning_rate": 5e-06,
      "loss": 0.6354,
      "step": 670
    },
    {
      "epoch": 2.463768115942029,
      "grad_norm": 0.7123161501999767,
      "learning_rate": 5e-06,
      "loss": 0.635,
      "step": 680
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.7517439764623557,
      "learning_rate": 5e-06,
      "loss": 0.6339,
      "step": 690
    },
    {
      "epoch": 2.536231884057971,
      "grad_norm": 0.782865335447992,
      "learning_rate": 5e-06,
      "loss": 0.6409,
      "step": 700
    },
    {
      "epoch": 2.572463768115942,
      "grad_norm": 0.8659272969336265,
      "learning_rate": 5e-06,
      "loss": 0.6364,
      "step": 710
    },
    {
      "epoch": 2.608695652173913,
      "grad_norm": 0.6169065680456968,
      "learning_rate": 5e-06,
      "loss": 0.6322,
      "step": 720
    },
    {
      "epoch": 2.644927536231884,
      "grad_norm": 0.637344472194744,
      "learning_rate": 5e-06,
      "loss": 0.6394,
      "step": 730
    },
    {
      "epoch": 2.681159420289855,
      "grad_norm": 0.723181305159317,
      "learning_rate": 5e-06,
      "loss": 0.6423,
      "step": 740
    },
    {
      "epoch": 2.717391304347826,
      "grad_norm": 0.6636365428175426,
      "learning_rate": 5e-06,
      "loss": 0.6364,
      "step": 750
    },
    {
      "epoch": 2.753623188405797,
      "grad_norm": 0.6833062497178735,
      "learning_rate": 5e-06,
      "loss": 0.6393,
      "step": 760
    },
    {
      "epoch": 2.789855072463768,
      "grad_norm": 0.6330454188928252,
      "learning_rate": 5e-06,
      "loss": 0.6373,
      "step": 770
    },
    {
      "epoch": 2.8260869565217392,
      "grad_norm": 0.6790151240239423,
      "learning_rate": 5e-06,
      "loss": 0.6375,
      "step": 780
    },
    {
      "epoch": 2.86231884057971,
      "grad_norm": 0.6977511814313268,
      "learning_rate": 5e-06,
      "loss": 0.6356,
      "step": 790
    },
    {
      "epoch": 2.898550724637681,
      "grad_norm": 0.5840071260671889,
      "learning_rate": 5e-06,
      "loss": 0.6407,
      "step": 800
    },
    {
      "epoch": 2.9347826086956523,
      "grad_norm": 0.6248935845380484,
      "learning_rate": 5e-06,
      "loss": 0.634,
      "step": 810
    },
    {
      "epoch": 2.971014492753623,
      "grad_norm": 0.6181053130332258,
      "learning_rate": 5e-06,
      "loss": 0.6412,
      "step": 820
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.7416301369667053,
      "eval_runtime": 26.232,
      "eval_samples_per_second": 282.746,
      "eval_steps_per_second": 1.106,
      "step": 828
    },
    {
      "epoch": 3.0,
      "step": 828,
      "total_flos": 1386930839224320.0,
      "train_loss": 0.7053444857758601,
      "train_runtime": 5350.6579,
      "train_samples_per_second": 79.003,
      "train_steps_per_second": 0.155
    }
  ],
  "logging_steps": 10,
  "max_steps": 828,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1386930839224320.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}