{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.001000500250125, "eval_steps": 25, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05002501250625312, "grad_norm": 0.517996072769165, "learning_rate": 0.0001951951951951952, "loss": 1.677, "step": 25 }, { "epoch": 0.05002501250625312, "eval_loss": 1.3813503980636597, "eval_runtime": 148.0614, "eval_samples_per_second": 3.37, "eval_steps_per_second": 0.425, "step": 25 }, { "epoch": 0.10005002501250625, "grad_norm": 0.5020231604576111, "learning_rate": 0.0001901901901901902, "loss": 1.2016, "step": 50 }, { "epoch": 0.10005002501250625, "eval_loss": 1.347744107246399, "eval_runtime": 151.7258, "eval_samples_per_second": 3.289, "eval_steps_per_second": 0.415, "step": 50 }, { "epoch": 0.1500750375187594, "grad_norm": 0.3798060119152069, "learning_rate": 0.0001851851851851852, "loss": 1.4491, "step": 75 }, { "epoch": 0.1500750375187594, "eval_loss": 1.3210723400115967, "eval_runtime": 150.0032, "eval_samples_per_second": 3.327, "eval_steps_per_second": 0.42, "step": 75 }, { "epoch": 0.2001000500250125, "grad_norm": 0.3365944027900696, "learning_rate": 0.00018018018018018018, "loss": 1.2076, "step": 100 }, { "epoch": 0.2001000500250125, "eval_loss": 1.3334178924560547, "eval_runtime": 151.2551, "eval_samples_per_second": 3.299, "eval_steps_per_second": 0.417, "step": 100 }, { "epoch": 0.25012506253126565, "grad_norm": 0.22820694744586945, "learning_rate": 0.0001751751751751752, "loss": 1.4415, "step": 125 }, { "epoch": 0.25012506253126565, "eval_loss": 1.309592366218567, "eval_runtime": 149.299, "eval_samples_per_second": 3.342, "eval_steps_per_second": 0.422, "step": 125 }, { "epoch": 0.3001500750375188, "grad_norm": 0.3848935663700104, "learning_rate": 0.0001701701701701702, "loss": 1.139, "step": 150 }, { "epoch": 0.3001500750375188, "eval_loss": 1.3208202123641968, "eval_runtime": 149.5811, "eval_samples_per_second": 3.336, "eval_steps_per_second": 0.421, "step": 150 }, { "epoch": 0.3501750875437719, "grad_norm": 0.2774136960506439, "learning_rate": 0.00016516516516516518, "loss": 1.4055, "step": 175 }, { "epoch": 0.3501750875437719, "eval_loss": 1.3086917400360107, "eval_runtime": 150.1042, "eval_samples_per_second": 3.324, "eval_steps_per_second": 0.42, "step": 175 }, { "epoch": 0.400200100050025, "grad_norm": 0.32166117429733276, "learning_rate": 0.00016016016016016018, "loss": 1.1459, "step": 200 }, { "epoch": 0.400200100050025, "eval_loss": 1.306862473487854, "eval_runtime": 150.7168, "eval_samples_per_second": 3.311, "eval_steps_per_second": 0.418, "step": 200 }, { "epoch": 0.4502251125562781, "grad_norm": 0.23773141205310822, "learning_rate": 0.00015515515515515516, "loss": 1.4444, "step": 225 }, { "epoch": 0.4502251125562781, "eval_loss": 1.3020325899124146, "eval_runtime": 148.5364, "eval_samples_per_second": 3.359, "eval_steps_per_second": 0.424, "step": 225 }, { "epoch": 0.5002501250625313, "grad_norm": 0.37095341086387634, "learning_rate": 0.00015015015015015014, "loss": 1.2264, "step": 250 }, { "epoch": 0.5002501250625313, "eval_loss": 1.3001904487609863, "eval_runtime": 152.658, "eval_samples_per_second": 3.269, "eval_steps_per_second": 0.413, "step": 250 }, { "epoch": 0.5502751375687844, "grad_norm": 0.2519828677177429, "learning_rate": 0.00014514514514514515, "loss": 1.4605, "step": 275 }, { "epoch": 0.5502751375687844, "eval_loss": 1.299567699432373, "eval_runtime": 148.4653, "eval_samples_per_second": 3.361, "eval_steps_per_second": 0.424, "step": 275 }, { "epoch": 0.6003001500750376, "grad_norm": 0.3685779273509979, "learning_rate": 0.00014014014014014013, "loss": 1.1655, "step": 300 }, { "epoch": 0.6003001500750376, "eval_loss": 1.2988265752792358, "eval_runtime": 151.1788, "eval_samples_per_second": 3.301, "eval_steps_per_second": 0.417, "step": 300 }, { "epoch": 0.6503251625812907, "grad_norm": 0.26966241002082825, "learning_rate": 0.00013513513513513514, "loss": 1.4313, "step": 325 }, { "epoch": 0.6503251625812907, "eval_loss": 1.298296332359314, "eval_runtime": 152.0718, "eval_samples_per_second": 3.281, "eval_steps_per_second": 0.414, "step": 325 }, { "epoch": 0.7003501750875438, "grad_norm": 0.35637611150741577, "learning_rate": 0.00013013013013013014, "loss": 1.2002, "step": 350 }, { "epoch": 0.7003501750875438, "eval_loss": 1.2959158420562744, "eval_runtime": 151.1585, "eval_samples_per_second": 3.301, "eval_steps_per_second": 0.417, "step": 350 }, { "epoch": 0.7503751875937968, "grad_norm": 0.22513383626937866, "learning_rate": 0.00012512512512512512, "loss": 1.3994, "step": 375 }, { "epoch": 0.7503751875937968, "eval_loss": 1.2951635122299194, "eval_runtime": 148.5372, "eval_samples_per_second": 3.359, "eval_steps_per_second": 0.424, "step": 375 }, { "epoch": 0.80040020010005, "grad_norm": 0.35314086079597473, "learning_rate": 0.00012012012012012013, "loss": 1.1836, "step": 400 }, { "epoch": 0.80040020010005, "eval_loss": 1.294690728187561, "eval_runtime": 149.3769, "eval_samples_per_second": 3.341, "eval_steps_per_second": 0.422, "step": 400 }, { "epoch": 0.8504252126063031, "grad_norm": 0.240916907787323, "learning_rate": 0.00011511511511511512, "loss": 1.4378, "step": 425 }, { "epoch": 0.8504252126063031, "eval_loss": 1.2916043996810913, "eval_runtime": 152.0772, "eval_samples_per_second": 3.281, "eval_steps_per_second": 0.414, "step": 425 }, { "epoch": 0.9004502251125562, "grad_norm": 0.31087398529052734, "learning_rate": 0.00011011011011011012, "loss": 1.1989, "step": 450 }, { "epoch": 0.9004502251125562, "eval_loss": 1.2893831729888916, "eval_runtime": 150.4895, "eval_samples_per_second": 3.316, "eval_steps_per_second": 0.419, "step": 450 }, { "epoch": 0.9504752376188094, "grad_norm": 0.2413586527109146, "learning_rate": 0.00010510510510510511, "loss": 1.4508, "step": 475 }, { "epoch": 0.9504752376188094, "eval_loss": 1.2888984680175781, "eval_runtime": 151.1108, "eval_samples_per_second": 3.302, "eval_steps_per_second": 0.417, "step": 475 }, { "epoch": 1.0005002501250626, "grad_norm": 0.40069064497947693, "learning_rate": 0.00010010010010010012, "loss": 1.2076, "step": 500 }, { "epoch": 1.0005002501250626, "eval_loss": 1.2911962270736694, "eval_runtime": 148.6843, "eval_samples_per_second": 3.356, "eval_steps_per_second": 0.424, "step": 500 }, { "epoch": 1.0505252626313157, "grad_norm": 0.22050493955612183, "learning_rate": 9.50950950950951e-05, "loss": 1.3994, "step": 525 }, { "epoch": 1.0505252626313157, "eval_loss": 1.2921332120895386, "eval_runtime": 149.3015, "eval_samples_per_second": 3.342, "eval_steps_per_second": 0.422, "step": 525 }, { "epoch": 1.1005502751375689, "grad_norm": 0.3588818907737732, "learning_rate": 9.009009009009009e-05, "loss": 1.177, "step": 550 }, { "epoch": 1.1005502751375689, "eval_loss": 1.2903811931610107, "eval_runtime": 149.8093, "eval_samples_per_second": 3.331, "eval_steps_per_second": 0.421, "step": 550 }, { "epoch": 1.150575287643822, "grad_norm": 0.2672303020954132, "learning_rate": 8.50850850850851e-05, "loss": 1.4015, "step": 575 }, { "epoch": 1.150575287643822, "eval_loss": 1.2898900508880615, "eval_runtime": 149.8311, "eval_samples_per_second": 3.33, "eval_steps_per_second": 0.42, "step": 575 }, { "epoch": 1.2006003001500751, "grad_norm": 0.31220486760139465, "learning_rate": 8.008008008008009e-05, "loss": 1.192, "step": 600 }, { "epoch": 1.2006003001500751, "eval_loss": 1.288824439048767, "eval_runtime": 151.038, "eval_samples_per_second": 3.304, "eval_steps_per_second": 0.417, "step": 600 }, { "epoch": 1.2506253126563283, "grad_norm": 0.2526504695415497, "learning_rate": 7.507507507507507e-05, "loss": 1.3829, "step": 625 }, { "epoch": 1.2506253126563283, "eval_loss": 1.2878332138061523, "eval_runtime": 151.5015, "eval_samples_per_second": 3.294, "eval_steps_per_second": 0.416, "step": 625 }, { "epoch": 1.3006503251625814, "grad_norm": 0.28051283955574036, "learning_rate": 7.007007007007007e-05, "loss": 1.1514, "step": 650 }, { "epoch": 1.3006503251625814, "eval_loss": 1.2859280109405518, "eval_runtime": 150.4738, "eval_samples_per_second": 3.316, "eval_steps_per_second": 0.419, "step": 650 }, { "epoch": 1.3506753376688345, "grad_norm": 0.26419979333877563, "learning_rate": 6.506506506506507e-05, "loss": 1.4028, "step": 675 }, { "epoch": 1.3506753376688345, "eval_loss": 1.2848296165466309, "eval_runtime": 149.0963, "eval_samples_per_second": 3.347, "eval_steps_per_second": 0.423, "step": 675 }, { "epoch": 1.4007003501750876, "grad_norm": 0.3227976858615875, "learning_rate": 6.0060060060060066e-05, "loss": 1.1778, "step": 700 }, { "epoch": 1.4007003501750876, "eval_loss": 1.285400152206421, "eval_runtime": 149.1519, "eval_samples_per_second": 3.346, "eval_steps_per_second": 0.422, "step": 700 }, { "epoch": 1.4507253626813408, "grad_norm": 0.24903441965579987, "learning_rate": 5.505505505505506e-05, "loss": 1.4058, "step": 725 }, { "epoch": 1.4507253626813408, "eval_loss": 1.2824435234069824, "eval_runtime": 149.5232, "eval_samples_per_second": 3.337, "eval_steps_per_second": 0.421, "step": 725 }, { "epoch": 1.500750375187594, "grad_norm": 0.31187903881073, "learning_rate": 5.005005005005006e-05, "loss": 1.1698, "step": 750 }, { "epoch": 1.500750375187594, "eval_loss": 1.2831988334655762, "eval_runtime": 150.4227, "eval_samples_per_second": 3.317, "eval_steps_per_second": 0.419, "step": 750 }, { "epoch": 1.550775387693847, "grad_norm": 0.2889004051685333, "learning_rate": 4.5045045045045046e-05, "loss": 1.3516, "step": 775 }, { "epoch": 1.550775387693847, "eval_loss": 1.2823545932769775, "eval_runtime": 149.8614, "eval_samples_per_second": 3.33, "eval_steps_per_second": 0.42, "step": 775 }, { "epoch": 1.6008004002001002, "grad_norm": 0.37189939618110657, "learning_rate": 4.0040040040040046e-05, "loss": 1.1264, "step": 800 }, { "epoch": 1.6008004002001002, "eval_loss": 1.2828818559646606, "eval_runtime": 150.672, "eval_samples_per_second": 3.312, "eval_steps_per_second": 0.418, "step": 800 }, { "epoch": 1.6508254127063533, "grad_norm": 0.25290611386299133, "learning_rate": 3.503503503503503e-05, "loss": 1.4113, "step": 825 }, { "epoch": 1.6508254127063533, "eval_loss": 1.2822470664978027, "eval_runtime": 149.3988, "eval_samples_per_second": 3.34, "eval_steps_per_second": 0.422, "step": 825 }, { "epoch": 1.7008504252126064, "grad_norm": 0.3559873104095459, "learning_rate": 3.0030030030030033e-05, "loss": 1.1248, "step": 850 }, { "epoch": 1.7008504252126064, "eval_loss": 1.2828270196914673, "eval_runtime": 149.9897, "eval_samples_per_second": 3.327, "eval_steps_per_second": 0.42, "step": 850 }, { "epoch": 1.7508754377188596, "grad_norm": 0.3052867352962494, "learning_rate": 2.502502502502503e-05, "loss": 1.336, "step": 875 }, { "epoch": 1.7508754377188596, "eval_loss": 1.282852053642273, "eval_runtime": 151.397, "eval_samples_per_second": 3.296, "eval_steps_per_second": 0.416, "step": 875 }, { "epoch": 1.8009004502251127, "grad_norm": 0.33662667870521545, "learning_rate": 2.0020020020020023e-05, "loss": 1.0725, "step": 900 }, { "epoch": 1.8009004502251127, "eval_loss": 1.2822794914245605, "eval_runtime": 150.7632, "eval_samples_per_second": 3.31, "eval_steps_per_second": 0.418, "step": 900 }, { "epoch": 1.8509254627313658, "grad_norm": 0.29956212639808655, "learning_rate": 1.5015015015015016e-05, "loss": 1.3989, "step": 925 }, { "epoch": 1.8509254627313658, "eval_loss": 1.2824186086654663, "eval_runtime": 150.6938, "eval_samples_per_second": 3.311, "eval_steps_per_second": 0.418, "step": 925 }, { "epoch": 1.900950475237619, "grad_norm": 0.3255136013031006, "learning_rate": 1.0010010010010011e-05, "loss": 1.112, "step": 950 }, { "epoch": 1.900950475237619, "eval_loss": 1.28144371509552, "eval_runtime": 149.8969, "eval_samples_per_second": 3.329, "eval_steps_per_second": 0.42, "step": 950 }, { "epoch": 1.950975487743872, "grad_norm": 0.2689700424671173, "learning_rate": 5.005005005005006e-06, "loss": 1.3972, "step": 975 }, { "epoch": 1.950975487743872, "eval_loss": 1.280760645866394, "eval_runtime": 149.8977, "eval_samples_per_second": 3.329, "eval_steps_per_second": 0.42, "step": 975 }, { "epoch": 2.001000500250125, "grad_norm": 0.3633726239204407, "learning_rate": 0.0, "loss": 1.1746, "step": 1000 }, { "epoch": 2.001000500250125, "eval_loss": 1.2818013429641724, "eval_runtime": 149.8121, "eval_samples_per_second": 3.331, "eval_steps_per_second": 0.421, "step": 1000 } ], "logging_steps": 25, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.844485620424704e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }