{ "best_metric": 2.910346746444702, "best_model_checkpoint": "models/opt-babylm2-clean-spacy-32k-earlystop_seed-42_1e-3/checkpoint-38840", "epoch": 19.990992729846234, "eval_steps": 500, "global_step": 38840, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5147011516438268, "grad_norm": 0.4890323877334595, "learning_rate": 3.125e-05, "loss": 5.9107, "step": 1000 }, { "epoch": 0.9995496364923117, "eval_accuracy": 0.32687433630965734, "eval_loss": 3.988743305206299, "eval_runtime": 113.4579, "eval_samples_per_second": 462.198, "eval_steps_per_second": 7.227, "step": 1942 }, { "epoch": 1.0294023032876536, "grad_norm": 0.6617768406867981, "learning_rate": 6.25e-05, "loss": 4.1795, "step": 2000 }, { "epoch": 1.5441034549314803, "grad_norm": 0.5740628838539124, "learning_rate": 9.375e-05, "loss": 3.7896, "step": 3000 }, { "epoch": 1.9996139741362673, "eval_accuracy": 0.36569809006745335, "eval_loss": 3.523646593093872, "eval_runtime": 113.8096, "eval_samples_per_second": 460.769, "eval_steps_per_second": 7.205, "step": 3885 }, { "epoch": 2.058804606575307, "grad_norm": 0.5854473114013672, "learning_rate": 0.000125, "loss": 3.5494, "step": 4000 }, { "epoch": 2.573505758219134, "grad_norm": 0.5083662271499634, "learning_rate": 0.00015625, "loss": 3.3813, "step": 5000 }, { "epoch": 2.9996783117802224, "eval_accuracy": 0.38593522382255724, "eval_loss": 3.304034948348999, "eval_runtime": 113.7364, "eval_samples_per_second": 461.066, "eval_steps_per_second": 7.21, "step": 5828 }, { "epoch": 3.088206909862961, "grad_norm": 0.5111091732978821, "learning_rate": 0.0001875, "loss": 3.2658, "step": 6000 }, { "epoch": 3.6029080615067874, "grad_norm": 0.46191349625587463, "learning_rate": 0.00021875, "loss": 3.174, "step": 7000 }, { "epoch": 3.999742649424178, "eval_accuracy": 0.3961827522771122, "eval_loss": 3.1921420097351074, "eval_runtime": 113.4299, "eval_samples_per_second": 462.312, "eval_steps_per_second": 7.229, "step": 7771 }, { "epoch": 4.117609213150614, "grad_norm": 0.40757909417152405, "learning_rate": 0.00025, "loss": 3.1123, "step": 8000 }, { "epoch": 4.632310364794441, "grad_norm": 0.38333743810653687, "learning_rate": 0.00028125000000000003, "loss": 3.0533, "step": 9000 }, { "epoch": 4.999806987068133, "eval_accuracy": 0.4026235772722514, "eval_loss": 3.126554250717163, "eval_runtime": 113.5855, "eval_samples_per_second": 461.679, "eval_steps_per_second": 7.219, "step": 9714 }, { "epoch": 5.147011516438268, "grad_norm": 0.35808226466178894, "learning_rate": 0.0003125, "loss": 3.0181, "step": 10000 }, { "epoch": 5.661712668082095, "grad_norm": 0.33792659640312195, "learning_rate": 0.00034375, "loss": 2.9768, "step": 11000 }, { "epoch": 5.999871324712089, "eval_accuracy": 0.4071174526255964, "eval_loss": 3.0837907791137695, "eval_runtime": 113.7293, "eval_samples_per_second": 461.095, "eval_steps_per_second": 7.21, "step": 11657 }, { "epoch": 6.176413819725922, "grad_norm": 0.3134087324142456, "learning_rate": 0.000375, "loss": 2.9528, "step": 12000 }, { "epoch": 6.691114971369748, "grad_norm": 0.2982269525527954, "learning_rate": 0.00040625000000000004, "loss": 2.9232, "step": 13000 }, { "epoch": 6.9999356623560445, "eval_accuracy": 0.41005010394699454, "eval_loss": 3.054988145828247, "eval_runtime": 113.7694, "eval_samples_per_second": 460.932, "eval_steps_per_second": 7.208, "step": 13600 }, { "epoch": 7.205816123013575, "grad_norm": 0.3095534145832062, "learning_rate": 0.0004375, "loss": 2.9047, "step": 14000 }, { "epoch": 7.720517274657402, "grad_norm": 0.28180333971977234, "learning_rate": 0.0004686875, "loss": 2.8863, "step": 15000 }, { "epoch": 8.0, "eval_accuracy": 0.41221526749525134, "eval_loss": 3.0362703800201416, "eval_runtime": 113.4545, "eval_samples_per_second": 462.212, "eval_steps_per_second": 7.228, "step": 15543 }, { "epoch": 8.235218426301229, "grad_norm": 0.27651330828666687, "learning_rate": 0.0004999375, "loss": 2.8645, "step": 16000 }, { "epoch": 8.749919577945056, "grad_norm": 0.2618395984172821, "learning_rate": 0.0005311875000000001, "loss": 2.8563, "step": 17000 }, { "epoch": 8.999549636492311, "eval_accuracy": 0.41385650827836856, "eval_loss": 3.0208170413970947, "eval_runtime": 113.5361, "eval_samples_per_second": 461.879, "eval_steps_per_second": 7.222, "step": 17485 }, { "epoch": 9.264620729588882, "grad_norm": 0.2651124894618988, "learning_rate": 0.00056240625, "loss": 2.8341, "step": 18000 }, { "epoch": 9.77932188123271, "grad_norm": 0.25122708082199097, "learning_rate": 0.00059365625, "loss": 2.8356, "step": 19000 }, { "epoch": 9.999613974136267, "eval_accuracy": 0.41513595369497913, "eval_loss": 3.011690139770508, "eval_runtime": 113.8076, "eval_samples_per_second": 460.778, "eval_steps_per_second": 7.205, "step": 19428 }, { "epoch": 10.294023032876536, "grad_norm": 0.24296793341636658, "learning_rate": 0.00062490625, "loss": 2.8116, "step": 20000 }, { "epoch": 10.808724184520363, "grad_norm": 0.2230454832315445, "learning_rate": 0.0006561562500000001, "loss": 2.816, "step": 21000 }, { "epoch": 10.999678311780222, "eval_accuracy": 0.4161677958750243, "eval_loss": 3.0030288696289062, "eval_runtime": 113.595, "eval_samples_per_second": 461.64, "eval_steps_per_second": 7.219, "step": 21371 }, { "epoch": 11.32342533616419, "grad_norm": 0.2214185744524002, "learning_rate": 0.00068734375, "loss": 2.7929, "step": 22000 }, { "epoch": 11.838126487808017, "grad_norm": 0.20495054125785828, "learning_rate": 0.00071859375, "loss": 2.8069, "step": 23000 }, { "epoch": 11.999742649424178, "eval_accuracy": 0.41698351804489914, "eval_loss": 2.9951212406158447, "eval_runtime": 113.473, "eval_samples_per_second": 462.136, "eval_steps_per_second": 7.226, "step": 23314 }, { "epoch": 12.352827639451844, "grad_norm": 0.21121680736541748, "learning_rate": 0.0007498437500000001, "loss": 2.7777, "step": 24000 }, { "epoch": 12.86752879109567, "grad_norm": 0.20208890736103058, "learning_rate": 0.00078109375, "loss": 2.7941, "step": 25000 }, { "epoch": 12.999806987068133, "eval_accuracy": 0.41746990024079805, "eval_loss": 2.9923195838928223, "eval_runtime": 113.7475, "eval_samples_per_second": 461.021, "eval_steps_per_second": 7.209, "step": 25257 }, { "epoch": 13.382229942739498, "grad_norm": 0.21585527062416077, "learning_rate": 0.0008123125, "loss": 2.7644, "step": 26000 }, { "epoch": 13.896931094383323, "grad_norm": 0.19069883227348328, "learning_rate": 0.00084353125, "loss": 2.7889, "step": 27000 }, { "epoch": 13.999871324712089, "eval_accuracy": 0.4181739728690866, "eval_loss": 2.9887681007385254, "eval_runtime": 113.6405, "eval_samples_per_second": 461.455, "eval_steps_per_second": 7.216, "step": 27200 }, { "epoch": 14.41163224602715, "grad_norm": 0.19703735411167145, "learning_rate": 0.00087478125, "loss": 2.757, "step": 28000 }, { "epoch": 14.926333397670977, "grad_norm": 0.18608474731445312, "learning_rate": 0.0009060312499999999, "loss": 2.7802, "step": 29000 }, { "epoch": 14.999935662356044, "eval_accuracy": 0.41862535708409987, "eval_loss": 2.9839344024658203, "eval_runtime": 113.3429, "eval_samples_per_second": 462.667, "eval_steps_per_second": 7.235, "step": 29143 }, { "epoch": 15.441034549314804, "grad_norm": 0.18645653128623962, "learning_rate": 0.00093725, "loss": 2.7483, "step": 30000 }, { "epoch": 15.95573570095863, "grad_norm": 0.18724338710308075, "learning_rate": 0.0009685000000000001, "loss": 2.7802, "step": 31000 }, { "epoch": 16.0, "eval_accuracy": 0.4189873767966378, "eval_loss": 2.9820570945739746, "eval_runtime": 113.2324, "eval_samples_per_second": 463.118, "eval_steps_per_second": 7.242, "step": 31086 }, { "epoch": 16.470436852602457, "grad_norm": 0.18728293478488922, "learning_rate": 0.00099971875, "loss": 2.7412, "step": 32000 }, { "epoch": 16.985138004246284, "grad_norm": 0.18331313133239746, "learning_rate": 0.0008551169590643275, "loss": 2.7665, "step": 33000 }, { "epoch": 16.99954963649231, "eval_accuracy": 0.4212147589775804, "eval_loss": 2.962628126144409, "eval_runtime": 112.7133, "eval_samples_per_second": 465.251, "eval_steps_per_second": 7.275, "step": 33028 }, { "epoch": 17.49983915589011, "grad_norm": 0.17958644032478333, "learning_rate": 0.0007089181286549707, "loss": 2.6908, "step": 34000 }, { "epoch": 17.999613974136267, "eval_accuracy": 0.42467125828210767, "eval_loss": 2.937788963317871, "eval_runtime": 112.8442, "eval_samples_per_second": 464.711, "eval_steps_per_second": 7.267, "step": 34971 }, { "epoch": 18.014540307533938, "grad_norm": 0.19127266108989716, "learning_rate": 0.0005627192982456141, "loss": 2.6939, "step": 35000 }, { "epoch": 18.529241459177765, "grad_norm": 0.17924794554710388, "learning_rate": 0.0004166666666666667, "loss": 2.6058, "step": 36000 }, { "epoch": 18.999678311780222, "eval_accuracy": 0.4284452072209509, "eval_loss": 2.914487600326538, "eval_runtime": 113.021, "eval_samples_per_second": 463.985, "eval_steps_per_second": 7.255, "step": 36914 }, { "epoch": 19.043942610821592, "grad_norm": 0.19451527297496796, "learning_rate": 0.00027046783625730997, "loss": 2.5978, "step": 37000 }, { "epoch": 19.55864376246542, "grad_norm": 0.19833345711231232, "learning_rate": 0.0001242690058479532, "loss": 2.505, "step": 38000 }, { "epoch": 19.990992729846234, "eval_accuracy": 0.43054403912594785, "eval_loss": 2.910346746444702, "eval_runtime": 112.944, "eval_samples_per_second": 464.301, "eval_steps_per_second": 7.26, "step": 38840 }, { "epoch": 19.990992729846234, "step": 38840, "total_flos": 1.298988775636992e+18, "train_loss": 3.0016495520987547, "train_runtime": 41094.0958, "train_samples_per_second": 242.061, "train_steps_per_second": 0.945 } ], "logging_steps": 1000, "max_steps": 38840, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.298988775636992e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }