{ "best_metric": 2.6880221366882324, "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3/checkpoint-45140", "epoch": 19.998892396300604, "eval_steps": 500, "global_step": 45140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.4430414797585424, "grad_norm": 0.4268507957458496, "learning_rate": 3.125e-05, "loss": 5.5906, "step": 1000 }, { "epoch": 0.8860829595170848, "grad_norm": 0.5278797149658203, "learning_rate": 6.25e-05, "loss": 4.0959, "step": 2000 }, { "epoch": 0.9999446198150301, "eval_accuracy": 0.3612965154478169, "eval_loss": 3.8126308917999268, "eval_runtime": 121.9395, "eval_samples_per_second": 498.534, "eval_steps_per_second": 7.791, "step": 2257 }, { "epoch": 1.3291244392756272, "grad_norm": 0.5504615306854248, "learning_rate": 9.375e-05, "loss": 3.6988, "step": 3000 }, { "epoch": 1.7721659190341694, "grad_norm": 0.5281194448471069, "learning_rate": 0.000125, "loss": 3.4463, "step": 4000 }, { "epoch": 1.9998892396300603, "eval_accuracy": 0.4098802035001956, "eval_loss": 3.297177791595459, "eval_runtime": 122.3693, "eval_samples_per_second": 496.783, "eval_steps_per_second": 7.763, "step": 4514 }, { "epoch": 2.215207398792712, "grad_norm": 0.491211473941803, "learning_rate": 0.00015625, "loss": 3.2482, "step": 5000 }, { "epoch": 2.6582488785512544, "grad_norm": 0.5076552033424377, "learning_rate": 0.0001875, "loss": 3.1228, "step": 6000 }, { "epoch": 2.9998338594450904, "eval_accuracy": 0.4315480135894729, "eval_loss": 3.085052013397217, "eval_runtime": 122.1579, "eval_samples_per_second": 497.643, "eval_steps_per_second": 7.777, "step": 6771 }, { "epoch": 3.1012903583097966, "grad_norm": 0.4416440725326538, "learning_rate": 0.00021875, "loss": 3.0284, "step": 7000 }, { "epoch": 3.5443318380683393, "grad_norm": 0.37537524104118347, "learning_rate": 0.00025, "loss": 2.9593, "step": 8000 }, { "epoch": 3.9873733178268815, "grad_norm": 0.39330974221229553, "learning_rate": 0.00028125000000000003, "loss": 2.9166, "step": 9000 }, { "epoch": 3.9997784792601205, "eval_accuracy": 0.44179488643346004, "eval_loss": 2.980708599090576, "eval_runtime": 122.9667, "eval_samples_per_second": 494.37, "eval_steps_per_second": 7.726, "step": 9028 }, { "epoch": 4.430414797585424, "grad_norm": 0.370112806558609, "learning_rate": 0.0003125, "loss": 2.8563, "step": 10000 }, { "epoch": 4.8734562773439665, "grad_norm": 0.3323429524898529, "learning_rate": 0.00034375, "loss": 2.8402, "step": 11000 }, { "epoch": 4.999723099075151, "eval_accuracy": 0.44756373573100505, "eval_loss": 2.924882411956787, "eval_runtime": 122.7484, "eval_samples_per_second": 495.249, "eval_steps_per_second": 7.739, "step": 11285 }, { "epoch": 5.316497757102509, "grad_norm": 0.31359294056892395, "learning_rate": 0.00037496875000000003, "loss": 2.7944, "step": 12000 }, { "epoch": 5.759539236861051, "grad_norm": 0.29497697949409485, "learning_rate": 0.00040621875, "loss": 2.7832, "step": 13000 }, { "epoch": 5.999667718890181, "eval_accuracy": 0.45211594466544164, "eval_loss": 2.8850650787353516, "eval_runtime": 122.4548, "eval_samples_per_second": 496.436, "eval_steps_per_second": 7.758, "step": 13542 }, { "epoch": 6.202580716619593, "grad_norm": 0.30153268575668335, "learning_rate": 0.00043746875000000003, "loss": 2.754, "step": 14000 }, { "epoch": 6.645622196378136, "grad_norm": 0.29313963651657104, "learning_rate": 0.00046871875, "loss": 2.7377, "step": 15000 }, { "epoch": 6.999612338705211, "eval_accuracy": 0.4545553537497972, "eval_loss": 2.860243558883667, "eval_runtime": 122.3572, "eval_samples_per_second": 496.832, "eval_steps_per_second": 7.764, "step": 15799 }, { "epoch": 7.088663676136679, "grad_norm": 0.2767232358455658, "learning_rate": 0.00049996875, "loss": 2.7295, "step": 16000 }, { "epoch": 7.531705155895221, "grad_norm": 0.2507473826408386, "learning_rate": 0.0005311875000000001, "loss": 2.7014, "step": 17000 }, { "epoch": 7.974746635653763, "grad_norm": 0.2542831301689148, "learning_rate": 0.00056240625, "loss": 2.7101, "step": 18000 }, { "epoch": 8.0, "eval_accuracy": 0.4571935151649448, "eval_loss": 2.8389248847961426, "eval_runtime": 121.9399, "eval_samples_per_second": 498.532, "eval_steps_per_second": 7.791, "step": 18057 }, { "epoch": 8.417788115412305, "grad_norm": 0.2478516399860382, "learning_rate": 0.00059365625, "loss": 2.673, "step": 19000 }, { "epoch": 8.860829595170848, "grad_norm": 0.22448837757110596, "learning_rate": 0.00062490625, "loss": 2.684, "step": 20000 }, { "epoch": 8.99994461981503, "eval_accuracy": 0.4586443233179834, "eval_loss": 2.8260207176208496, "eval_runtime": 122.3992, "eval_samples_per_second": 496.662, "eval_steps_per_second": 7.761, "step": 20314 }, { "epoch": 9.30387107492939, "grad_norm": 0.22841599583625793, "learning_rate": 0.0006561562500000001, "loss": 2.6598, "step": 21000 }, { "epoch": 9.746912554687933, "grad_norm": 0.21708275377750397, "learning_rate": 0.0006873749999999999, "loss": 2.6654, "step": 22000 }, { "epoch": 9.999889239630061, "eval_accuracy": 0.45958170407706767, "eval_loss": 2.815507411956787, "eval_runtime": 120.8572, "eval_samples_per_second": 502.998, "eval_steps_per_second": 7.861, "step": 22571 }, { "epoch": 10.189954034446474, "grad_norm": 0.2034858614206314, "learning_rate": 0.000718625, "loss": 2.6505, "step": 23000 }, { "epoch": 10.632995514205017, "grad_norm": 0.20455513894557953, "learning_rate": 0.000749875, "loss": 2.6466, "step": 24000 }, { "epoch": 10.99983385944509, "eval_accuracy": 0.4604201924885037, "eval_loss": 2.807711601257324, "eval_runtime": 121.2532, "eval_samples_per_second": 501.356, "eval_steps_per_second": 7.835, "step": 24828 }, { "epoch": 11.07603699396356, "grad_norm": 0.20884878933429718, "learning_rate": 0.000781125, "loss": 2.6497, "step": 25000 }, { "epoch": 11.519078473722102, "grad_norm": 0.2016923576593399, "learning_rate": 0.00081234375, "loss": 2.6304, "step": 26000 }, { "epoch": 11.962119953480645, "grad_norm": 0.1992848664522171, "learning_rate": 0.00084359375, "loss": 2.6474, "step": 27000 }, { "epoch": 11.99977847926012, "eval_accuracy": 0.4614518854538904, "eval_loss": 2.802515983581543, "eval_runtime": 121.3375, "eval_samples_per_second": 501.007, "eval_steps_per_second": 7.829, "step": 27085 }, { "epoch": 12.405161433239186, "grad_norm": 0.19233956933021545, "learning_rate": 0.0008748125, "loss": 2.6163, "step": 28000 }, { "epoch": 12.84820291299773, "grad_norm": 0.18767118453979492, "learning_rate": 0.0009060312499999999, "loss": 2.6366, "step": 29000 }, { "epoch": 12.999723099075151, "eval_accuracy": 0.46189461094763445, "eval_loss": 2.7982876300811768, "eval_runtime": 121.2511, "eval_samples_per_second": 501.364, "eval_steps_per_second": 7.835, "step": 29342 }, { "epoch": 13.291244392756273, "grad_norm": 0.188876211643219, "learning_rate": 0.00093728125, "loss": 2.6143, "step": 30000 }, { "epoch": 13.734285872514814, "grad_norm": 0.17773953080177307, "learning_rate": 0.00096853125, "loss": 2.625, "step": 31000 }, { "epoch": 13.999667718890182, "eval_accuracy": 0.46255660264467685, "eval_loss": 2.792785406112671, "eval_runtime": 120.7502, "eval_samples_per_second": 503.444, "eval_steps_per_second": 7.867, "step": 31599 }, { "epoch": 14.177327352273357, "grad_norm": 0.19869054853916168, "learning_rate": 0.00099978125, "loss": 2.6187, "step": 32000 }, { "epoch": 14.620368832031899, "grad_norm": 0.17133094370365143, "learning_rate": 0.0009245053272450533, "loss": 2.6109, "step": 33000 }, { "epoch": 14.99961233870521, "eval_accuracy": 0.46537558287943165, "eval_loss": 2.768995523452759, "eval_runtime": 120.8755, "eval_samples_per_second": 502.922, "eval_steps_per_second": 7.859, "step": 33856 }, { "epoch": 15.063410311790442, "grad_norm": 0.1772530972957611, "learning_rate": 0.0008484018264840183, "loss": 2.5987, "step": 34000 }, { "epoch": 15.506451791548983, "grad_norm": 0.17091083526611328, "learning_rate": 0.0007723744292237444, "loss": 2.5615, "step": 35000 }, { "epoch": 15.949493271307526, "grad_norm": 0.17316913604736328, "learning_rate": 0.0006962709284627093, "loss": 2.5658, "step": 36000 }, { "epoch": 16.0, "eval_accuracy": 0.4686073564166006, "eval_loss": 2.744506359100342, "eval_runtime": 120.8269, "eval_samples_per_second": 503.125, "eval_steps_per_second": 7.862, "step": 36114 }, { "epoch": 16.39253475106607, "grad_norm": 0.19304046034812927, "learning_rate": 0.0006201674277016743, "loss": 2.5123, "step": 37000 }, { "epoch": 16.83557623082461, "grad_norm": 0.17993681132793427, "learning_rate": 0.0005440639269406394, "loss": 2.5185, "step": 38000 }, { "epoch": 16.99994461981503, "eval_accuracy": 0.47166972923301015, "eval_loss": 2.72279953956604, "eval_runtime": 121.0608, "eval_samples_per_second": 502.152, "eval_steps_per_second": 7.847, "step": 38371 }, { "epoch": 17.278617710583152, "grad_norm": 0.18638047575950623, "learning_rate": 0.0004680365296803653, "loss": 2.4713, "step": 39000 }, { "epoch": 17.721659190341697, "grad_norm": 0.19553589820861816, "learning_rate": 0.0003919330289193303, "loss": 2.4637, "step": 40000 }, { "epoch": 17.99988923963006, "eval_accuracy": 0.4746745599919493, "eval_loss": 2.7043166160583496, "eval_runtime": 121.0615, "eval_samples_per_second": 502.15, "eval_steps_per_second": 7.847, "step": 40628 }, { "epoch": 18.16470067010024, "grad_norm": 0.1946045607328415, "learning_rate": 0.00031590563165905634, "loss": 2.4315, "step": 41000 }, { "epoch": 18.60774214985878, "grad_norm": 0.1942271739244461, "learning_rate": 0.0002398021308980213, "loss": 2.3969, "step": 42000 }, { "epoch": 18.99983385944509, "eval_accuracy": 0.4773752951691443, "eval_loss": 2.689497947692871, "eval_runtime": 121.1389, "eval_samples_per_second": 501.829, "eval_steps_per_second": 7.842, "step": 42885 }, { "epoch": 19.050783629617325, "grad_norm": 0.1933259516954422, "learning_rate": 0.0001636986301369863, "loss": 2.3882, "step": 43000 }, { "epoch": 19.493825109375866, "grad_norm": 0.1976754367351532, "learning_rate": 8.75951293759513e-05, "loss": 2.3278, "step": 44000 }, { "epoch": 19.936866589134407, "grad_norm": 0.1973201334476471, "learning_rate": 1.1567732115677321e-05, "loss": 2.3245, "step": 45000 }, { "epoch": 19.998892396300604, "eval_accuracy": 0.47865689612852264, "eval_loss": 2.6880221366882324, "eval_runtime": 121.1537, "eval_samples_per_second": 501.768, "eval_steps_per_second": 7.841, "step": 45140 }, { "epoch": 19.998892396300604, "step": 45140, "total_flos": 1.50966240067584e+18, "train_loss": 2.8030111154835997, "train_runtime": 43302.4979, "train_samples_per_second": 266.866, "train_steps_per_second": 1.042 } ], "logging_steps": 1000, "max_steps": 45140, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.50966240067584e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }