{ "best_metric": 2.683746337890625, "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-211_1e-3/checkpoint-45100", "epoch": 19.991464360935595, "eval_steps": 500, "global_step": 45100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.4434098215275468, "grad_norm": 0.535893440246582, "learning_rate": 3.125e-05, "loss": 5.5759, "step": 1000 }, { "epoch": 0.8868196430550936, "grad_norm": 0.5292273163795471, "learning_rate": 6.25e-05, "loss": 4.1, "step": 2000 }, { "epoch": 1.0, "eval_accuracy": 0.36012162808432535, "eval_loss": 3.8208348751068115, "eval_runtime": 73.9931, "eval_samples_per_second": 820.076, "eval_steps_per_second": 12.826, "step": 2256 }, { "epoch": 1.3298969072164948, "grad_norm": 0.6377725601196289, "learning_rate": 9.375e-05, "loss": 3.7036, "step": 3000 }, { "epoch": 1.7733067287440418, "grad_norm": 0.5576291680335999, "learning_rate": 0.000125, "loss": 3.4569, "step": 4000 }, { "epoch": 2.0, "eval_accuracy": 0.40816362273320667, "eval_loss": 3.308465003967285, "eval_runtime": 73.782, "eval_samples_per_second": 822.423, "eval_steps_per_second": 12.862, "step": 4512 }, { "epoch": 2.2163839929054427, "grad_norm": 0.5923783183097839, "learning_rate": 0.00015625, "loss": 3.2553, "step": 5000 }, { "epoch": 2.6597938144329896, "grad_norm": 0.4800017178058624, "learning_rate": 0.0001875, "loss": 3.1304, "step": 6000 }, { "epoch": 3.0, "eval_accuracy": 0.4293314979254721, "eval_loss": 3.0989913940429688, "eval_runtime": 73.5868, "eval_samples_per_second": 824.605, "eval_steps_per_second": 12.896, "step": 6768 }, { "epoch": 3.102871078594391, "grad_norm": 0.40440040826797485, "learning_rate": 0.00021875, "loss": 3.0393, "step": 7000 }, { "epoch": 3.5462809001219378, "grad_norm": 0.3914831876754761, "learning_rate": 0.00025, "loss": 2.9686, "step": 8000 }, { "epoch": 3.9896907216494846, "grad_norm": 0.37147632241249084, "learning_rate": 0.00028125000000000003, "loss": 2.9241, "step": 9000 }, { "epoch": 4.0, "eval_accuracy": 0.44051643465560253, "eval_loss": 2.9892077445983887, "eval_runtime": 73.1132, "eval_samples_per_second": 829.946, "eval_steps_per_second": 12.98, "step": 9024 }, { "epoch": 4.4327679858108855, "grad_norm": 0.3617306649684906, "learning_rate": 0.0003125, "loss": 2.8599, "step": 10000 }, { "epoch": 4.876177807338433, "grad_norm": 0.31954342126846313, "learning_rate": 0.00034375, "loss": 2.8456, "step": 11000 }, { "epoch": 5.0, "eval_accuracy": 0.44670376258611555, "eval_loss": 2.9302828311920166, "eval_runtime": 72.5611, "eval_samples_per_second": 836.261, "eval_steps_per_second": 13.079, "step": 11280 }, { "epoch": 5.319255071499834, "grad_norm": 0.32251182198524475, "learning_rate": 0.000375, "loss": 2.7976, "step": 12000 }, { "epoch": 5.762664893027381, "grad_norm": 0.312112957239151, "learning_rate": 0.00040625000000000004, "loss": 2.7882, "step": 13000 }, { "epoch": 6.0, "eval_accuracy": 0.4509860793361511, "eval_loss": 2.888702869415283, "eval_runtime": 72.4551, "eval_samples_per_second": 837.484, "eval_steps_per_second": 13.098, "step": 13536 }, { "epoch": 6.205742157188782, "grad_norm": 0.28876274824142456, "learning_rate": 0.0004375, "loss": 2.7559, "step": 14000 }, { "epoch": 6.649151978716328, "grad_norm": 0.29188501834869385, "learning_rate": 0.00046875, "loss": 2.7426, "step": 15000 }, { "epoch": 7.0, "eval_accuracy": 0.4542816058526245, "eval_loss": 2.8618035316467285, "eval_runtime": 72.3378, "eval_samples_per_second": 838.842, "eval_steps_per_second": 13.119, "step": 15792 }, { "epoch": 7.09222924287773, "grad_norm": 0.28137603402137756, "learning_rate": 0.0005, "loss": 2.7305, "step": 16000 }, { "epoch": 7.535639064405276, "grad_norm": 0.25505542755126953, "learning_rate": 0.00053121875, "loss": 2.707, "step": 17000 }, { "epoch": 7.979048885932824, "grad_norm": 0.24025394022464752, "learning_rate": 0.0005624687499999999, "loss": 2.7124, "step": 18000 }, { "epoch": 8.0, "eval_accuracy": 0.4565116264040224, "eval_loss": 2.840358257293701, "eval_runtime": 72.5997, "eval_samples_per_second": 835.816, "eval_steps_per_second": 13.072, "step": 18048 }, { "epoch": 8.422126150094225, "grad_norm": 0.24389903247356415, "learning_rate": 0.0005936875, "loss": 2.6715, "step": 19000 }, { "epoch": 8.865535971621771, "grad_norm": 0.23571985960006714, "learning_rate": 0.0006249375000000001, "loss": 2.689, "step": 20000 }, { "epoch": 9.0, "eval_accuracy": 0.45795106440730543, "eval_loss": 2.8292860984802246, "eval_runtime": 72.5327, "eval_samples_per_second": 836.588, "eval_steps_per_second": 13.084, "step": 20304 }, { "epoch": 9.308613235783172, "grad_norm": 0.2347807139158249, "learning_rate": 0.0006561562500000001, "loss": 2.6603, "step": 21000 }, { "epoch": 9.75202305731072, "grad_norm": 0.21437448263168335, "learning_rate": 0.00068734375, "loss": 2.6674, "step": 22000 }, { "epoch": 10.0, "eval_accuracy": 0.45918653948065713, "eval_loss": 2.817204236984253, "eval_runtime": 72.5446, "eval_samples_per_second": 836.45, "eval_steps_per_second": 13.082, "step": 22560 }, { "epoch": 10.19510032147212, "grad_norm": 0.21189159154891968, "learning_rate": 0.00071859375, "loss": 2.6505, "step": 23000 }, { "epoch": 10.638510142999667, "grad_norm": 0.20313581824302673, "learning_rate": 0.0007498437500000001, "loss": 2.651, "step": 24000 }, { "epoch": 11.0, "eval_accuracy": 0.4604101231791332, "eval_loss": 2.8066623210906982, "eval_runtime": 72.4079, "eval_samples_per_second": 838.03, "eval_steps_per_second": 13.106, "step": 24816 }, { "epoch": 11.081587407161068, "grad_norm": 0.20089128613471985, "learning_rate": 0.00078109375, "loss": 2.6463, "step": 25000 }, { "epoch": 11.524997228688616, "grad_norm": 0.20063355565071106, "learning_rate": 0.0008123125, "loss": 2.633, "step": 26000 }, { "epoch": 11.968407050216163, "grad_norm": 0.18298469483852386, "learning_rate": 0.0008435625, "loss": 2.6483, "step": 27000 }, { "epoch": 12.0, "eval_accuracy": 0.4612719247224269, "eval_loss": 2.800128221511841, "eval_runtime": 72.4998, "eval_samples_per_second": 836.967, "eval_steps_per_second": 13.09, "step": 27072 }, { "epoch": 12.411484314377564, "grad_norm": 0.19970569014549255, "learning_rate": 0.00087478125, "loss": 2.6154, "step": 28000 }, { "epoch": 12.854894135905111, "grad_norm": 0.17127935588359833, "learning_rate": 0.0009060312499999999, "loss": 2.639, "step": 29000 }, { "epoch": 13.0, "eval_accuracy": 0.46190145669342225, "eval_loss": 2.7980594635009766, "eval_runtime": 72.3226, "eval_samples_per_second": 839.018, "eval_steps_per_second": 13.122, "step": 29328 }, { "epoch": 13.297971400066512, "grad_norm": 0.19391290843486786, "learning_rate": 0.00093725, "loss": 2.6128, "step": 30000 }, { "epoch": 13.741381221594057, "grad_norm": 0.1826080083847046, "learning_rate": 0.0009685000000000001, "loss": 2.6268, "step": 31000 }, { "epoch": 14.0, "eval_accuracy": 0.46204221438080834, "eval_loss": 2.7923851013183594, "eval_runtime": 72.136, "eval_samples_per_second": 841.189, "eval_steps_per_second": 13.156, "step": 31584 }, { "epoch": 14.18445848575546, "grad_norm": 0.1709497720003128, "learning_rate": 0.00099971875, "loss": 2.6164, "step": 32000 }, { "epoch": 14.627868307283006, "grad_norm": 0.1697809398174286, "learning_rate": 0.000924351145038168, "loss": 2.6116, "step": 33000 }, { "epoch": 15.0, "eval_accuracy": 0.46505868134992956, "eval_loss": 2.766456365585327, "eval_runtime": 72.5226, "eval_samples_per_second": 836.705, "eval_steps_per_second": 13.086, "step": 33840 }, { "epoch": 15.070945571444408, "grad_norm": 0.1920733004808426, "learning_rate": 0.0008480152671755725, "loss": 2.5972, "step": 34000 }, { "epoch": 15.514355392971954, "grad_norm": 0.17531998455524445, "learning_rate": 0.0007717557251908397, "loss": 2.5637, "step": 35000 }, { "epoch": 15.957765214499501, "grad_norm": 0.175877183675766, "learning_rate": 0.0006954198473282442, "loss": 2.5657, "step": 36000 }, { "epoch": 16.0, "eval_accuracy": 0.4687057789496814, "eval_loss": 2.7436182498931885, "eval_runtime": 72.9943, "eval_samples_per_second": 831.298, "eval_steps_per_second": 13.001, "step": 36096 }, { "epoch": 16.400842478660902, "grad_norm": 0.18140995502471924, "learning_rate": 0.0006191603053435114, "loss": 2.509, "step": 37000 }, { "epoch": 16.84425230018845, "grad_norm": 0.17342546582221985, "learning_rate": 0.0005428244274809161, "loss": 2.5199, "step": 38000 }, { "epoch": 17.0, "eval_accuracy": 0.47174531777114276, "eval_loss": 2.7186553478240967, "eval_runtime": 72.9122, "eval_samples_per_second": 832.234, "eval_steps_per_second": 13.016, "step": 38352 }, { "epoch": 17.28732956434985, "grad_norm": 0.18188068270683289, "learning_rate": 0.0004665648854961832, "loss": 2.4714, "step": 39000 }, { "epoch": 17.730739385877396, "grad_norm": 0.18511821329593658, "learning_rate": 0.0003902290076335878, "loss": 2.4626, "step": 40000 }, { "epoch": 18.0, "eval_accuracy": 0.4746070676128065, "eval_loss": 2.7018399238586426, "eval_runtime": 72.5891, "eval_samples_per_second": 835.938, "eval_steps_per_second": 13.074, "step": 40608 }, { "epoch": 18.1738166500388, "grad_norm": 0.18930885195732117, "learning_rate": 0.0003138931297709924, "loss": 2.4292, "step": 41000 }, { "epoch": 18.617226471566344, "grad_norm": 0.19555842876434326, "learning_rate": 0.00023763358778625953, "loss": 2.3982, "step": 42000 }, { "epoch": 19.0, "eval_accuracy": 0.47732812439412153, "eval_loss": 2.6869590282440186, "eval_runtime": 72.705, "eval_samples_per_second": 834.605, "eval_steps_per_second": 13.053, "step": 42864 }, { "epoch": 19.060303735727746, "grad_norm": 0.2038242667913437, "learning_rate": 0.00016129770992366412, "loss": 2.3835, "step": 43000 }, { "epoch": 19.503713557255292, "grad_norm": 0.20078131556510925, "learning_rate": 8.50381679389313e-05, "loss": 2.3292, "step": 44000 }, { "epoch": 19.94712337878284, "grad_norm": 0.19901303946971893, "learning_rate": 8.702290076335879e-06, "loss": 2.3238, "step": 45000 }, { "epoch": 19.991464360935595, "eval_accuracy": 0.47886644176457666, "eval_loss": 2.683746337890625, "eval_runtime": 72.7112, "eval_samples_per_second": 834.534, "eval_steps_per_second": 13.052, "step": 45100 }, { "epoch": 19.991464360935595, "step": 45100, "total_flos": 1.507910045663232e+18, "train_loss": 2.80528530789056, "train_runtime": 30473.0676, "train_samples_per_second": 378.921, "train_steps_per_second": 1.48 } ], "logging_steps": 1000, "max_steps": 45100, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.507910045663232e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }