{ "best_metric": 0.8123041987419128, "best_model_checkpoint": "./output/training_results/C013_llama3-8b-base_instruct_20240428_005832/checkpoint-15", "epoch": 4.0, "eval_steps": 5, "global_step": 192, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020833333333333332, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.9805, "step": 1 }, { "epoch": 0.020833333333333332, "eval_loss": 0.9736970067024231, "eval_runtime": 2.153, "eval_samples_per_second": 157.916, "eval_steps_per_second": 1.393, "step": 1 }, { "epoch": 0.10416666666666667, "grad_norm": 14.850728211706278, "learning_rate": 1.5e-06, "loss": 0.9446, "step": 5 }, { "epoch": 0.10416666666666667, "eval_loss": 0.9454841613769531, "eval_runtime": 2.0973, "eval_samples_per_second": 162.11, "eval_steps_per_second": 1.43, "step": 5 }, { "epoch": 0.20833333333333334, "grad_norm": 4.950599387514031, "learning_rate": 5.25e-06, "loss": 0.8481, "step": 10 }, { "epoch": 0.20833333333333334, "eval_loss": 0.8153812289237976, "eval_runtime": 2.0923, "eval_samples_per_second": 162.499, "eval_steps_per_second": 1.434, "step": 10 }, { "epoch": 0.3125, "grad_norm": 4.621063619275185, "learning_rate": 9e-06, "loss": 0.7794, "step": 15 }, { "epoch": 0.3125, "eval_loss": 0.8123041987419128, "eval_runtime": 2.1028, "eval_samples_per_second": 161.686, "eval_steps_per_second": 1.427, "step": 15 }, { "epoch": 0.4166666666666667, "grad_norm": 4.141809373286457, "learning_rate": 1.275e-05, "loss": 0.7798, "step": 20 }, { "epoch": 0.4166666666666667, "eval_loss": 0.8410752415657043, "eval_runtime": 2.0891, "eval_samples_per_second": 162.747, "eval_steps_per_second": 1.436, "step": 20 }, { "epoch": 0.5208333333333334, "grad_norm": 4.211750921552142, "learning_rate": 1.3195176200175283e-05, "loss": 0.8576, "step": 25 }, { "epoch": 0.5208333333333334, "eval_loss": 0.8676239848136902, "eval_runtime": 2.0885, "eval_samples_per_second": 162.793, "eval_steps_per_second": 1.436, "step": 25 }, { "epoch": 0.625, "grad_norm": 4.126229536438554, "learning_rate": 9.515676612044427e-06, "loss": 0.8852, "step": 30 }, { "epoch": 0.625, "eval_loss": 0.867268979549408, "eval_runtime": 2.0839, "eval_samples_per_second": 163.157, "eval_steps_per_second": 1.44, "step": 30 }, { "epoch": 0.7291666666666666, "grad_norm": 4.316589185885892, "learning_rate": 6.797580677308734e-06, "loss": 0.8529, "step": 35 }, { "epoch": 0.7291666666666666, "eval_loss": 0.8560981154441833, "eval_runtime": 2.1307, "eval_samples_per_second": 159.573, "eval_steps_per_second": 1.408, "step": 35 }, { "epoch": 0.8333333333333334, "grad_norm": 4.0216031828158005, "learning_rate": 4.808575415542887e-06, "loss": 0.8224, "step": 40 }, { "epoch": 0.8333333333333334, "eval_loss": 0.8470456004142761, "eval_runtime": 2.0873, "eval_samples_per_second": 162.886, "eval_steps_per_second": 1.437, "step": 40 }, { "epoch": 0.9375, "grad_norm": 4.316706720311178, "learning_rate": 3.3676619069852654e-06, "loss": 0.8536, "step": 45 }, { "epoch": 0.9375, "eval_loss": 0.8378292918205261, "eval_runtime": 2.0847, "eval_samples_per_second": 163.089, "eval_steps_per_second": 1.439, "step": 45 }, { "epoch": 1.0416666666666667, "grad_norm": 3.7957934185208795, "learning_rate": 2.334947896124909e-06, "loss": 0.662, "step": 50 }, { "epoch": 1.0416666666666667, "eval_loss": 0.8293696045875549, "eval_runtime": 2.0835, "eval_samples_per_second": 163.187, "eval_steps_per_second": 1.44, "step": 50 }, { "epoch": 1.1458333333333333, "grad_norm": 3.4155908301931186, "learning_rate": 1.603233215095547e-06, "loss": 0.437, "step": 55 }, { "epoch": 1.1458333333333333, "eval_loss": 0.8531150817871094, "eval_runtime": 2.1006, "eval_samples_per_second": 161.859, "eval_steps_per_second": 1.428, "step": 55 }, { "epoch": 1.25, "grad_norm": 3.377214905899517, "learning_rate": 1.0911174606561334e-06, "loss": 0.4402, "step": 60 }, { "epoch": 1.25, "eval_loss": 0.8569180369377136, "eval_runtime": 2.0899, "eval_samples_per_second": 162.69, "eval_steps_per_second": 1.436, "step": 60 }, { "epoch": 1.3541666666666667, "grad_norm": 4.018786896199577, "learning_rate": 7.373930741131784e-07, "loss": 0.4244, "step": 65 }, { "epoch": 1.3541666666666667, "eval_loss": 0.8569238185882568, "eval_runtime": 2.0969, "eval_samples_per_second": 162.148, "eval_steps_per_second": 1.431, "step": 65 }, { "epoch": 1.4583333333333333, "grad_norm": 4.3050060673581205, "learning_rate": 5.374210410959207e-07, "loss": 0.4495, "step": 70 }, { "epoch": 1.4583333333333333, "eval_loss": 0.8547163605690002, "eval_runtime": 2.0852, "eval_samples_per_second": 163.056, "eval_steps_per_second": 1.439, "step": 70 }, { "epoch": 1.5625, "grad_norm": 3.8753963390823842, "learning_rate": 3.6222476698215175e-07, "loss": 0.4689, "step": 75 }, { "epoch": 1.5625, "eval_loss": 0.8493571877479553, "eval_runtime": 2.1006, "eval_samples_per_second": 161.855, "eval_steps_per_second": 1.428, "step": 75 }, { "epoch": 1.6666666666666665, "grad_norm": 3.2777220151938935, "learning_rate": 2.462755297384099e-07, "loss": 0.4309, "step": 80 }, { "epoch": 1.6666666666666665, "eval_loss": 0.846055269241333, "eval_runtime": 2.0775, "eval_samples_per_second": 163.657, "eval_steps_per_second": 1.444, "step": 80 }, { "epoch": 1.7708333333333335, "grad_norm": 3.25027538013195, "learning_rate": 1.7088740175034947e-07, "loss": 0.4299, "step": 85 }, { "epoch": 1.7708333333333335, "eval_loss": 0.8445951342582703, "eval_runtime": 2.0859, "eval_samples_per_second": 163.002, "eval_steps_per_second": 1.438, "step": 85 }, { "epoch": 1.875, "grad_norm": 3.841600887262257, "learning_rate": 1.228102956599465e-07, "loss": 0.4461, "step": 90 }, { "epoch": 1.875, "eval_loss": 0.8440027832984924, "eval_runtime": 2.099, "eval_samples_per_second": 161.984, "eval_steps_per_second": 1.429, "step": 90 }, { "epoch": 1.9791666666666665, "grad_norm": 4.633157495322692, "learning_rate": 9.279207916081227e-08, "loss": 0.4474, "step": 95 }, { "epoch": 1.9791666666666665, "eval_loss": 0.8438854217529297, "eval_runtime": 2.094, "eval_samples_per_second": 162.368, "eval_steps_per_second": 1.433, "step": 95 }, { "epoch": 2.0833333333333335, "grad_norm": 3.3543713588136885, "learning_rate": 7.448002404850094e-08, "loss": 0.3614, "step": 100 }, { "epoch": 2.0833333333333335, "eval_loss": 0.8445320725440979, "eval_runtime": 2.0778, "eval_samples_per_second": 163.634, "eval_steps_per_second": 1.444, "step": 100 }, { "epoch": 2.1875, "grad_norm": 3.5776096289343053, "learning_rate": 6.35920070839697e-08, "loss": 0.3861, "step": 105 }, { "epoch": 2.1875, "eval_loss": 0.8457441926002502, "eval_runtime": 2.1055, "eval_samples_per_second": 161.484, "eval_steps_per_second": 1.425, "step": 105 }, { "epoch": 2.2916666666666665, "grad_norm": 3.811456756438563, "learning_rate": 5.7299804687499997e-08, "loss": 0.3829, "step": 110 }, { "epoch": 2.2916666666666665, "eval_loss": 0.847288191318512, "eval_runtime": 2.083, "eval_samples_per_second": 163.223, "eval_steps_per_second": 1.44, "step": 110 }, { "epoch": 2.3958333333333335, "grad_norm": 3.1978758437608823, "learning_rate": 5.37771434967624e-08, "loss": 0.3764, "step": 115 }, { "epoch": 2.3958333333333335, "eval_loss": 0.8487641215324402, "eval_runtime": 2.1168, "eval_samples_per_second": 160.617, "eval_steps_per_second": 1.417, "step": 115 }, { "epoch": 2.5, "grad_norm": 3.472352228062058, "learning_rate": 5.187403540619925e-08, "loss": 0.3655, "step": 120 }, { "epoch": 2.5, "eval_loss": 0.8499611020088196, "eval_runtime": 2.0908, "eval_samples_per_second": 162.615, "eval_steps_per_second": 1.435, "step": 120 }, { "epoch": 2.6041666666666665, "grad_norm": 3.2298459394815793, "learning_rate": 5.088648238966908e-08, "loss": 0.4243, "step": 125 }, { "epoch": 2.6041666666666665, "eval_loss": 0.8510637879371643, "eval_runtime": 2.0941, "eval_samples_per_second": 162.36, "eval_steps_per_second": 1.433, "step": 125 }, { "epoch": 2.7083333333333335, "grad_norm": 3.7544587648641756, "learning_rate": 5.039701925276604e-08, "loss": 0.3884, "step": 130 }, { "epoch": 2.7083333333333335, "eval_loss": 0.8520172238349915, "eval_runtime": 2.1032, "eval_samples_per_second": 161.66, "eval_steps_per_second": 1.426, "step": 130 }, { "epoch": 2.8125, "grad_norm": 3.5032769257867695, "learning_rate": 5.0166900048082497e-08, "loss": 0.3634, "step": 135 }, { "epoch": 2.8125, "eval_loss": 0.8528143763542175, "eval_runtime": 2.0786, "eval_samples_per_second": 163.568, "eval_steps_per_second": 1.443, "step": 135 }, { "epoch": 2.9166666666666665, "grad_norm": 3.023294292675947, "learning_rate": 5.0065147322870076e-08, "loss": 0.3846, "step": 140 }, { "epoch": 2.9166666666666665, "eval_loss": 0.8537066578865051, "eval_runtime": 2.0903, "eval_samples_per_second": 162.659, "eval_steps_per_second": 1.435, "step": 140 }, { "epoch": 3.0208333333333335, "grad_norm": 3.1767015238154075, "learning_rate": 5.002328628528332e-08, "loss": 0.3872, "step": 145 }, { "epoch": 3.0208333333333335, "eval_loss": 0.8547406196594238, "eval_runtime": 2.0891, "eval_samples_per_second": 162.748, "eval_steps_per_second": 1.436, "step": 145 }, { "epoch": 3.125, "grad_norm": 3.1942747338221045, "learning_rate": 5.0007484528133236e-08, "loss": 0.3869, "step": 150 }, { "epoch": 3.125, "eval_loss": 0.8557960391044617, "eval_runtime": 2.0819, "eval_samples_per_second": 163.312, "eval_steps_per_second": 1.441, "step": 150 }, { "epoch": 3.2291666666666665, "grad_norm": 3.815918812229993, "learning_rate": 5.0002110817570477e-08, "loss": 0.3876, "step": 155 }, { "epoch": 3.2291666666666665, "eval_loss": 0.8566272854804993, "eval_runtime": 2.0781, "eval_samples_per_second": 163.61, "eval_steps_per_second": 1.444, "step": 155 }, { "epoch": 3.3333333333333335, "grad_norm": 3.4577646975309366, "learning_rate": 5.0000504842356326e-08, "loss": 0.3844, "step": 160 }, { "epoch": 3.3333333333333335, "eval_loss": 0.8572790026664734, "eval_runtime": 2.0811, "eval_samples_per_second": 163.373, "eval_steps_per_second": 1.442, "step": 160 }, { "epoch": 3.4375, "grad_norm": 3.274685205370877, "learning_rate": 5.000009745562451e-08, "loss": 0.3535, "step": 165 }, { "epoch": 3.4375, "eval_loss": 0.8578632473945618, "eval_runtime": 2.0918, "eval_samples_per_second": 162.539, "eval_steps_per_second": 1.434, "step": 165 }, { "epoch": 3.5416666666666665, "grad_norm": 3.246459205886974, "learning_rate": 5.0000014077810156e-08, "loss": 0.3488, "step": 170 }, { "epoch": 3.5416666666666665, "eval_loss": 0.85884028673172, "eval_runtime": 2.1178, "eval_samples_per_second": 160.545, "eval_steps_per_second": 1.417, "step": 170 }, { "epoch": 3.6458333333333335, "grad_norm": 3.3944513203963504, "learning_rate": 5.0000001343508807e-08, "loss": 0.3464, "step": 175 }, { "epoch": 3.6458333333333335, "eval_loss": 0.8598365783691406, "eval_runtime": 2.0828, "eval_samples_per_second": 163.238, "eval_steps_per_second": 1.44, "step": 175 }, { "epoch": 3.75, "grad_norm": 3.258773113208273, "learning_rate": 5.000000006747581e-08, "loss": 0.361, "step": 180 }, { "epoch": 3.75, "eval_loss": 0.8606703877449036, "eval_runtime": 2.1172, "eval_samples_per_second": 160.588, "eval_steps_per_second": 1.417, "step": 180 }, { "epoch": 3.8541666666666665, "grad_norm": 3.586703083699586, "learning_rate": 5.0000000001094325e-08, "loss": 0.3674, "step": 185 }, { "epoch": 3.8541666666666665, "eval_loss": 0.8611735701560974, "eval_runtime": 2.0956, "eval_samples_per_second": 162.243, "eval_steps_per_second": 1.432, "step": 185 }, { "epoch": 3.9583333333333335, "grad_norm": 3.5661429802112616, "learning_rate": 5.000000000000139e-08, "loss": 0.3988, "step": 190 }, { "epoch": 3.9583333333333335, "eval_loss": 0.8612277507781982, "eval_runtime": 2.0853, "eval_samples_per_second": 163.045, "eval_steps_per_second": 1.439, "step": 190 }, { "epoch": 4.0, "step": 192, "total_flos": 5363820134400.0, "train_loss": 0.5090278356025616, "train_runtime": 6014.9673, "train_samples_per_second": 2.031, "train_steps_per_second": 0.032 } ], "logging_steps": 5, "max_steps": 192, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 5, "total_flos": 5363820134400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }