{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.984,
  "eval_steps": 1,
  "global_step": 124,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
{ | |
"epoch": 0.016, | |
"grad_norm": 58.75, | |
"learning_rate": 2.5e-05, | |
"loss": 1.6327, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.016, | |
"eval_accuracy": 0.344, | |
"eval_loss": 1.8050549030303955, | |
"eval_runtime": 8.5979, | |
"eval_samples_per_second": 29.077, | |
"eval_steps_per_second": 3.722, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.032, | |
"grad_norm": 173.0, | |
"learning_rate": 5e-05, | |
"loss": 1.2182, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.032, | |
"eval_accuracy": 0.368, | |
"eval_loss": 1.5831865072250366, | |
"eval_runtime": 8.6593, | |
"eval_samples_per_second": 28.871, | |
"eval_steps_per_second": 3.695, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.048, | |
"grad_norm": 78.5, | |
"learning_rate": 4.959016393442623e-05, | |
"loss": 1.0166, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.048, | |
"eval_accuracy": 0.48, | |
"eval_loss": 1.2497016191482544, | |
"eval_runtime": 8.6547, | |
"eval_samples_per_second": 28.886, | |
"eval_steps_per_second": 3.697, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.064, | |
"grad_norm": 205.0, | |
"learning_rate": 4.918032786885246e-05, | |
"loss": 1.1151, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.064, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.9809591174125671, | |
"eval_runtime": 8.6606, | |
"eval_samples_per_second": 28.866, | |
"eval_steps_per_second": 3.695, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.08, | |
"grad_norm": 92.0, | |
"learning_rate": 4.8770491803278687e-05, | |
"loss": 1.1203, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.08, | |
"eval_accuracy": 0.616, | |
"eval_loss": 0.9002195000648499, | |
"eval_runtime": 8.6562, | |
"eval_samples_per_second": 28.881, | |
"eval_steps_per_second": 3.697, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.096, | |
"grad_norm": 39.5, | |
"learning_rate": 4.836065573770492e-05, | |
"loss": 0.3129, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.096, | |
"eval_accuracy": 0.692, | |
"eval_loss": 0.8504685759544373, | |
"eval_runtime": 8.6632, | |
"eval_samples_per_second": 28.858, | |
"eval_steps_per_second": 3.694, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.112, | |
"grad_norm": 93.0, | |
"learning_rate": 4.795081967213115e-05, | |
"loss": 0.989, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.112, | |
"eval_accuracy": 0.72, | |
"eval_loss": 0.8811690807342529, | |
"eval_runtime": 8.6664, | |
"eval_samples_per_second": 28.847, | |
"eval_steps_per_second": 3.692, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.128, | |
"grad_norm": 69.5, | |
"learning_rate": 4.754098360655738e-05, | |
"loss": 0.6991, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.128, | |
"eval_accuracy": 0.68, | |
"eval_loss": 1.079397439956665, | |
"eval_runtime": 8.6622, | |
"eval_samples_per_second": 28.861, | |
"eval_steps_per_second": 3.694, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.144, | |
"grad_norm": 161.0, | |
"learning_rate": 4.713114754098361e-05, | |
"loss": 1.2626, | |
"step": 9 | |
}, | |
{ | |
"epoch": 0.144, | |
"eval_accuracy": 0.688, | |
"eval_loss": 1.0678237676620483, | |
"eval_runtime": 8.6644, | |
"eval_samples_per_second": 28.854, | |
"eval_steps_per_second": 3.693, | |
"step": 9 | |
}, | |
{ | |
"epoch": 0.16, | |
"grad_norm": 155.0, | |
"learning_rate": 4.672131147540984e-05, | |
"loss": 0.7883, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.16, | |
"eval_accuracy": 0.696, | |
"eval_loss": 0.88979172706604, | |
"eval_runtime": 8.6685, | |
"eval_samples_per_second": 28.84, | |
"eval_steps_per_second": 3.692, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.176, | |
"grad_norm": 71.5, | |
"learning_rate": 4.631147540983607e-05, | |
"loss": 0.2973, | |
"step": 11 | |
}, | |
{ | |
"epoch": 0.176, | |
"eval_accuracy": 0.768, | |
"eval_loss": 0.7034730315208435, | |
"eval_runtime": 8.6628, | |
"eval_samples_per_second": 28.859, | |
"eval_steps_per_second": 3.694, | |
"step": 11 | |
}, | |
{ | |
"epoch": 0.192, | |
"grad_norm": 32.5, | |
"learning_rate": 4.59016393442623e-05, | |
"loss": 0.3976, | |
"step": 12 | |
}, | |
{ | |
"epoch": 0.192, | |
"eval_accuracy": 0.772, | |
"eval_loss": 0.64277583360672, | |
"eval_runtime": 8.6669, | |
"eval_samples_per_second": 28.845, | |
"eval_steps_per_second": 3.692, | |
"step": 12 | |
}, | |
{ | |
"epoch": 0.208, | |
"grad_norm": 85.0, | |
"learning_rate": 4.549180327868853e-05, | |
"loss": 0.8966, | |
"step": 13 | |
}, | |
{ | |
"epoch": 0.208, | |
"eval_accuracy": 0.776, | |
"eval_loss": 0.5894673466682434, | |
"eval_runtime": 8.6718, | |
"eval_samples_per_second": 28.829, | |
"eval_steps_per_second": 3.69, | |
"step": 13 | |
}, | |
{ | |
"epoch": 0.224, | |
"grad_norm": 93.0, | |
"learning_rate": 4.508196721311476e-05, | |
"loss": 0.3748, | |
"step": 14 | |
}, | |
{ | |
"epoch": 0.224, | |
"eval_accuracy": 0.748, | |
"eval_loss": 0.6436864137649536, | |
"eval_runtime": 8.6619, | |
"eval_samples_per_second": 28.862, | |
"eval_steps_per_second": 3.694, | |
"step": 14 | |
}, | |
{ | |
"epoch": 0.24, | |
"grad_norm": 102.0, | |
"learning_rate": 4.467213114754098e-05, | |
"loss": 0.6883, | |
"step": 15 | |
}, | |
{ | |
"epoch": 0.24, | |
"eval_accuracy": 0.74, | |
"eval_loss": 0.6454311609268188, | |
"eval_runtime": 8.6684, | |
"eval_samples_per_second": 28.841, | |
"eval_steps_per_second": 3.692, | |
"step": 15 | |
}, | |
{ | |
"epoch": 0.256, | |
"grad_norm": 40.5, | |
"learning_rate": 4.426229508196721e-05, | |
"loss": 0.3292, | |
"step": 16 | |
}, | |
{ | |
"epoch": 0.256, | |
"eval_accuracy": 0.708, | |
"eval_loss": 0.8357064127922058, | |
"eval_runtime": 8.6666, | |
"eval_samples_per_second": 28.846, | |
"eval_steps_per_second": 3.692, | |
"step": 16 | |
}, | |
{ | |
"epoch": 0.272, | |
"grad_norm": 138.0, | |
"learning_rate": 4.3852459016393444e-05, | |
"loss": 1.0341, | |
"step": 17 | |
}, | |
{ | |
"epoch": 0.272, | |
"eval_accuracy": 0.692, | |
"eval_loss": 0.920940101146698, | |
"eval_runtime": 8.6644, | |
"eval_samples_per_second": 28.854, | |
"eval_steps_per_second": 3.693, | |
"step": 17 | |
}, | |
{ | |
"epoch": 0.288, | |
"grad_norm": 97.5, | |
"learning_rate": 4.3442622950819674e-05, | |
"loss": 0.8867, | |
"step": 18 | |
}, | |
{ | |
"epoch": 0.288, | |
"eval_accuracy": 0.708, | |
"eval_loss": 0.8621469736099243, | |
"eval_runtime": 8.6638, | |
"eval_samples_per_second": 28.856, | |
"eval_steps_per_second": 3.694, | |
"step": 18 | |
}, | |
{ | |
"epoch": 0.304, | |
"grad_norm": 176.0, | |
"learning_rate": 4.3032786885245904e-05, | |
"loss": 1.2041, | |
"step": 19 | |
}, | |
{ | |
"epoch": 0.304, | |
"eval_accuracy": 0.744, | |
"eval_loss": 0.67635178565979, | |
"eval_runtime": 8.6671, | |
"eval_samples_per_second": 28.845, | |
"eval_steps_per_second": 3.692, | |
"step": 19 | |
}, | |
{ | |
"epoch": 0.32, | |
"grad_norm": 150.0, | |
"learning_rate": 4.262295081967213e-05, | |
"loss": 0.9002, | |
"step": 20 | |
}, | |
{ | |
"epoch": 0.32, | |
"eval_accuracy": 0.732, | |
"eval_loss": 0.5985668301582336, | |
"eval_runtime": 8.6631, | |
"eval_samples_per_second": 28.858, | |
"eval_steps_per_second": 3.694, | |
"step": 20 | |
}, | |
{ | |
"epoch": 0.336, | |
"grad_norm": 53.0, | |
"learning_rate": 4.2213114754098365e-05, | |
"loss": 0.8948, | |
"step": 21 | |
}, | |
{ | |
"epoch": 0.336, | |
"eval_accuracy": 0.716, | |
"eval_loss": 0.652230978012085, | |
"eval_runtime": 8.6655, | |
"eval_samples_per_second": 28.85, | |
"eval_steps_per_second": 3.693, | |
"step": 21 | |
}, | |
{ | |
"epoch": 0.352, | |
"grad_norm": 174.0, | |
"learning_rate": 4.1803278688524595e-05, | |
"loss": 0.86, | |
"step": 22 | |
}, | |
{ | |
"epoch": 0.352, | |
"eval_accuracy": 0.728, | |
"eval_loss": 0.6597179174423218, | |
"eval_runtime": 8.6672, | |
"eval_samples_per_second": 28.844, | |
"eval_steps_per_second": 3.692, | |
"step": 22 | |
}, | |
{ | |
"epoch": 0.368, | |
"grad_norm": 156.0, | |
"learning_rate": 4.1393442622950826e-05, | |
"loss": 0.6364, | |
"step": 23 | |
}, | |
{ | |
"epoch": 0.368, | |
"eval_accuracy": 0.744, | |
"eval_loss": 0.5796850919723511, | |
"eval_runtime": 8.664, | |
"eval_samples_per_second": 28.855, | |
"eval_steps_per_second": 3.693, | |
"step": 23 | |
}, | |
{ | |
"epoch": 0.384, | |
"grad_norm": 53.25, | |
"learning_rate": 4.098360655737705e-05, | |
"loss": 0.2094, | |
"step": 24 | |
}, | |
{ | |
"epoch": 0.384, | |
"eval_accuracy": 0.748, | |
"eval_loss": 0.5883631706237793, | |
"eval_runtime": 8.6686, | |
"eval_samples_per_second": 28.84, | |
"eval_steps_per_second": 3.692, | |
"step": 24 | |
}, | |
{ | |
"epoch": 0.4, | |
"grad_norm": 87.5, | |
"learning_rate": 4.057377049180328e-05, | |
"loss": 0.4607, | |
"step": 25 | |
}, | |
{ | |
"epoch": 0.4, | |
"eval_accuracy": 0.768, | |
"eval_loss": 0.5390456318855286, | |
"eval_runtime": 8.6866, | |
"eval_samples_per_second": 28.78, | |
"eval_steps_per_second": 3.684, | |
"step": 25 | |
}, | |
{ | |
"epoch": 0.416, | |
"grad_norm": 155.0, | |
"learning_rate": 4.016393442622951e-05, | |
"loss": 0.814, | |
"step": 26 | |
}, | |
{ | |
"epoch": 0.416, | |
"eval_accuracy": 0.78, | |
"eval_loss": 0.4743637144565582, | |
"eval_runtime": 8.6531, | |
"eval_samples_per_second": 28.892, | |
"eval_steps_per_second": 3.698, | |
"step": 26 | |
}, | |
{ | |
"epoch": 0.432, | |
"grad_norm": 41.0, | |
"learning_rate": 3.975409836065574e-05, | |
"loss": 0.5358, | |
"step": 27 | |
}, | |
{ | |
"epoch": 0.432, | |
"eval_accuracy": 0.776, | |
"eval_loss": 0.4668542146682739, | |
"eval_runtime": 8.6595, | |
"eval_samples_per_second": 28.87, | |
"eval_steps_per_second": 3.695, | |
"step": 27 | |
}, | |
{ | |
"epoch": 0.448, | |
"grad_norm": 131.0, | |
"learning_rate": 3.934426229508197e-05, | |
"loss": 0.5556, | |
"step": 28 | |
}, | |
{ | |
"epoch": 0.448, | |
"eval_accuracy": 0.736, | |
"eval_loss": 0.6067003011703491, | |
"eval_runtime": 8.6518, | |
"eval_samples_per_second": 28.896, | |
"eval_steps_per_second": 3.699, | |
"step": 28 | |
}, | |
{ | |
"epoch": 0.464, | |
"grad_norm": 126.5, | |
"learning_rate": 3.89344262295082e-05, | |
"loss": 0.505, | |
"step": 29 | |
}, | |
{ | |
"epoch": 0.464, | |
"eval_accuracy": 0.712, | |
"eval_loss": 0.7375366687774658, | |
"eval_runtime": 8.6519, | |
"eval_samples_per_second": 28.895, | |
"eval_steps_per_second": 3.699, | |
"step": 29 | |
}, | |
{ | |
"epoch": 0.48, | |
"grad_norm": 171.0, | |
"learning_rate": 3.8524590163934424e-05, | |
"loss": 0.9589, | |
"step": 30 | |
}, | |
{ | |
"epoch": 0.48, | |
"eval_accuracy": 0.704, | |
"eval_loss": 0.7679601311683655, | |
"eval_runtime": 8.6582, | |
"eval_samples_per_second": 28.874, | |
"eval_steps_per_second": 3.696, | |
"step": 30 | |
}, | |
{ | |
"epoch": 0.496, | |
"grad_norm": 150.0, | |
"learning_rate": 3.8114754098360655e-05, | |
"loss": 0.74, | |
"step": 31 | |
}, | |
{ | |
"epoch": 0.496, | |
"eval_accuracy": 0.732, | |
"eval_loss": 0.6937733888626099, | |
"eval_runtime": 8.6569, | |
"eval_samples_per_second": 28.879, | |
"eval_steps_per_second": 3.696, | |
"step": 31 | |
}, | |
{ | |
"epoch": 0.512, | |
"grad_norm": 79.5, | |
"learning_rate": 3.7704918032786885e-05, | |
"loss": 0.5474, | |
"step": 32 | |
}, | |
{ | |
"epoch": 0.512, | |
"eval_accuracy": 0.748, | |
"eval_loss": 0.5756805539131165, | |
"eval_runtime": 8.6562, | |
"eval_samples_per_second": 28.881, | |
"eval_steps_per_second": 3.697, | |
"step": 32 | |
}, | |
{ | |
"epoch": 0.528, | |
"grad_norm": 112.5, | |
"learning_rate": 3.729508196721312e-05, | |
"loss": 0.4916, | |
"step": 33 | |
}, | |
{ | |
"epoch": 0.528, | |
"eval_accuracy": 0.792, | |
"eval_loss": 0.47289371490478516, | |
"eval_runtime": 8.6581, | |
"eval_samples_per_second": 28.875, | |
"eval_steps_per_second": 3.696, | |
"step": 33 | |
}, | |
{ | |
"epoch": 0.544, | |
"grad_norm": 33.0, | |
"learning_rate": 3.6885245901639346e-05, | |
"loss": 0.8822, | |
"step": 34 | |
}, | |
{ | |
"epoch": 0.544, | |
"eval_accuracy": 0.82, | |
"eval_loss": 0.4487142264842987, | |
"eval_runtime": 8.6584, | |
"eval_samples_per_second": 28.874, | |
"eval_steps_per_second": 3.696, | |
"step": 34 | |
}, | |
{ | |
"epoch": 0.56, | |
"grad_norm": 84.5, | |
"learning_rate": 3.6475409836065576e-05, | |
"loss": 0.7691, | |
"step": 35 | |
}, | |
{ | |
"epoch": 0.56, | |
"eval_accuracy": 0.812, | |
"eval_loss": 0.45519721508026123, | |
"eval_runtime": 8.6547, | |
"eval_samples_per_second": 28.886, | |
"eval_steps_per_second": 3.697, | |
"step": 35 | |
}, | |
{ | |
"epoch": 0.576, | |
"grad_norm": 28.625, | |
"learning_rate": 3.6065573770491806e-05, | |
"loss": 0.4743, | |
"step": 36 | |
}, | |
{ | |
"epoch": 0.576, | |
"eval_accuracy": 0.764, | |
"eval_loss": 0.5331873893737793, | |
"eval_runtime": 8.6566, | |
"eval_samples_per_second": 28.88, | |
"eval_steps_per_second": 3.697, | |
"step": 36 | |
}, | |
{ | |
"epoch": 0.592, | |
"grad_norm": 23.875, | |
"learning_rate": 3.5655737704918037e-05, | |
"loss": 0.3101, | |
"step": 37 | |
}, | |
{ | |
"epoch": 0.592, | |
"eval_accuracy": 0.744, | |
"eval_loss": 0.6849313974380493, | |
"eval_runtime": 8.6571, | |
"eval_samples_per_second": 28.878, | |
"eval_steps_per_second": 3.696, | |
"step": 37 | |
}, | |
{ | |
"epoch": 0.608, | |
"grad_norm": 103.5, | |
"learning_rate": 3.524590163934427e-05, | |
"loss": 0.962, | |
"step": 38 | |
}, | |
{ | |
"epoch": 0.608, | |
"eval_accuracy": 0.724, | |
"eval_loss": 0.7783421874046326, | |
"eval_runtime": 8.6583, | |
"eval_samples_per_second": 28.874, | |
"eval_steps_per_second": 3.696, | |
"step": 38 | |
}, | |
{ | |
"epoch": 0.624, | |
"grad_norm": 133.0, | |
"learning_rate": 3.483606557377049e-05, | |
"loss": 0.5671, | |
"step": 39 | |
}, | |
{ | |
"epoch": 0.624, | |
"eval_accuracy": 0.712, | |
"eval_loss": 0.7919518947601318, | |
"eval_runtime": 8.661, | |
"eval_samples_per_second": 28.865, | |
"eval_steps_per_second": 3.695, | |
"step": 39 | |
}, | |
{ | |
"epoch": 0.64, | |
"grad_norm": 193.0, | |
"learning_rate": 3.442622950819672e-05, | |
"loss": 0.7741, | |
"step": 40 | |
}, | |
{ | |
"epoch": 0.64, | |
"eval_accuracy": 0.724, | |
"eval_loss": 0.7195008397102356, | |
"eval_runtime": 8.6644, | |
"eval_samples_per_second": 28.854, | |
"eval_steps_per_second": 3.693, | |
"step": 40 | |
}, | |
{ | |
"epoch": 0.656, | |
"grad_norm": 236.0, | |
"learning_rate": 3.401639344262295e-05, | |
"loss": 0.9336, | |
"step": 41 | |
}, | |
{ | |
"epoch": 0.656, | |
"eval_accuracy": 0.784, | |
"eval_loss": 0.5999830365180969, | |
"eval_runtime": 8.6611, | |
"eval_samples_per_second": 28.865, | |
"eval_steps_per_second": 3.695, | |
"step": 41 | |
}, | |
{ | |
"epoch": 0.672, | |
"grad_norm": 194.0, | |
"learning_rate": 3.360655737704918e-05, | |
"loss": 0.9252, | |
"step": 42 | |
}, | |
{ | |
"epoch": 0.672, | |
"eval_accuracy": 0.812, | |
"eval_loss": 0.4787631928920746, | |
"eval_runtime": 8.6643, | |
"eval_samples_per_second": 28.854, | |
"eval_steps_per_second": 3.693, | |
"step": 42 | |
}, | |
{ | |
"epoch": 0.688, | |
"grad_norm": 102.0, | |
"learning_rate": 3.319672131147541e-05, | |
"loss": 0.2934, | |
"step": 43 | |
}, | |
{ | |
"epoch": 0.688, | |
"eval_accuracy": 0.812, | |
"eval_loss": 0.41090723872184753, | |
"eval_runtime": 8.6614, | |
"eval_samples_per_second": 28.864, | |
"eval_steps_per_second": 3.695, | |
"step": 43 | |
}, | |
{ | |
"epoch": 0.704, | |
"grad_norm": 87.5, | |
"learning_rate": 3.2786885245901635e-05, | |
"loss": 0.4936, | |
"step": 44 | |
}, | |
{ | |
"epoch": 0.704, | |
"eval_accuracy": 0.78, | |
"eval_loss": 0.46753987669944763, | |
"eval_runtime": 8.6615, | |
"eval_samples_per_second": 28.863, | |
"eval_steps_per_second": 3.694, | |
"step": 44 | |
}, | |
{ | |
"epoch": 0.72, | |
"grad_norm": 18.375, | |
"learning_rate": 3.237704918032787e-05, | |
"loss": 0.3223, | |
"step": 45 | |
}, | |
{ | |
"epoch": 0.72, | |
"eval_accuracy": 0.748, | |
"eval_loss": 0.5864301919937134, | |
"eval_runtime": 8.6597, | |
"eval_samples_per_second": 28.869, | |
"eval_steps_per_second": 3.695, | |
"step": 45 | |
}, | |
{ | |
"epoch": 0.736, | |
"grad_norm": 121.0, | |
"learning_rate": 3.19672131147541e-05, | |
"loss": 0.408, | |
"step": 46 | |
}, | |
{ | |
"epoch": 0.736, | |
"eval_accuracy": 0.728, | |
"eval_loss": 0.6596755981445312, | |
"eval_runtime": 8.663, | |
"eval_samples_per_second": 28.858, | |
"eval_steps_per_second": 3.694, | |
"step": 46 | |
}, | |
{ | |
"epoch": 0.752, | |
"grad_norm": 54.0, | |
"learning_rate": 3.155737704918033e-05, | |
"loss": 0.759, | |
"step": 47 | |
}, | |
{ | |
"epoch": 0.752, | |
"eval_accuracy": 0.728, | |
"eval_loss": 0.6460751891136169, | |
"eval_runtime": 8.6565, | |
"eval_samples_per_second": 28.88, | |
"eval_steps_per_second": 3.697, | |
"step": 47 | |
}, | |
{ | |
"epoch": 0.768, | |
"grad_norm": 114.0, | |
"learning_rate": 3.114754098360656e-05, | |
"loss": 0.6628, | |
"step": 48 | |
}, | |
{ | |
"epoch": 0.768, | |
"eval_accuracy": 0.744, | |
"eval_loss": 0.5938560962677002, | |
"eval_runtime": 8.6567, | |
"eval_samples_per_second": 28.879, | |
"eval_steps_per_second": 3.697, | |
"step": 48 | |
}, | |
{ | |
"epoch": 0.784, | |
"grad_norm": 111.0, | |
"learning_rate": 3.073770491803279e-05, | |
"loss": 0.761, | |
"step": 49 | |
}, | |
{ | |
"epoch": 0.784, | |
"eval_accuracy": 0.804, | |
"eval_loss": 0.5164662003517151, | |
"eval_runtime": 8.6557, | |
"eval_samples_per_second": 28.883, | |
"eval_steps_per_second": 3.697, | |
"step": 49 | |
}, | |
{ | |
"epoch": 0.8, | |
"grad_norm": 32.0, | |
"learning_rate": 3.0327868852459017e-05, | |
"loss": 0.308, | |
"step": 50 | |
}, | |
{ | |
"epoch": 0.8, | |
"eval_accuracy": 0.836, | |
"eval_loss": 0.43705108761787415, | |
"eval_runtime": 8.6584, | |
"eval_samples_per_second": 28.874, | |
"eval_steps_per_second": 3.696, | |
"step": 50 | |
}, | |
{ | |
"epoch": 0.816, | |
"grad_norm": 78.0, | |
"learning_rate": 2.9918032786885248e-05, | |
"loss": 0.4859, | |
"step": 51 | |
}, | |
{ | |
"epoch": 0.816, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.3826155364513397, | |
"eval_runtime": 8.6539, | |
"eval_samples_per_second": 28.889, | |
"eval_steps_per_second": 3.698, | |
"step": 51 | |
}, | |
{ | |
"epoch": 0.832, | |
"grad_norm": 24.5, | |
"learning_rate": 2.9508196721311478e-05, | |
"loss": 0.6841, | |
"step": 52 | |
}, | |
{ | |
"epoch": 0.832, | |
"eval_accuracy": 0.828, | |
"eval_loss": 0.3742530345916748, | |
"eval_runtime": 8.6541, | |
"eval_samples_per_second": 28.888, | |
"eval_steps_per_second": 3.698, | |
"step": 52 | |
}, | |
{ | |
"epoch": 0.848, | |
"grad_norm": 37.5, | |
"learning_rate": 2.9098360655737705e-05, | |
"loss": 0.7852, | |
"step": 53 | |
}, | |
{ | |
"epoch": 0.848, | |
"eval_accuracy": 0.8, | |
"eval_loss": 0.43144190311431885, | |
"eval_runtime": 8.653, | |
"eval_samples_per_second": 28.892, | |
"eval_steps_per_second": 3.698, | |
"step": 53 | |
}, | |
{ | |
"epoch": 0.864, | |
"grad_norm": 91.0, | |
"learning_rate": 2.8688524590163935e-05, | |
"loss": 0.3388, | |
"step": 54 | |
}, | |
{ | |
"epoch": 0.864, | |
"eval_accuracy": 0.792, | |
"eval_loss": 0.501422107219696, | |
"eval_runtime": 8.6518, | |
"eval_samples_per_second": 28.896, | |
"eval_steps_per_second": 3.699, | |
"step": 54 | |
}, | |
{ | |
"epoch": 0.88, | |
"grad_norm": 17.625, | |
"learning_rate": 2.8278688524590162e-05, | |
"loss": 0.3829, | |
"step": 55 | |
}, | |
{ | |
"epoch": 0.88, | |
"eval_accuracy": 0.768, | |
"eval_loss": 0.5729050040245056, | |
"eval_runtime": 8.6468, | |
"eval_samples_per_second": 28.912, | |
"eval_steps_per_second": 3.701, | |
"step": 55 | |
}, | |
{ | |
"epoch": 0.896, | |
"grad_norm": 93.5, | |
"learning_rate": 2.7868852459016392e-05, | |
"loss": 0.6144, | |
"step": 56 | |
}, | |
{ | |
"epoch": 0.896, | |
"eval_accuracy": 0.764, | |
"eval_loss": 0.6807990074157715, | |
"eval_runtime": 8.6452, | |
"eval_samples_per_second": 28.918, | |
"eval_steps_per_second": 3.701, | |
"step": 56 | |
}, | |
{ | |
"epoch": 0.912, | |
"grad_norm": 28.5, | |
"learning_rate": 2.7459016393442626e-05, | |
"loss": 0.3515, | |
"step": 57 | |
}, | |
{ | |
"epoch": 0.912, | |
"eval_accuracy": 0.756, | |
"eval_loss": 0.7396586537361145, | |
"eval_runtime": 8.6535, | |
"eval_samples_per_second": 28.89, | |
"eval_steps_per_second": 3.698, | |
"step": 57 | |
}, | |
{ | |
"epoch": 0.928, | |
"grad_norm": 112.5, | |
"learning_rate": 2.7049180327868856e-05, | |
"loss": 0.3028, | |
"step": 58 | |
}, | |
{ | |
"epoch": 0.928, | |
"eval_accuracy": 0.756, | |
"eval_loss": 0.745948314666748, | |
"eval_runtime": 8.6584, | |
"eval_samples_per_second": 28.874, | |
"eval_steps_per_second": 3.696, | |
"step": 58 | |
}, | |
{ | |
"epoch": 0.944, | |
"grad_norm": 164.0, | |
"learning_rate": 2.6639344262295087e-05, | |
"loss": 0.6729, | |
"step": 59 | |
}, | |
{ | |
"epoch": 0.944, | |
"eval_accuracy": 0.752, | |
"eval_loss": 0.7118371725082397, | |
"eval_runtime": 8.6567, | |
"eval_samples_per_second": 28.879, | |
"eval_steps_per_second": 3.697, | |
"step": 59 | |
}, | |
{ | |
"epoch": 0.96, | |
"grad_norm": 131.0, | |
"learning_rate": 2.6229508196721314e-05, | |
"loss": 0.4634, | |
"step": 60 | |
}, | |
{ | |
"epoch": 0.96, | |
"eval_accuracy": 0.76, | |
"eval_loss": 0.6441870331764221, | |
"eval_runtime": 8.6557, | |
"eval_samples_per_second": 28.883, | |
"eval_steps_per_second": 3.697, | |
"step": 60 | |
}, | |
{ | |
"epoch": 0.976, | |
"grad_norm": 127.5, | |
"learning_rate": 2.5819672131147544e-05, | |
"loss": 0.5924, | |
"step": 61 | |
}, | |
{ | |
"epoch": 0.976, | |
"eval_accuracy": 0.776, | |
"eval_loss": 0.5635260939598083, | |
"eval_runtime": 8.6577, | |
"eval_samples_per_second": 28.876, | |
"eval_steps_per_second": 3.696, | |
"step": 61 | |
}, | |
{ | |
"epoch": 0.992, | |
"grad_norm": 130.0, | |
"learning_rate": 2.540983606557377e-05, | |
"loss": 0.5527, | |
"step": 62 | |
}, | |
{ | |
"epoch": 0.992, | |
"eval_accuracy": 0.796, | |
"eval_loss": 0.4781284034252167, | |
"eval_runtime": 8.6636, | |
"eval_samples_per_second": 28.856, | |
"eval_steps_per_second": 3.694, | |
"step": 62 | |
}, | |
{ | |
"epoch": 1.008, | |
"grad_norm": 43.0, | |
"learning_rate": 2.5e-05, | |
"loss": 0.1542, | |
"step": 63 | |
}, | |
{ | |
"epoch": 1.008, | |
"eval_accuracy": 0.82, | |
"eval_loss": 0.4085061252117157, | |
"eval_runtime": 8.6612, | |
"eval_samples_per_second": 28.864, | |
"eval_steps_per_second": 3.695, | |
"step": 63 | |
}, | |
{ | |
"epoch": 1.024, | |
"grad_norm": 147.0, | |
"learning_rate": 2.459016393442623e-05, | |
"loss": 0.3714, | |
"step": 64 | |
}, | |
{ | |
"epoch": 1.024, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.37276288866996765, | |
"eval_runtime": 8.6673, | |
"eval_samples_per_second": 28.844, | |
"eval_steps_per_second": 3.692, | |
"step": 64 | |
}, | |
{ | |
"epoch": 1.04, | |
"grad_norm": 34.5, | |
"learning_rate": 2.418032786885246e-05, | |
"loss": 0.1124, | |
"step": 65 | |
}, | |
{ | |
"epoch": 1.04, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.36895105242729187, | |
"eval_runtime": 8.6746, | |
"eval_samples_per_second": 28.82, | |
"eval_steps_per_second": 3.689, | |
"step": 65 | |
}, | |
{ | |
"epoch": 1.056, | |
"grad_norm": 50.25, | |
"learning_rate": 2.377049180327869e-05, | |
"loss": 0.1433, | |
"step": 66 | |
}, | |
{ | |
"epoch": 1.056, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.3762807548046112, | |
"eval_runtime": 8.6794, | |
"eval_samples_per_second": 28.804, | |
"eval_steps_per_second": 3.687, | |
"step": 66 | |
}, | |
{ | |
"epoch": 1.072, | |
"grad_norm": 85.5, | |
"learning_rate": 2.336065573770492e-05, | |
"loss": 0.2446, | |
"step": 67 | |
}, | |
{ | |
"epoch": 1.072, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.38033661246299744, | |
"eval_runtime": 8.6709, | |
"eval_samples_per_second": 28.832, | |
"eval_steps_per_second": 3.691, | |
"step": 67 | |
}, | |
{ | |
"epoch": 1.088, | |
"grad_norm": 120.5, | |
"learning_rate": 2.295081967213115e-05, | |
"loss": 0.6573, | |
"step": 68 | |
}, | |
{ | |
"epoch": 1.088, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.37577661871910095, | |
"eval_runtime": 8.6746, | |
"eval_samples_per_second": 28.82, | |
"eval_steps_per_second": 3.689, | |
"step": 68 | |
}, | |
{ | |
"epoch": 1.104, | |
"grad_norm": 32.25, | |
"learning_rate": 2.254098360655738e-05, | |
"loss": 0.1509, | |
"step": 69 | |
}, | |
{ | |
"epoch": 1.104, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.36732277274131775, | |
"eval_runtime": 8.6668, | |
"eval_samples_per_second": 28.846, | |
"eval_steps_per_second": 3.692, | |
"step": 69 | |
}, | |
{ | |
"epoch": 1.12, | |
"grad_norm": 36.0, | |
"learning_rate": 2.2131147540983607e-05, | |
"loss": 0.2131, | |
"step": 70 | |
}, | |
{ | |
"epoch": 1.12, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.36693572998046875, | |
"eval_runtime": 8.667, | |
"eval_samples_per_second": 28.845, | |
"eval_steps_per_second": 3.692, | |
"step": 70 | |
}, | |
{ | |
"epoch": 1.1360000000000001, | |
"grad_norm": 35.0, | |
"learning_rate": 2.1721311475409837e-05, | |
"loss": 0.077, | |
"step": 71 | |
}, | |
{ | |
"epoch": 1.1360000000000001, | |
"eval_accuracy": 0.836, | |
"eval_loss": 0.3619978427886963, | |
"eval_runtime": 8.671, | |
"eval_samples_per_second": 28.832, | |
"eval_steps_per_second": 3.69, | |
"step": 71 | |
}, | |
{ | |
"epoch": 1.152, | |
"grad_norm": 21.625, | |
"learning_rate": 2.1311475409836064e-05, | |
"loss": 0.2332, | |
"step": 72 | |
}, | |
{ | |
"epoch": 1.152, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.36414313316345215, | |
"eval_runtime": 8.6706, | |
"eval_samples_per_second": 28.833, | |
"eval_steps_per_second": 3.691, | |
"step": 72 | |
}, | |
{ | |
"epoch": 1.168, | |
"grad_norm": 69.5, | |
"learning_rate": 2.0901639344262298e-05, | |
"loss": 0.2056, | |
"step": 73 | |
}, | |
{ | |
"epoch": 1.168, | |
"eval_accuracy": 0.836, | |
"eval_loss": 0.36293938755989075, | |
"eval_runtime": 8.6724, | |
"eval_samples_per_second": 28.827, | |
"eval_steps_per_second": 3.69, | |
"step": 73 | |
}, | |
{ | |
"epoch": 1.184, | |
"grad_norm": 9.5, | |
"learning_rate": 2.0491803278688525e-05, | |
"loss": 0.1412, | |
"step": 74 | |
}, | |
{ | |
"epoch": 1.184, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.3655231297016144, | |
"eval_runtime": 8.6711, | |
"eval_samples_per_second": 28.831, | |
"eval_steps_per_second": 3.69, | |
"step": 74 | |
}, | |
{ | |
"epoch": 1.2, | |
"grad_norm": 35.25, | |
"learning_rate": 2.0081967213114755e-05, | |
"loss": 0.1982, | |
"step": 75 | |
}, | |
{ | |
"epoch": 1.2, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.3644102215766907, | |
"eval_runtime": 8.6641, | |
"eval_samples_per_second": 28.855, | |
"eval_steps_per_second": 3.693, | |
"step": 75 | |
}, | |
{ | |
"epoch": 1.216, | |
"grad_norm": 12.875, | |
"learning_rate": 1.9672131147540985e-05, | |
"loss": 0.2003, | |
"step": 76 | |
}, | |
{ | |
"epoch": 1.216, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.3651863932609558, | |
"eval_runtime": 8.6665, | |
"eval_samples_per_second": 28.847, | |
"eval_steps_per_second": 3.692, | |
"step": 76 | |
}, | |
{ | |
"epoch": 1.232, | |
"grad_norm": 7.28125, | |
"learning_rate": 1.9262295081967212e-05, | |
"loss": 0.0934, | |
"step": 77 | |
}, | |
{ | |
"epoch": 1.232, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.3709143102169037, | |
"eval_runtime": 8.6583, | |
"eval_samples_per_second": 28.874, | |
"eval_steps_per_second": 3.696, | |
"step": 77 | |
}, | |
{ | |
"epoch": 1.248, | |
"grad_norm": 42.25, | |
"learning_rate": 1.8852459016393442e-05, | |
"loss": 0.1577, | |
"step": 78 | |
}, | |
{ | |
"epoch": 1.248, | |
"eval_accuracy": 0.836, | |
"eval_loss": 0.37103718519210815, | |
"eval_runtime": 8.6594, | |
"eval_samples_per_second": 28.87, | |
"eval_steps_per_second": 3.695, | |
"step": 78 | |
}, | |
{ | |
"epoch": 1.264, | |
"grad_norm": 25.25, | |
"learning_rate": 1.8442622950819673e-05, | |
"loss": 0.3063, | |
"step": 79 | |
}, | |
{ | |
"epoch": 1.264, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.3689051866531372, | |
"eval_runtime": 8.6658, | |
"eval_samples_per_second": 28.849, | |
"eval_steps_per_second": 3.693, | |
"step": 79 | |
}, | |
{ | |
"epoch": 1.28, | |
"grad_norm": 31.625, | |
"learning_rate": 1.8032786885245903e-05, | |
"loss": 0.2724, | |
"step": 80 | |
}, | |
{ | |
"epoch": 1.28, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.3685128688812256, | |
"eval_runtime": 8.6623, | |
"eval_samples_per_second": 28.861, | |
"eval_steps_per_second": 3.694, | |
"step": 80 | |
}, | |
{ | |
"epoch": 1.296, | |
"grad_norm": 34.75, | |
"learning_rate": 1.7622950819672133e-05, | |
"loss": 0.4324, | |
"step": 81 | |
}, | |
{ | |
"epoch": 1.296, | |
"eval_accuracy": 0.836, | |
"eval_loss": 0.3717711567878723, | |
"eval_runtime": 8.6564, | |
"eval_samples_per_second": 28.88, | |
"eval_steps_per_second": 3.697, | |
"step": 81 | |
}, | |
{ | |
"epoch": 1.312, | |
"grad_norm": 33.0, | |
"learning_rate": 1.721311475409836e-05, | |
"loss": 0.1911, | |
"step": 82 | |
}, | |
{ | |
"epoch": 1.312, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.3723936080932617, | |
"eval_runtime": 8.6687, | |
"eval_samples_per_second": 28.839, | |
"eval_steps_per_second": 3.691, | |
"step": 82 | |
}, | |
{ | |
"epoch": 1.328, | |
"grad_norm": 16.125, | |
"learning_rate": 1.680327868852459e-05, | |
"loss": 0.1936, | |
"step": 83 | |
}, | |
{ | |
"epoch": 1.328, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.3704240024089813, | |
"eval_runtime": 8.6668, | |
"eval_samples_per_second": 28.846, | |
"eval_steps_per_second": 3.692, | |
"step": 83 | |
}, | |
{ | |
"epoch": 1.3439999999999999, | |
"grad_norm": 34.75, | |
"learning_rate": 1.6393442622950818e-05, | |
"loss": 0.0839, | |
"step": 84 | |
}, | |
{ | |
"epoch": 1.3439999999999999, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.36510899662971497, | |
"eval_runtime": 8.661, | |
"eval_samples_per_second": 28.865, | |
"eval_steps_per_second": 3.695, | |
"step": 84 | |
}, | |
{ | |
"epoch": 1.3599999999999999, | |
"grad_norm": 40.0, | |
"learning_rate": 1.598360655737705e-05, | |
"loss": 0.2661, | |
"step": 85 | |
}, | |
{ | |
"epoch": 1.3599999999999999, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.3661534786224365, | |
"eval_runtime": 8.6702, | |
"eval_samples_per_second": 28.834, | |
"eval_steps_per_second": 3.691, | |
"step": 85 | |
}, | |
{ | |
"epoch": 1.376, | |
"grad_norm": 52.5, | |
"learning_rate": 1.557377049180328e-05, | |
"loss": 0.1679, | |
"step": 86 | |
}, | |
{ | |
"epoch": 1.376, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.36859577894210815, | |
"eval_runtime": 8.6649, | |
"eval_samples_per_second": 28.852, | |
"eval_steps_per_second": 3.693, | |
"step": 86 | |
}, | |
{ | |
"epoch": 1.392, | |
"grad_norm": 12.75, | |
"learning_rate": 1.5163934426229509e-05, | |
"loss": 0.0698, | |
"step": 87 | |
}, | |
{ | |
"epoch": 1.392, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.3691750466823578, | |
"eval_runtime": 8.6861, | |
"eval_samples_per_second": 28.782, | |
"eval_steps_per_second": 3.684, | |
"step": 87 | |
}, | |
{ | |
"epoch": 1.408, | |
"grad_norm": 39.25, | |
"learning_rate": 1.4754098360655739e-05, | |
"loss": 0.1173, | |
"step": 88 | |
}, | |
{ | |
"epoch": 1.408, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.3779418170452118, | |
"eval_runtime": 8.6673, | |
"eval_samples_per_second": 28.844, | |
"eval_steps_per_second": 3.692, | |
"step": 88 | |
}, | |
{ | |
"epoch": 1.424, | |
"grad_norm": 21.5, | |
"learning_rate": 1.4344262295081968e-05, | |
"loss": 0.3727, | |
"step": 89 | |
}, | |
{ | |
"epoch": 1.424, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.38709089159965515, | |
"eval_runtime": 8.6636, | |
"eval_samples_per_second": 28.856, | |
"eval_steps_per_second": 3.694, | |
"step": 89 | |
}, | |
{ | |
"epoch": 1.44, | |
"grad_norm": 18.25, | |
"learning_rate": 1.3934426229508196e-05, | |
"loss": 0.3828, | |
"step": 90 | |
}, | |
{ | |
"epoch": 1.44, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.3986479640007019, | |
"eval_runtime": 8.6565, | |
"eval_samples_per_second": 28.88, | |
"eval_steps_per_second": 3.697, | |
"step": 90 | |
}, | |
{ | |
"epoch": 1.456, | |
"grad_norm": 29.875, | |
"learning_rate": 1.3524590163934428e-05, | |
"loss": 0.0911, | |
"step": 91 | |
}, | |
{ | |
"epoch": 1.456, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.4078799784183502, | |
"eval_runtime": 8.654, | |
"eval_samples_per_second": 28.888, | |
"eval_steps_per_second": 3.698, | |
"step": 91 | |
}, | |
{ | |
"epoch": 1.472, | |
"grad_norm": 40.75, | |
"learning_rate": 1.3114754098360657e-05, | |
"loss": 0.1798, | |
"step": 92 | |
}, | |
{ | |
"epoch": 1.472, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.4203779399394989, | |
"eval_runtime": 8.6654, | |
"eval_samples_per_second": 28.85, | |
"eval_steps_per_second": 3.693, | |
"step": 92 | |
}, | |
{ | |
"epoch": 1.488, | |
"grad_norm": 15.6875, | |
"learning_rate": 1.2704918032786885e-05, | |
"loss": 0.0851, | |
"step": 93 | |
}, | |
{ | |
"epoch": 1.488, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.4253535568714142, | |
"eval_runtime": 8.6605, | |
"eval_samples_per_second": 28.867, | |
"eval_steps_per_second": 3.695, | |
"step": 93 | |
}, | |
{ | |
"epoch": 1.504, | |
"grad_norm": 21.0, | |
"learning_rate": 1.2295081967213116e-05, | |
"loss": 0.0962, | |
"step": 94 | |
}, | |
{ | |
"epoch": 1.504, | |
"eval_accuracy": 0.832, | |
"eval_loss": 0.42336249351501465, | |
"eval_runtime": 8.6599, | |
"eval_samples_per_second": 28.869, | |
"eval_steps_per_second": 3.695, | |
"step": 94 | |
}, | |
{ | |
"epoch": 1.52, | |
"grad_norm": 111.0, | |
"learning_rate": 1.1885245901639344e-05, | |
"loss": 0.3427, | |
"step": 95 | |
}, | |
{ | |
"epoch": 1.52, | |
"eval_accuracy": 0.828, | |
"eval_loss": 0.4188750684261322, | |
"eval_runtime": 8.6648, | |
"eval_samples_per_second": 28.852, | |
"eval_steps_per_second": 3.693, | |
"step": 95 | |
}, | |
{ | |
"epoch": 1.536, | |
"grad_norm": 27.5, | |
"learning_rate": 1.1475409836065575e-05, | |
"loss": 0.0881, | |
"step": 96 | |
}, | |
{ | |
"epoch": 1.536, | |
"eval_accuracy": 0.84, | |
"eval_loss": 0.4100199043750763, | |
"eval_runtime": 8.6603, | |
"eval_samples_per_second": 28.867, | |
"eval_steps_per_second": 3.695, | |
"step": 96 | |
}, | |
{ | |
"epoch": 1.552, | |
"grad_norm": 69.0, | |
"learning_rate": 1.1065573770491803e-05, | |
"loss": 0.1664, | |
"step": 97 | |
}, | |
{ | |
"epoch": 1.552, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.39891311526298523, | |
"eval_runtime": 8.6581, | |
"eval_samples_per_second": 28.875, | |
"eval_steps_per_second": 3.696, | |
"step": 97 | |
}, | |
{ | |
"epoch": 1.568, | |
"grad_norm": 46.25, | |
"learning_rate": 1.0655737704918032e-05, | |
"loss": 0.6757, | |
"step": 98 | |
}, | |
{ | |
"epoch": 1.568, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.3860258162021637, | |
"eval_runtime": 8.66, | |
"eval_samples_per_second": 28.868, | |
"eval_steps_per_second": 3.695, | |
"step": 98 | |
}, | |
{ | |
"epoch": 1.584, | |
"grad_norm": 5.875, | |
"learning_rate": 1.0245901639344262e-05, | |
"loss": 0.0751, | |
"step": 99 | |
}, | |
{ | |
"epoch": 1.584, | |
"eval_accuracy": 0.86, | |
"eval_loss": 0.3817059397697449, | |
"eval_runtime": 8.6608, | |
"eval_samples_per_second": 28.866, | |
"eval_steps_per_second": 3.695, | |
"step": 99 | |
}, | |
{ | |
"epoch": 1.6, | |
"grad_norm": 64.0, | |
"learning_rate": 9.836065573770493e-06, | |
"loss": 0.1923, | |
"step": 100 | |
}, | |
{ | |
"epoch": 1.6, | |
"eval_accuracy": 0.856, | |
"eval_loss": 0.37669360637664795, | |
"eval_runtime": 8.6602, | |
"eval_samples_per_second": 28.868, | |
"eval_steps_per_second": 3.695, | |
"step": 100 | |
}, | |
{ | |
"epoch": 1.616, | |
"grad_norm": 11.75, | |
"learning_rate": 9.426229508196721e-06, | |
"loss": 0.0365, | |
"step": 101 | |
}, | |
{ | |
"epoch": 1.616, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.3779665231704712, | |
"eval_runtime": 8.6661, | |
"eval_samples_per_second": 28.848, | |
"eval_steps_per_second": 3.693, | |
"step": 101 | |
}, | |
{ | |
"epoch": 1.6320000000000001, | |
"grad_norm": 13.75, | |
"learning_rate": 9.016393442622952e-06, | |
"loss": 0.0895, | |
"step": 102 | |
}, | |
{ | |
"epoch": 1.6320000000000001, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.3783411383628845, | |
"eval_runtime": 8.6625, | |
"eval_samples_per_second": 28.86, | |
"eval_steps_per_second": 3.694, | |
"step": 102 | |
}, | |
{ | |
"epoch": 1.6480000000000001, | |
"grad_norm": 87.0, | |
"learning_rate": 8.60655737704918e-06, | |
"loss": 0.3337, | |
"step": 103 | |
}, | |
{ | |
"epoch": 1.6480000000000001, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.3828529417514801, | |
"eval_runtime": 8.6587, | |
"eval_samples_per_second": 28.873, | |
"eval_steps_per_second": 3.696, | |
"step": 103 | |
}, | |
{ | |
"epoch": 1.6640000000000001, | |
"grad_norm": 29.375, | |
"learning_rate": 8.196721311475409e-06, | |
"loss": 0.1525, | |
"step": 104 | |
}, | |
{ | |
"epoch": 1.6640000000000001, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.38398581743240356, | |
"eval_runtime": 8.6634, | |
"eval_samples_per_second": 28.857, | |
"eval_steps_per_second": 3.694, | |
"step": 104 | |
}, | |
{ | |
"epoch": 1.6800000000000002, | |
"grad_norm": 10.0625, | |
"learning_rate": 7.78688524590164e-06, | |
"loss": 0.0973, | |
"step": 105 | |
}, | |
{ | |
"epoch": 1.6800000000000002, | |
"eval_accuracy": 0.844, | |
"eval_loss": 0.3848567605018616, | |
"eval_runtime": 8.6582, | |
"eval_samples_per_second": 28.874, | |
"eval_steps_per_second": 3.696, | |
"step": 105 | |
}, | |
{ | |
"epoch": 1.696, | |
"grad_norm": 77.5, | |
"learning_rate": 7.3770491803278695e-06, | |
"loss": 0.2626, | |
"step": 106 | |
}, | |
{ | |
"epoch": 1.696, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.385408878326416, | |
"eval_runtime": 8.6577, | |
"eval_samples_per_second": 28.876, | |
"eval_steps_per_second": 3.696, | |
"step": 106 | |
}, | |
{ | |
"epoch": 1.712, | |
"grad_norm": 9.5, | |
"learning_rate": 6.967213114754098e-06, | |
"loss": 0.0585, | |
"step": 107 | |
}, | |
{ | |
"epoch": 1.712, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.38454535603523254, | |
"eval_runtime": 8.6544, | |
"eval_samples_per_second": 28.887, | |
"eval_steps_per_second": 3.698, | |
"step": 107 | |
}, | |
{ | |
"epoch": 1.728, | |
"grad_norm": 55.0, | |
"learning_rate": 6.557377049180328e-06, | |
"loss": 0.2257, | |
"step": 108 | |
}, | |
{ | |
"epoch": 1.728, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.38379326462745667, | |
"eval_runtime": 8.6523, | |
"eval_samples_per_second": 28.894, | |
"eval_steps_per_second": 3.698, | |
"step": 108 | |
}, | |
{ | |
"epoch": 1.744, | |
"grad_norm": 32.5, | |
"learning_rate": 6.147540983606558e-06, | |
"loss": 0.1137, | |
"step": 109 | |
}, | |
{ | |
"epoch": 1.744, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.3833220303058624, | |
"eval_runtime": 8.652, | |
"eval_samples_per_second": 28.895, | |
"eval_steps_per_second": 3.699, | |
"step": 109 | |
}, | |
{ | |
"epoch": 1.76, | |
"grad_norm": 15.4375, | |
"learning_rate": 5.737704918032787e-06, | |
"loss": 0.283, | |
"step": 110 | |
}, | |
{ | |
"epoch": 1.76, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.37939703464508057, | |
"eval_runtime": 8.6539, | |
"eval_samples_per_second": 28.889, | |
"eval_steps_per_second": 3.698, | |
"step": 110 | |
}, | |
{ | |
"epoch": 1.776, | |
"grad_norm": 27.375, | |
"learning_rate": 5.327868852459016e-06, | |
"loss": 0.1111, | |
"step": 111 | |
}, | |
{ | |
"epoch": 1.776, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.3771066665649414, | |
"eval_runtime": 8.6565, | |
"eval_samples_per_second": 28.88, | |
"eval_steps_per_second": 3.697, | |
"step": 111 | |
}, | |
{ | |
"epoch": 1.792, | |
"grad_norm": 26.125, | |
"learning_rate": 4.918032786885246e-06, | |
"loss": 0.1367, | |
"step": 112 | |
}, | |
{ | |
"epoch": 1.792, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.3757225275039673, | |
"eval_runtime": 8.6575, | |
"eval_samples_per_second": 28.877, | |
"eval_steps_per_second": 3.696, | |
"step": 112 | |
}, | |
{ | |
"epoch": 1.808, | |
"grad_norm": 24.875, | |
"learning_rate": 4.508196721311476e-06, | |
"loss": 0.0762, | |
"step": 113 | |
}, | |
{ | |
"epoch": 1.808, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.3756250739097595, | |
"eval_runtime": 8.6535, | |
"eval_samples_per_second": 28.89, | |
"eval_steps_per_second": 3.698, | |
"step": 113 | |
}, | |
{ | |
"epoch": 1.8239999999999998, | |
"grad_norm": 47.5, | |
"learning_rate": 4.098360655737704e-06, | |
"loss": 0.133, | |
"step": 114 | |
}, | |
{ | |
"epoch": 1.8239999999999998, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.37420740723609924, | |
"eval_runtime": 8.6587, | |
"eval_samples_per_second": 28.873, | |
"eval_steps_per_second": 3.696, | |
"step": 114 | |
}, | |
{ | |
"epoch": 1.8399999999999999, | |
"grad_norm": 22.625, | |
"learning_rate": 3.6885245901639347e-06, | |
"loss": 0.2904, | |
"step": 115 | |
}, | |
{ | |
"epoch": 1.8399999999999999, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.372751921415329, | |
"eval_runtime": 8.6548, | |
"eval_samples_per_second": 28.886, | |
"eval_steps_per_second": 3.697, | |
"step": 115 | |
}, | |
{ | |
"epoch": 1.8559999999999999, | |
"grad_norm": 16.75, | |
"learning_rate": 3.278688524590164e-06, | |
"loss": 0.1686, | |
"step": 116 | |
}, | |
{ | |
"epoch": 1.8559999999999999, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.3734797239303589, | |
"eval_runtime": 8.6629, | |
"eval_samples_per_second": 28.859, | |
"eval_steps_per_second": 3.694, | |
"step": 116 | |
}, | |
{ | |
"epoch": 1.8719999999999999, | |
"grad_norm": 35.25, | |
"learning_rate": 2.8688524590163937e-06, | |
"loss": 0.0737, | |
"step": 117 | |
}, | |
{ | |
"epoch": 1.8719999999999999, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.3713564872741699, | |
"eval_runtime": 8.6633, | |
"eval_samples_per_second": 28.857, | |
"eval_steps_per_second": 3.694, | |
"step": 117 | |
}, | |
{ | |
"epoch": 1.888, | |
"grad_norm": 100.0, | |
"learning_rate": 2.459016393442623e-06, | |
"loss": 0.2758, | |
"step": 118 | |
}, | |
{ | |
"epoch": 1.888, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.3682093322277069, | |
"eval_runtime": 8.6631, | |
"eval_samples_per_second": 28.858, | |
"eval_steps_per_second": 3.694, | |
"step": 118 | |
}, | |
{ | |
"epoch": 1.904, | |
"grad_norm": 22.5, | |
"learning_rate": 2.049180327868852e-06, | |
"loss": 0.0542, | |
"step": 119 | |
}, | |
{ | |
"epoch": 1.904, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.3716946542263031, | |
"eval_runtime": 8.6618, | |
"eval_samples_per_second": 28.862, | |
"eval_steps_per_second": 3.694, | |
"step": 119 | |
}, | |
{ | |
"epoch": 1.92, | |
"grad_norm": 69.0, | |
"learning_rate": 1.639344262295082e-06, | |
"loss": 0.1646, | |
"step": 120 | |
}, | |
{ | |
"epoch": 1.92, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.3682910203933716, | |
"eval_runtime": 8.6617, | |
"eval_samples_per_second": 28.863, | |
"eval_steps_per_second": 3.694, | |
"step": 120 | |
}, | |
{ | |
"epoch": 1.936, | |
"grad_norm": 31.125, | |
"learning_rate": 1.2295081967213116e-06, | |
"loss": 0.4908, | |
"step": 121 | |
}, | |
{ | |
"epoch": 1.936, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.3708224594593048, | |
"eval_runtime": 8.6585, | |
"eval_samples_per_second": 28.873, | |
"eval_steps_per_second": 3.696, | |
"step": 121 | |
}, | |
{ | |
"epoch": 1.952, | |
"grad_norm": 55.75, | |
"learning_rate": 8.19672131147541e-07, | |
"loss": 0.3249, | |
"step": 122 | |
}, | |
{ | |
"epoch": 1.952, | |
"eval_accuracy": 0.848, | |
"eval_loss": 0.36828938126564026, | |
"eval_runtime": 8.6603, | |
"eval_samples_per_second": 28.867, | |
"eval_steps_per_second": 3.695, | |
"step": 122 | |
}, | |
{ | |
"epoch": 1.968, | |
"grad_norm": 22.375, | |
"learning_rate": 4.098360655737705e-07, | |
"loss": 0.1096, | |
"step": 123 | |
}, | |
{ | |
"epoch": 1.968, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.3706204891204834, | |
"eval_runtime": 8.6631, | |
"eval_samples_per_second": 28.858, | |
"eval_steps_per_second": 3.694, | |
"step": 123 | |
}, | |
{ | |
"epoch": 1.984, | |
"grad_norm": 52.5, | |
"learning_rate": 0.0, | |
"loss": 0.1758, | |
"step": 124 | |
}, | |
{ | |
"epoch": 1.984, | |
"eval_accuracy": 0.852, | |
"eval_loss": 0.36876150965690613, | |
"eval_runtime": 8.6609, | |
"eval_samples_per_second": 28.865, | |
"eval_steps_per_second": 3.695, | |
"step": 124 | |
}, | |
{ | |
"epoch": 1.984, | |
"step": 124, | |
"total_flos": 1.693315531538432e+16, | |
"train_loss": 0.4407010670871504, | |
"train_runtime": 1267.2396, | |
"train_samples_per_second": 1.578, | |
"train_steps_per_second": 0.098 | |
} | |
], | |
"logging_steps": 1, | |
"max_steps": 124, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 2, | |
"save_steps": 500, | |
"stateful_callbacks": { | |
"TrainerControl": { | |
"args": { | |
"should_epoch_stop": false, | |
"should_evaluate": false, | |
"should_log": false, | |
"should_save": false, | |
"should_training_stop": false | |
}, | |
"attributes": {} | |
} | |
}, | |
"total_flos": 1.693315531538432e+16, | |
"train_batch_size": 2, | |
"trial_name": null, | |
"trial_params": null | |
} | |