{ "best_metric": 0.23757965862751007, "best_model_checkpoint": "qa-model-finetune/checkpoint-117", "epoch": 8.26, "eval_steps": 500, "global_step": 117, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 4.000000000000001e-06, "loss": 1.1156, "step": 1 }, { "epoch": 0.04, "learning_rate": 8.000000000000001e-06, "loss": 1.225, "step": 2 }, { "epoch": 0.06, "learning_rate": 1.2e-05, "loss": 1.0757, "step": 3 }, { "epoch": 0.08, "learning_rate": 1.6000000000000003e-05, "loss": 0.8098, "step": 4 }, { "epoch": 0.1, "learning_rate": 2e-05, "loss": 1.2561, "step": 5 }, { "epoch": 0.12, "learning_rate": 2.4e-05, "loss": 1.2355, "step": 6 }, { "epoch": 0.14, "learning_rate": 2.8000000000000003e-05, "loss": 1.0707, "step": 7 }, { "epoch": 0.16, "learning_rate": 3.2000000000000005e-05, "loss": 1.3054, "step": 8 }, { "epoch": 0.18, "learning_rate": 3.6e-05, "loss": 1.2546, "step": 9 }, { "epoch": 0.2, "learning_rate": 4e-05, "loss": 1.2499, "step": 10 }, { "epoch": 0.22, "learning_rate": 4.4000000000000006e-05, "loss": 1.0749, "step": 11 }, { "epoch": 0.24, "learning_rate": 4.8e-05, "loss": 1.1383, "step": 12 }, { "epoch": 0.26, "learning_rate": 5.2000000000000004e-05, "loss": 1.15, "step": 13 }, { "epoch": 0.26, "eval_loss": 0.841742217540741, "eval_runtime": 24.118, "eval_samples_per_second": 0.788, "eval_steps_per_second": 0.207, "step": 13 }, { "epoch": 1.02, "learning_rate": 5.6000000000000006e-05, "loss": 0.9705, "step": 14 }, { "epoch": 1.04, "learning_rate": 6e-05, "loss": 1.2398, "step": 15 }, { "epoch": 1.06, "learning_rate": 6.400000000000001e-05, "loss": 0.9997, "step": 16 }, { "epoch": 1.08, "learning_rate": 6.800000000000001e-05, "loss": 1.2356, "step": 17 }, { "epoch": 1.1, "learning_rate": 7.2e-05, "loss": 0.8312, "step": 18 }, { "epoch": 1.12, "learning_rate": 7.6e-05, "loss": 1.0169, "step": 19 }, { "epoch": 1.14, "learning_rate": 8e-05, "loss": 0.8709, "step": 20 }, { "epoch": 1.16, "learning_rate": 8.4e-05, "loss": 1.121, "step": 21 }, { "epoch": 1.18, "learning_rate": 8.800000000000001e-05, "loss": 0.9481, "step": 22 }, { "epoch": 1.2, "learning_rate": 9.200000000000001e-05, "loss": 0.8888, "step": 23 }, { "epoch": 1.22, "learning_rate": 9.6e-05, "loss": 1.0233, "step": 24 }, { "epoch": 1.24, "learning_rate": 0.0001, "loss": 0.8382, "step": 25 }, { "epoch": 1.26, "learning_rate": 0.00010400000000000001, "loss": 0.8734, "step": 26 }, { "epoch": 1.26, "eval_loss": 0.7118059992790222, "eval_runtime": 24.0715, "eval_samples_per_second": 0.789, "eval_steps_per_second": 0.208, "step": 26 }, { "epoch": 2.02, "learning_rate": 0.00010800000000000001, "loss": 0.9912, "step": 27 }, { "epoch": 2.04, "learning_rate": 0.00011200000000000001, "loss": 1.0997, "step": 28 }, { "epoch": 2.06, "learning_rate": 0.000116, "loss": 0.7957, "step": 29 }, { "epoch": 2.08, "learning_rate": 0.00012, "loss": 0.9109, "step": 30 }, { "epoch": 2.1, "learning_rate": 0.000124, "loss": 0.9713, "step": 31 }, { "epoch": 2.12, "learning_rate": 0.00012800000000000002, "loss": 0.7388, "step": 32 }, { "epoch": 2.14, "learning_rate": 0.000132, "loss": 0.481, "step": 33 }, { "epoch": 2.16, "learning_rate": 0.00013600000000000003, "loss": 0.9344, "step": 34 }, { "epoch": 2.18, "learning_rate": 0.00014, "loss": 0.8717, "step": 35 }, { "epoch": 2.2, "learning_rate": 0.000144, "loss": 0.7509, "step": 36 }, { "epoch": 2.22, "learning_rate": 0.000148, "loss": 0.8116, "step": 37 }, { "epoch": 2.24, "learning_rate": 0.000152, "loss": 0.7251, "step": 38 }, { "epoch": 2.26, "learning_rate": 0.00015600000000000002, "loss": 0.7168, "step": 39 }, { "epoch": 2.26, "eval_loss": 0.5603676438331604, "eval_runtime": 23.997, "eval_samples_per_second": 0.792, "eval_steps_per_second": 0.208, "step": 39 }, { "epoch": 3.02, "learning_rate": 0.00016, "loss": 0.7851, "step": 40 }, { "epoch": 3.04, "learning_rate": 0.000164, "loss": 0.7412, "step": 41 }, { "epoch": 3.06, "learning_rate": 0.000168, "loss": 0.674, "step": 42 }, { "epoch": 3.08, "learning_rate": 0.000172, "loss": 0.7634, "step": 43 }, { "epoch": 3.1, "learning_rate": 0.00017600000000000002, "loss": 0.6933, "step": 44 }, { "epoch": 3.12, "learning_rate": 0.00018, "loss": 0.5634, "step": 45 }, { "epoch": 3.14, "learning_rate": 0.00018400000000000003, "loss": 0.6571, "step": 46 }, { "epoch": 3.16, "learning_rate": 0.000188, "loss": 0.5984, "step": 47 }, { "epoch": 3.18, "learning_rate": 0.000192, "loss": 0.674, "step": 48 }, { "epoch": 3.2, "learning_rate": 0.000196, "loss": 0.6171, "step": 49 }, { "epoch": 3.22, "learning_rate": 0.0002, "loss": 0.6129, "step": 50 }, { "epoch": 3.24, "learning_rate": 0.00019955555555555558, "loss": 0.5328, "step": 51 }, { "epoch": 3.26, "learning_rate": 0.00019911111111111111, "loss": 0.479, "step": 52 }, { "epoch": 3.26, "eval_loss": 0.41886183619499207, "eval_runtime": 23.9197, "eval_samples_per_second": 0.794, "eval_steps_per_second": 0.209, "step": 52 }, { "epoch": 4.02, "learning_rate": 0.00019866666666666668, "loss": 0.5089, "step": 53 }, { "epoch": 4.04, "learning_rate": 0.00019822222222222225, "loss": 0.5049, "step": 54 }, { "epoch": 4.06, "learning_rate": 0.00019777777777777778, "loss": 0.5407, "step": 55 }, { "epoch": 4.08, "learning_rate": 0.00019733333333333335, "loss": 0.3821, "step": 56 }, { "epoch": 4.1, "learning_rate": 0.0001968888888888889, "loss": 0.484, "step": 57 }, { "epoch": 4.12, "learning_rate": 0.00019644444444444445, "loss": 0.5705, "step": 58 }, { "epoch": 4.14, "learning_rate": 0.000196, "loss": 0.5301, "step": 59 }, { "epoch": 4.16, "learning_rate": 0.00019555555555555556, "loss": 0.6168, "step": 60 }, { "epoch": 4.18, "learning_rate": 0.0001951111111111111, "loss": 0.4745, "step": 61 }, { "epoch": 4.2, "learning_rate": 0.0001946666666666667, "loss": 0.6383, "step": 62 }, { "epoch": 4.22, "learning_rate": 0.00019422222222222223, "loss": 0.3682, "step": 63 }, { "epoch": 4.24, "learning_rate": 0.0001937777777777778, "loss": 0.5654, "step": 64 }, { "epoch": 4.26, "learning_rate": 0.00019333333333333333, "loss": 0.451, "step": 65 }, { "epoch": 4.26, "eval_loss": 0.3399796783924103, "eval_runtime": 24.1635, "eval_samples_per_second": 0.786, "eval_steps_per_second": 0.207, "step": 65 }, { "epoch": 5.02, "learning_rate": 0.0001928888888888889, "loss": 0.4663, "step": 66 }, { "epoch": 5.04, "learning_rate": 0.00019244444444444444, "loss": 0.4582, "step": 67 }, { "epoch": 5.06, "learning_rate": 0.000192, "loss": 0.3941, "step": 68 }, { "epoch": 5.08, "learning_rate": 0.00019155555555555554, "loss": 0.401, "step": 69 }, { "epoch": 5.1, "learning_rate": 0.00019111111111111114, "loss": 0.5281, "step": 70 }, { "epoch": 5.12, "learning_rate": 0.00019066666666666668, "loss": 0.4658, "step": 71 }, { "epoch": 5.14, "learning_rate": 0.00019022222222222224, "loss": 0.4215, "step": 72 }, { "epoch": 5.16, "learning_rate": 0.00018977777777777778, "loss": 0.33, "step": 73 }, { "epoch": 5.18, "learning_rate": 0.00018933333333333335, "loss": 0.5385, "step": 74 }, { "epoch": 5.2, "learning_rate": 0.00018888888888888888, "loss": 0.3539, "step": 75 }, { "epoch": 5.22, "learning_rate": 0.00018844444444444445, "loss": 0.4084, "step": 76 }, { "epoch": 5.24, "learning_rate": 0.000188, "loss": 0.2773, "step": 77 }, { "epoch": 5.26, "learning_rate": 0.00018755555555555558, "loss": 0.3543, "step": 78 }, { "epoch": 5.26, "eval_loss": 0.29384246468544006, "eval_runtime": 24.1775, "eval_samples_per_second": 0.786, "eval_steps_per_second": 0.207, "step": 78 }, { "epoch": 6.02, "learning_rate": 0.00018711111111111112, "loss": 0.4595, "step": 79 }, { "epoch": 6.04, "learning_rate": 0.0001866666666666667, "loss": 0.4364, "step": 80 }, { "epoch": 6.06, "learning_rate": 0.00018622222222222223, "loss": 0.2679, "step": 81 }, { "epoch": 6.08, "learning_rate": 0.0001857777777777778, "loss": 0.3882, "step": 82 }, { "epoch": 6.1, "learning_rate": 0.00018533333333333333, "loss": 0.3004, "step": 83 }, { "epoch": 6.12, "learning_rate": 0.0001848888888888889, "loss": 0.3132, "step": 84 }, { "epoch": 6.14, "learning_rate": 0.00018444444444444446, "loss": 0.4519, "step": 85 }, { "epoch": 6.16, "learning_rate": 0.00018400000000000003, "loss": 0.2922, "step": 86 }, { "epoch": 6.18, "learning_rate": 0.00018355555555555557, "loss": 0.3561, "step": 87 }, { "epoch": 6.2, "learning_rate": 0.00018311111111111113, "loss": 0.3193, "step": 88 }, { "epoch": 6.22, "learning_rate": 0.00018266666666666667, "loss": 0.3153, "step": 89 }, { "epoch": 6.24, "learning_rate": 0.00018222222222222224, "loss": 0.2202, "step": 90 }, { "epoch": 6.26, "learning_rate": 0.00018177777777777778, "loss": 0.2222, "step": 91 }, { "epoch": 6.26, "eval_loss": 0.2629795968532562, "eval_runtime": 24.1398, "eval_samples_per_second": 0.787, "eval_steps_per_second": 0.207, "step": 91 }, { "epoch": 7.02, "learning_rate": 0.00018133333333333334, "loss": 0.1878, "step": 92 }, { "epoch": 7.04, "learning_rate": 0.0001808888888888889, "loss": 0.2644, "step": 93 }, { "epoch": 7.06, "learning_rate": 0.00018044444444444447, "loss": 0.2501, "step": 94 }, { "epoch": 7.08, "learning_rate": 0.00018, "loss": 0.2801, "step": 95 }, { "epoch": 7.1, "learning_rate": 0.00017955555555555558, "loss": 0.3479, "step": 96 }, { "epoch": 7.12, "learning_rate": 0.00017911111111111112, "loss": 0.3556, "step": 97 }, { "epoch": 7.14, "learning_rate": 0.00017866666666666668, "loss": 0.2206, "step": 98 }, { "epoch": 7.16, "learning_rate": 0.00017822222222222222, "loss": 0.2939, "step": 99 }, { "epoch": 7.18, "learning_rate": 0.00017777777777777779, "loss": 0.2029, "step": 100 }, { "epoch": 7.2, "learning_rate": 0.00017733333333333335, "loss": 0.2979, "step": 101 }, { "epoch": 7.22, "learning_rate": 0.0001768888888888889, "loss": 0.2793, "step": 102 }, { "epoch": 7.24, "learning_rate": 0.00017644444444444446, "loss": 0.2809, "step": 103 }, { "epoch": 7.26, "learning_rate": 0.00017600000000000002, "loss": 0.2748, "step": 104 }, { "epoch": 7.26, "eval_loss": 0.25376519560813904, "eval_runtime": 24.0872, "eval_samples_per_second": 0.789, "eval_steps_per_second": 0.208, "step": 104 }, { "epoch": 8.02, "learning_rate": 0.00017555555555555556, "loss": 0.116, "step": 105 }, { "epoch": 8.04, "learning_rate": 0.00017511111111111113, "loss": 0.2272, "step": 106 }, { "epoch": 8.06, "learning_rate": 0.00017466666666666667, "loss": 0.1273, "step": 107 }, { "epoch": 8.08, "learning_rate": 0.00017422222222222223, "loss": 0.1605, "step": 108 }, { "epoch": 8.1, "learning_rate": 0.0001737777777777778, "loss": 0.2263, "step": 109 }, { "epoch": 8.12, "learning_rate": 0.00017333333333333334, "loss": 0.2452, "step": 110 }, { "epoch": 8.14, "learning_rate": 0.0001728888888888889, "loss": 0.2009, "step": 111 }, { "epoch": 8.16, "learning_rate": 0.00017244444444444444, "loss": 0.2372, "step": 112 }, { "epoch": 8.18, "learning_rate": 0.000172, "loss": 0.2736, "step": 113 }, { "epoch": 8.2, "learning_rate": 0.00017155555555555555, "loss": 0.3009, "step": 114 }, { "epoch": 8.22, "learning_rate": 0.0001711111111111111, "loss": 0.2619, "step": 115 }, { "epoch": 8.24, "learning_rate": 0.00017066666666666668, "loss": 0.2698, "step": 116 }, { "epoch": 8.26, "learning_rate": 0.00017022222222222224, "loss": 0.1716, "step": 117 }, { "epoch": 8.26, "eval_loss": 0.23757965862751007, "eval_runtime": 24.2556, "eval_samples_per_second": 0.783, "eval_steps_per_second": 0.206, "step": 117 } ], "logging_steps": 1, "max_steps": 500, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.8291357646848e+16, "trial_name": null, "trial_params": null }