{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0016680567139282, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2e-05, "loss": 1.7915, "step": 1 }, { "epoch": 0.04, "learning_rate": 4e-05, "loss": 1.7449, "step": 2 }, { "epoch": 0.06, "learning_rate": 6e-05, "loss": 1.7538, "step": 3 }, { "epoch": 0.08, "learning_rate": 8e-05, "loss": 1.713, "step": 4 }, { "epoch": 0.1, "learning_rate": 0.0001, "loss": 1.7313, "step": 5 }, { "epoch": 0.1, "eval_loss": 1.6295605897903442, "eval_runtime": 6.0457, "eval_samples_per_second": 0.331, "eval_steps_per_second": 0.331, "step": 5 }, { "epoch": 0.12, "learning_rate": 0.00012, "loss": 1.8184, "step": 6 }, { "epoch": 0.14, "learning_rate": 0.00014, "loss": 1.7586, "step": 7 }, { "epoch": 0.16, "learning_rate": 0.00016, "loss": 1.8926, "step": 8 }, { "epoch": 0.18, "learning_rate": 0.00018, "loss": 1.9176, "step": 9 }, { "epoch": 0.2, "learning_rate": 0.0002, "loss": 1.584, "step": 10 }, { "epoch": 0.2, "eval_loss": 1.606095790863037, "eval_runtime": 6.0412, "eval_samples_per_second": 0.331, "eval_steps_per_second": 0.331, "step": 10 }, { "epoch": 0.22, "learning_rate": 0.00019997370884991842, "loss": 1.2008, "step": 11 }, { "epoch": 0.24, "learning_rate": 0.00019989484922416502, "loss": 1.2531, "step": 12 }, { "epoch": 0.26, "learning_rate": 0.00019976346258894503, "loss": 1.6787, "step": 13 }, { "epoch": 0.28, "learning_rate": 0.00019957961803037326, "loss": 1.7146, "step": 14 }, { "epoch": 0.3, "learning_rate": 0.00019934341221814739, "loss": 1.6937, "step": 15 }, { "epoch": 0.3, "eval_loss": 1.5841087102890015, "eval_runtime": 6.0341, "eval_samples_per_second": 0.331, "eval_steps_per_second": 0.331, "step": 15 }, { "epoch": 0.32, "learning_rate": 0.00019905496935471658, "loss": 1.7531, "step": 16 }, { "epoch": 0.34, "learning_rate": 0.0001987144411099731, "loss": 1.7094, "step": 17 }, { "epoch": 0.36, "learning_rate": 0.00019832200654150076, "loss": 1.7333, "step": 18 }, { "epoch": 0.38, "learning_rate": 0.00019787787200042223, "loss": 1.7736, "step": 19 }, { "epoch": 0.4, "learning_rate": 0.0001973822710228951, "loss": 1.6655, "step": 20 }, { "epoch": 0.4, "eval_loss": 1.5755609273910522, "eval_runtime": 6.0389, "eval_samples_per_second": 0.331, "eval_steps_per_second": 0.331, "step": 20 }, { "epoch": 0.42, "learning_rate": 0.0001968354642073129, "loss": 1.8054, "step": 21 }, { "epoch": 0.44, "learning_rate": 0.00019623773907727682, "loss": 1.8327, "step": 22 }, { "epoch": 0.46, "learning_rate": 0.00019558940993040885, "loss": 1.3822, "step": 23 }, { "epoch": 0.48, "learning_rate": 0.00019489081767308698, "loss": 1.254, "step": 24 }, { "epoch": 0.5, "learning_rate": 0.00019414232964118892, "loss": 1.7648, "step": 25 }, { "epoch": 0.5, "eval_loss": 1.5711314678192139, "eval_runtime": 6.0254, "eval_samples_per_second": 0.332, "eval_steps_per_second": 0.332, "step": 25 }, { "epoch": 0.52, "learning_rate": 0.0001933443394069383, "loss": 1.641, "step": 26 }, { "epoch": 0.54, "learning_rate": 0.00019249726657195532, "loss": 1.7777, "step": 27 }, { "epoch": 0.56, "learning_rate": 0.00019160155654662076, "loss": 1.7038, "step": 28 }, { "epoch": 0.58, "learning_rate": 0.0001906576803158686, "loss": 1.6179, "step": 29 }, { "epoch": 0.6, "learning_rate": 0.0001896661341915318, "loss": 1.7813, "step": 30 }, { "epoch": 0.6, "eval_loss": 1.5650010108947754, "eval_runtime": 6.0228, "eval_samples_per_second": 0.332, "eval_steps_per_second": 0.332, "step": 30 }, { "epoch": 0.62, "learning_rate": 0.00018862743955136966, "loss": 1.661, "step": 31 }, { "epoch": 0.64, "learning_rate": 0.00018754214256491562, "loss": 1.7948, "step": 32 }, { "epoch": 0.66, "learning_rate": 0.00018641081390628877, "loss": 1.92, "step": 33 }, { "epoch": 0.68, "learning_rate": 0.00018523404845412027, "loss": 1.6941, "step": 34 }, { "epoch": 0.7, "learning_rate": 0.0001840124649787524, "loss": 1.3461, "step": 35 }, { "epoch": 0.7, "eval_loss": 1.5624847412109375, "eval_runtime": 6.0325, "eval_samples_per_second": 0.332, "eval_steps_per_second": 0.332, "step": 35 }, { "epoch": 0.72, "learning_rate": 0.0001827467058168748, "loss": 0.8176, "step": 36 }, { "epoch": 0.74, "learning_rate": 0.00018143743653376942, "loss": 1.7262, "step": 37 }, { "epoch": 0.76, "learning_rate": 0.00018008534557334064, "loss": 1.7333, "step": 38 }, { "epoch": 0.78, "learning_rate": 0.00017869114389611575, "loss": 1.5991, "step": 39 }, { "epoch": 0.8, "learning_rate": 0.0001772555646054055, "loss": 1.7267, "step": 40 }, { "epoch": 0.8, "eval_loss": 1.5579214096069336, "eval_runtime": 6.0388, "eval_samples_per_second": 0.331, "eval_steps_per_second": 0.331, "step": 40 }, { "epoch": 0.82, "learning_rate": 0.00017577936256182167, "loss": 1.6694, "step": 41 }, { "epoch": 0.84, "learning_rate": 0.0001742633139863538, "loss": 1.8201, "step": 42 }, { "epoch": 0.86, "learning_rate": 0.0001727082160522145, "loss": 1.7913, "step": 43 }, { "epoch": 0.88, "learning_rate": 0.00017111488646566727, "loss": 1.825, "step": 44 }, { "epoch": 0.9, "learning_rate": 0.00016948416303605795, "loss": 1.7778, "step": 45 }, { "epoch": 0.9, "eval_loss": 1.555617094039917, "eval_runtime": 6.0421, "eval_samples_per_second": 0.331, "eval_steps_per_second": 0.331, "step": 45 }, { "epoch": 0.92, "learning_rate": 0.00016781690323527511, "loss": 1.6311, "step": 46 }, { "epoch": 0.94, "learning_rate": 0.0001661139837468717, "loss": 1.1499, "step": 47 }, { "epoch": 0.96, "learning_rate": 0.00016437630000508464, "loss": 1.0455, "step": 48 }, { "epoch": 0.98, "learning_rate": 0.00016260476572399496, "loss": 1.7178, "step": 49 }, { "epoch": 1.0, "learning_rate": 0.00016080031241707578, "loss": 1.4832, "step": 50 }, { "epoch": 1.0, "eval_loss": 1.554579257965088, "eval_runtime": 6.0239, "eval_samples_per_second": 0.332, "eval_steps_per_second": 0.332, "step": 50 }, { "epoch": 1.02, "learning_rate": 0.00015896388890738127, "loss": 1.6801, "step": 51 }, { "epoch": 1.04, "learning_rate": 0.0001570964608286336, "loss": 1.6462, "step": 52 }, { "epoch": 1.06, "learning_rate": 0.00015519901011747044, "loss": 1.7264, "step": 53 }, { "epoch": 1.08, "learning_rate": 0.0001532725344971202, "loss": 1.63, "step": 54 }, { "epoch": 1.1, "learning_rate": 0.00015131804695277612, "loss": 1.7584, "step": 55 }, { "epoch": 1.1, "eval_loss": 1.5519572496414185, "eval_runtime": 6.0181, "eval_samples_per_second": 0.332, "eval_steps_per_second": 0.332, "step": 55 }, { "epoch": 1.12, "learning_rate": 0.0001493365751989454, "loss": 1.7956, "step": 56 }, { "epoch": 1.14, "learning_rate": 0.00014732916113905335, "loss": 1.6528, "step": 57 }, { "epoch": 1.16, "learning_rate": 0.00014529686031758643, "loss": 1.7006, "step": 58 }, { "epoch": 1.18, "learning_rate": 0.00014324074136506284, "loss": 1.8171, "step": 59 }, { "epoch": 1.2, "learning_rate": 0.0001411618854361218, "loss": 1.4825, "step": 60 }, { "epoch": 1.2, "eval_loss": 1.5512959957122803, "eval_runtime": 6.0143, "eval_samples_per_second": 0.333, "eval_steps_per_second": 0.333, "step": 60 }, { "epoch": 1.22, "learning_rate": 0.00013906138564102793, "loss": 0.9645, "step": 61 }, { "epoch": 1.24, "learning_rate": 0.0001369403464708884, "loss": 1.2708, "step": 62 }, { "epoch": 1.26, "learning_rate": 0.0001347998832168862, "loss": 1.7432, "step": 63 }, { "epoch": 1.28, "learning_rate": 0.00013264112138383445, "loss": 1.6986, "step": 64 }, { "epoch": 1.3, "learning_rate": 0.00013046519609836, "loss": 1.7969, "step": 65 }, { "epoch": 1.3, "eval_loss": 1.5507731437683105, "eval_runtime": 6.0363, "eval_samples_per_second": 0.331, "eval_steps_per_second": 0.331, "step": 65 }, { "epoch": 1.32, "learning_rate": 0.00012827325151202782, "loss": 1.5769, "step": 66 }, { "epoch": 1.34, "learning_rate": 0.00012606644019971968, "loss": 1.7011, "step": 67 }, { "epoch": 1.36, "learning_rate": 0.00012384592255358385, "loss": 1.6525, "step": 68 }, { "epoch": 1.38, "learning_rate": 0.00012161286617287419, "loss": 1.6834, "step": 69 }, { "epoch": 1.4, "learning_rate": 0.00011936844524999966, "loss": 1.7182, "step": 70 }, { "epoch": 1.4, "eval_loss": 1.549034595489502, "eval_runtime": 6.0315, "eval_samples_per_second": 0.332, "eval_steps_per_second": 0.332, "step": 70 }, { "epoch": 1.42, "learning_rate": 0.00011711383995310681, "loss": 1.7941, "step": 71 }, { "epoch": 1.44, "learning_rate": 0.00011485023580552039, "loss": 1.6289, "step": 72 }, { "epoch": 1.46, "learning_rate": 0.00011257882306236775, "loss": 1.1176, "step": 73 }, { "epoch": 1.48, "learning_rate": 0.00011030079608471544, "loss": 0.8403, "step": 74 }, { "epoch": 1.5, "learning_rate": 0.00010801735271154669, "loss": 1.724, "step": 75 }, { "epoch": 1.5, "eval_loss": 1.5494024753570557, "eval_runtime": 6.0321, "eval_samples_per_second": 0.332, "eval_steps_per_second": 0.332, "step": 75 }, { "epoch": 1.52, "learning_rate": 0.00010572969362990998, "loss": 1.6746, "step": 76 }, { "epoch": 1.54, "learning_rate": 0.00010343902174357039, "loss": 1.7152, "step": 77 }, { "epoch": 1.56, "learning_rate": 0.0001011465415404949, "loss": 1.7054, "step": 78 }, { "epoch": 1.58, "learning_rate": 9.88534584595051e-05, "loss": 1.7333, "step": 79 }, { "epoch": 1.6, "learning_rate": 9.656097825642961e-05, "loss": 1.7083, "step": 80 }, { "epoch": 1.6, "eval_loss": 1.5476174354553223, "eval_runtime": 6.0289, "eval_samples_per_second": 0.332, "eval_steps_per_second": 0.332, "step": 80 }, { "epoch": 1.62, "learning_rate": 9.427030637009003e-05, "loss": 1.8771, "step": 81 }, { "epoch": 1.64, "learning_rate": 9.198264728845332e-05, "loss": 1.8446, "step": 82 }, { "epoch": 1.66, "learning_rate": 8.969920391528458e-05, "loss": 1.86, "step": 83 }, { "epoch": 1.68, "learning_rate": 8.742117693763227e-05, "loss": 1.7291, "step": 84 }, { "epoch": 1.7, "learning_rate": 8.514976419447964e-05, "loss": 1.2195, "step": 85 }, { "epoch": 1.7, "eval_loss": 1.5478819608688354, "eval_runtime": 6.0353, "eval_samples_per_second": 0.331, "eval_steps_per_second": 0.331, "step": 85 }, { "epoch": 1.72, "learning_rate": 8.28861600468932e-05, "loss": 0.9047, "step": 86 }, { "epoch": 1.74, "learning_rate": 8.063155475000037e-05, "loss": 1.7297, "step": 87 }, { "epoch": 1.76, "learning_rate": 7.838713382712583e-05, "loss": 1.5731, "step": 88 }, { "epoch": 1.78, "learning_rate": 7.615407744641619e-05, "loss": 1.598, "step": 89 }, { "epoch": 1.8, "learning_rate": 7.393355980028039e-05, "loss": 1.6782, "step": 90 }, { "epoch": 1.8, "eval_loss": 1.5477495193481445, "eval_runtime": 6.0384, "eval_samples_per_second": 0.331, "eval_steps_per_second": 0.331, "step": 90 }, { "epoch": 1.82, "learning_rate": 7.172674848797219e-05, "loss": 1.672, "step": 91 }, { "epoch": 1.84, "learning_rate": 6.953480390164e-05, "loss": 1.6457, "step": 92 }, { "epoch": 1.86, "learning_rate": 6.735887861616556e-05, "loss": 1.7155, "step": 93 }, { "epoch": 1.88, "learning_rate": 6.520011678311382e-05, "loss": 1.7334, "step": 94 }, { "epoch": 1.9, "learning_rate": 6.305965352911161e-05, "loss": 1.7342, "step": 95 }, { "epoch": 1.9, "eval_loss": 1.5465799570083618, "eval_runtime": 6.0244, "eval_samples_per_second": 0.332, "eval_steps_per_second": 0.332, "step": 95 }, { "epoch": 1.92, "learning_rate": 6.093861435897208e-05, "loss": 1.8115, "step": 96 }, { "epoch": 1.94, "learning_rate": 5.883811456387821e-05, "loss": 1.4625, "step": 97 }, { "epoch": 1.96, "learning_rate": 5.675925863493721e-05, "loss": 1.08, "step": 98 }, { "epoch": 1.98, "learning_rate": 5.4703139682413586e-05, "loss": 1.6914, "step": 99 }, { "epoch": 2.0, "learning_rate": 5.267083886094668e-05, "loss": 1.6016, "step": 100 }, { "epoch": 2.0, "eval_loss": 1.547050952911377, "eval_runtime": 6.0254, "eval_samples_per_second": 0.332, "eval_steps_per_second": 0.332, "step": 100 } ], "max_steps": 147, "num_train_epochs": 3, "total_flos": 5.5839227595669504e+17, "trial_name": null, "trial_params": null }