diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5272 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 207.7562326869806, + "global_step": 300000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.35, + "learning_rate": 1.999307479224377e-05, + "loss": 5.6258, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 1.9986149584487535e-05, + "loss": 5.4807, + "step": 1000 + }, + { + "epoch": 1.0, + "eval_loss": 5.224731922149658, + "eval_runtime": 75.7111, + "eval_samples_per_second": 6503.99, + "eval_steps_per_second": 2.127, + "step": 1444 + }, + { + "epoch": 1.04, + "learning_rate": 1.9979224376731302e-05, + "loss": 5.3786, + "step": 1500 + }, + { + "epoch": 1.39, + "learning_rate": 1.9972299168975073e-05, + "loss": 5.2907, + "step": 2000 + }, + { + "epoch": 1.73, + "learning_rate": 1.996537396121884e-05, + "loss": 5.2208, + "step": 2500 + }, + { + "epoch": 2.0, + "eval_loss": 5.035708427429199, + "eval_runtime": 74.9448, + "eval_samples_per_second": 6570.491, + "eval_steps_per_second": 2.148, + "step": 2888 + }, + { + "epoch": 2.08, + "learning_rate": 1.9958448753462603e-05, + "loss": 5.163, + "step": 3000 + }, + { + "epoch": 2.42, + "learning_rate": 1.9951523545706373e-05, + "loss": 5.1119, + "step": 3500 + }, + { + "epoch": 2.77, + "learning_rate": 1.994459833795014e-05, + "loss": 5.0685, + "step": 4000 + }, + { + "epoch": 3.0, + "eval_loss": 4.928272724151611, + "eval_runtime": 74.5205, + "eval_samples_per_second": 6607.9, + "eval_steps_per_second": 2.16, + "step": 4332 + }, + { + "epoch": 3.12, + "learning_rate": 1.9937673130193907e-05, + "loss": 5.034, + "step": 4500 + }, + { + "epoch": 3.46, + "learning_rate": 1.9930747922437674e-05, + "loss": 5.0038, + "step": 5000 + }, + { + "epoch": 3.81, + "learning_rate": 1.992382271468144e-05, + "loss": 4.9748, + "step": 5500 + }, + { + "epoch": 4.0, + "eval_loss": 4.848881721496582, + "eval_runtime": 73.1546, + "eval_samples_per_second": 6731.277, + "eval_steps_per_second": 2.201, + "step": 5776 + }, + { + "epoch": 4.16, + "learning_rate": 1.991689750692521e-05, + "loss": 4.9513, + "step": 6000 + }, + { + "epoch": 4.5, + "learning_rate": 1.9909972299168975e-05, + "loss": 4.9265, + "step": 6500 + }, + { + "epoch": 4.85, + "learning_rate": 1.9903047091412745e-05, + "loss": 4.9046, + "step": 7000 + }, + { + "epoch": 5.0, + "eval_loss": 4.779133319854736, + "eval_runtime": 74.0757, + "eval_samples_per_second": 6647.578, + "eval_steps_per_second": 2.173, + "step": 7220 + }, + { + "epoch": 5.19, + "learning_rate": 1.9896121883656512e-05, + "loss": 4.8871, + "step": 7500 + }, + { + "epoch": 5.54, + "learning_rate": 1.988919667590028e-05, + "loss": 4.8665, + "step": 8000 + }, + { + "epoch": 5.89, + "learning_rate": 1.9882271468144046e-05, + "loss": 4.8483, + "step": 8500 + }, + { + "epoch": 6.0, + "eval_loss": 4.739624977111816, + "eval_runtime": 74.1752, + "eval_samples_per_second": 6638.657, + "eval_steps_per_second": 2.171, + "step": 8664 + }, + { + "epoch": 6.23, + "learning_rate": 1.9875346260387813e-05, + "loss": 4.8328, + "step": 9000 + }, + { + "epoch": 6.58, + "learning_rate": 1.986842105263158e-05, + "loss": 4.8205, + "step": 9500 + }, + { + "epoch": 6.93, + "learning_rate": 1.9861495844875347e-05, + "loss": 4.8032, + "step": 10000 + }, + { + "epoch": 7.0, + "eval_loss": 4.699394702911377, + "eval_runtime": 74.6581, + "eval_samples_per_second": 6595.725, + "eval_steps_per_second": 2.156, + "step": 10108 + }, + { + "epoch": 7.27, + "learning_rate": 1.9854570637119114e-05, + "loss": 4.7918, + "step": 10500 + }, + { + "epoch": 7.62, + "learning_rate": 1.9847645429362884e-05, + "loss": 4.7758, + "step": 11000 + }, + { + "epoch": 7.96, + "learning_rate": 1.984072022160665e-05, + "loss": 4.7675, + "step": 11500 + }, + { + "epoch": 8.0, + "eval_loss": 4.659648895263672, + "eval_runtime": 77.484, + "eval_samples_per_second": 6355.173, + "eval_steps_per_second": 2.078, + "step": 11552 + }, + { + "epoch": 8.31, + "learning_rate": 1.9833795013850418e-05, + "loss": 4.7551, + "step": 12000 + }, + { + "epoch": 8.66, + "learning_rate": 1.9826869806094185e-05, + "loss": 4.7441, + "step": 12500 + }, + { + "epoch": 9.0, + "eval_loss": 4.621845722198486, + "eval_runtime": 74.377, + "eval_samples_per_second": 6620.653, + "eval_steps_per_second": 2.165, + "step": 12996 + }, + { + "epoch": 9.0, + "learning_rate": 1.9819944598337952e-05, + "loss": 4.7288, + "step": 13000 + }, + { + "epoch": 9.35, + "learning_rate": 1.981301939058172e-05, + "loss": 4.7199, + "step": 13500 + }, + { + "epoch": 9.7, + "learning_rate": 1.9806094182825486e-05, + "loss": 4.7093, + "step": 14000 + }, + { + "epoch": 10.0, + "eval_loss": 4.591496467590332, + "eval_runtime": 74.6789, + "eval_samples_per_second": 6593.887, + "eval_steps_per_second": 2.156, + "step": 14440 + }, + { + "epoch": 10.04, + "learning_rate": 1.9799168975069252e-05, + "loss": 4.7023, + "step": 14500 + }, + { + "epoch": 10.39, + "learning_rate": 1.9792243767313023e-05, + "loss": 4.6882, + "step": 15000 + }, + { + "epoch": 10.73, + "learning_rate": 1.9785318559556786e-05, + "loss": 4.6787, + "step": 15500 + }, + { + "epoch": 11.0, + "eval_loss": 4.566598415374756, + "eval_runtime": 74.19, + "eval_samples_per_second": 6637.336, + "eval_steps_per_second": 2.17, + "step": 15884 + }, + { + "epoch": 11.08, + "learning_rate": 1.9778393351800557e-05, + "loss": 4.668, + "step": 16000 + }, + { + "epoch": 11.43, + "learning_rate": 1.9771468144044324e-05, + "loss": 4.6627, + "step": 16500 + }, + { + "epoch": 11.77, + "learning_rate": 1.976454293628809e-05, + "loss": 4.6523, + "step": 17000 + }, + { + "epoch": 12.0, + "eval_loss": 4.539627552032471, + "eval_runtime": 74.3111, + "eval_samples_per_second": 6626.517, + "eval_steps_per_second": 2.167, + "step": 17328 + }, + { + "epoch": 12.12, + "learning_rate": 1.9757617728531857e-05, + "loss": 4.6439, + "step": 17500 + }, + { + "epoch": 12.47, + "learning_rate": 1.9750692520775624e-05, + "loss": 4.6371, + "step": 18000 + }, + { + "epoch": 12.81, + "learning_rate": 1.9743767313019395e-05, + "loss": 4.6281, + "step": 18500 + }, + { + "epoch": 13.0, + "eval_loss": 4.52562141418457, + "eval_runtime": 75.4167, + "eval_samples_per_second": 6529.372, + "eval_steps_per_second": 2.135, + "step": 18772 + }, + { + "epoch": 13.16, + "learning_rate": 1.9736842105263158e-05, + "loss": 4.6176, + "step": 19000 + }, + { + "epoch": 13.5, + "learning_rate": 1.9729916897506925e-05, + "loss": 4.6111, + "step": 19500 + }, + { + "epoch": 13.85, + "learning_rate": 1.9722991689750695e-05, + "loss": 4.6068, + "step": 20000 + }, + { + "epoch": 14.0, + "eval_loss": 4.500814437866211, + "eval_runtime": 74.0488, + "eval_samples_per_second": 6649.993, + "eval_steps_per_second": 2.174, + "step": 20216 + }, + { + "epoch": 14.2, + "learning_rate": 1.9716066481994462e-05, + "loss": 4.5989, + "step": 20500 + }, + { + "epoch": 14.54, + "learning_rate": 1.970914127423823e-05, + "loss": 4.5929, + "step": 21000 + }, + { + "epoch": 14.89, + "learning_rate": 1.9702216066481996e-05, + "loss": 4.5839, + "step": 21500 + }, + { + "epoch": 15.0, + "eval_loss": 4.482580661773682, + "eval_runtime": 74.9815, + "eval_samples_per_second": 6567.273, + "eval_steps_per_second": 2.147, + "step": 21660 + }, + { + "epoch": 15.24, + "learning_rate": 1.9695290858725763e-05, + "loss": 4.5769, + "step": 22000 + }, + { + "epoch": 15.58, + "learning_rate": 1.968836565096953e-05, + "loss": 4.5733, + "step": 22500 + }, + { + "epoch": 15.93, + "learning_rate": 1.9681440443213297e-05, + "loss": 4.5638, + "step": 23000 + }, + { + "epoch": 16.0, + "eval_loss": 4.467035293579102, + "eval_runtime": 74.2416, + "eval_samples_per_second": 6632.726, + "eval_steps_per_second": 2.169, + "step": 23104 + }, + { + "epoch": 16.27, + "learning_rate": 1.9674515235457067e-05, + "loss": 4.5584, + "step": 23500 + }, + { + "epoch": 16.62, + "learning_rate": 1.9667590027700834e-05, + "loss": 4.5497, + "step": 24000 + }, + { + "epoch": 16.97, + "learning_rate": 1.9660664819944598e-05, + "loss": 4.5469, + "step": 24500 + }, + { + "epoch": 17.0, + "eval_loss": 4.447102069854736, + "eval_runtime": 74.6073, + "eval_samples_per_second": 6600.208, + "eval_steps_per_second": 2.158, + "step": 24548 + }, + { + "epoch": 17.31, + "learning_rate": 1.9653739612188368e-05, + "loss": 4.5406, + "step": 25000 + }, + { + "epoch": 17.66, + "learning_rate": 1.9646814404432135e-05, + "loss": 4.5333, + "step": 25500 + }, + { + "epoch": 18.0, + "eval_loss": 4.4333930015563965, + "eval_runtime": 74.4269, + "eval_samples_per_second": 6616.211, + "eval_steps_per_second": 2.163, + "step": 25992 + }, + { + "epoch": 18.01, + "learning_rate": 1.9639889196675902e-05, + "loss": 4.5288, + "step": 26000 + }, + { + "epoch": 18.35, + "learning_rate": 1.963296398891967e-05, + "loss": 4.5174, + "step": 26500 + }, + { + "epoch": 18.7, + "learning_rate": 1.9626038781163436e-05, + "loss": 4.5178, + "step": 27000 + }, + { + "epoch": 19.0, + "eval_loss": 4.415244102478027, + "eval_runtime": 74.8709, + "eval_samples_per_second": 6576.976, + "eval_steps_per_second": 2.15, + "step": 27436 + }, + { + "epoch": 19.04, + "learning_rate": 1.9619113573407206e-05, + "loss": 4.5116, + "step": 27500 + }, + { + "epoch": 19.39, + "learning_rate": 1.961218836565097e-05, + "loss": 4.5031, + "step": 28000 + }, + { + "epoch": 19.74, + "learning_rate": 1.960526315789474e-05, + "loss": 4.501, + "step": 28500 + }, + { + "epoch": 20.0, + "eval_loss": 4.401387691497803, + "eval_runtime": 74.9583, + "eval_samples_per_second": 6569.303, + "eval_steps_per_second": 2.148, + "step": 28880 + }, + { + "epoch": 20.08, + "learning_rate": 1.9598337950138507e-05, + "loss": 4.4967, + "step": 29000 + }, + { + "epoch": 20.43, + "learning_rate": 1.9591412742382274e-05, + "loss": 4.4896, + "step": 29500 + }, + { + "epoch": 20.78, + "learning_rate": 1.958448753462604e-05, + "loss": 4.4853, + "step": 30000 + }, + { + "epoch": 21.0, + "eval_loss": 4.387256622314453, + "eval_runtime": 75.2035, + "eval_samples_per_second": 6547.89, + "eval_steps_per_second": 2.141, + "step": 30324 + }, + { + "epoch": 21.12, + "learning_rate": 1.9577562326869807e-05, + "loss": 4.4815, + "step": 30500 + }, + { + "epoch": 21.47, + "learning_rate": 1.9570637119113574e-05, + "loss": 4.4741, + "step": 31000 + }, + { + "epoch": 21.81, + "learning_rate": 1.956371191135734e-05, + "loss": 4.4705, + "step": 31500 + }, + { + "epoch": 22.0, + "eval_loss": 4.372128963470459, + "eval_runtime": 75.7644, + "eval_samples_per_second": 6499.411, + "eval_steps_per_second": 2.125, + "step": 31768 + }, + { + "epoch": 22.16, + "learning_rate": 1.9556786703601108e-05, + "loss": 4.465, + "step": 32000 + }, + { + "epoch": 22.51, + "learning_rate": 1.954986149584488e-05, + "loss": 4.4612, + "step": 32500 + }, + { + "epoch": 22.85, + "learning_rate": 1.9542936288088645e-05, + "loss": 4.4548, + "step": 33000 + }, + { + "epoch": 23.0, + "eval_loss": 4.359036445617676, + "eval_runtime": 75.0614, + "eval_samples_per_second": 6560.284, + "eval_steps_per_second": 2.145, + "step": 33212 + }, + { + "epoch": 23.2, + "learning_rate": 1.9536011080332412e-05, + "loss": 4.452, + "step": 33500 + }, + { + "epoch": 23.55, + "learning_rate": 1.952908587257618e-05, + "loss": 4.4467, + "step": 34000 + }, + { + "epoch": 23.89, + "learning_rate": 1.9522160664819946e-05, + "loss": 4.4435, + "step": 34500 + }, + { + "epoch": 24.0, + "eval_loss": 4.351931571960449, + "eval_runtime": 73.9885, + "eval_samples_per_second": 6655.409, + "eval_steps_per_second": 2.176, + "step": 34656 + }, + { + "epoch": 24.24, + "learning_rate": 1.9515235457063713e-05, + "loss": 4.4384, + "step": 35000 + }, + { + "epoch": 24.58, + "learning_rate": 1.950831024930748e-05, + "loss": 4.4346, + "step": 35500 + }, + { + "epoch": 24.93, + "learning_rate": 1.9501385041551247e-05, + "loss": 4.4276, + "step": 36000 + }, + { + "epoch": 25.0, + "eval_loss": 4.33967924118042, + "eval_runtime": 73.6152, + "eval_samples_per_second": 6689.162, + "eval_steps_per_second": 2.187, + "step": 36100 + }, + { + "epoch": 25.28, + "learning_rate": 1.9494459833795017e-05, + "loss": 4.4249, + "step": 36500 + }, + { + "epoch": 25.62, + "learning_rate": 1.948753462603878e-05, + "loss": 4.4245, + "step": 37000 + }, + { + "epoch": 25.97, + "learning_rate": 1.948060941828255e-05, + "loss": 4.4166, + "step": 37500 + }, + { + "epoch": 26.0, + "eval_loss": 4.3328375816345215, + "eval_runtime": 74.8225, + "eval_samples_per_second": 6581.225, + "eval_steps_per_second": 2.152, + "step": 37544 + }, + { + "epoch": 26.32, + "learning_rate": 1.9473684210526318e-05, + "loss": 4.4133, + "step": 38000 + }, + { + "epoch": 26.66, + "learning_rate": 1.9466759002770085e-05, + "loss": 4.41, + "step": 38500 + }, + { + "epoch": 27.0, + "eval_loss": 4.31511926651001, + "eval_runtime": 75.2713, + "eval_samples_per_second": 6541.986, + "eval_steps_per_second": 2.139, + "step": 38988 + }, + { + "epoch": 27.01, + "learning_rate": 1.9459833795013852e-05, + "loss": 4.4064, + "step": 39000 + }, + { + "epoch": 27.35, + "learning_rate": 1.945290858725762e-05, + "loss": 4.403, + "step": 39500 + }, + { + "epoch": 27.7, + "learning_rate": 1.944598337950139e-05, + "loss": 4.3996, + "step": 40000 + }, + { + "epoch": 28.0, + "eval_loss": 4.304774761199951, + "eval_runtime": 73.8005, + "eval_samples_per_second": 6672.362, + "eval_steps_per_second": 2.182, + "step": 40432 + }, + { + "epoch": 28.05, + "learning_rate": 1.9439058171745153e-05, + "loss": 4.3957, + "step": 40500 + }, + { + "epoch": 28.39, + "learning_rate": 1.943213296398892e-05, + "loss": 4.3921, + "step": 41000 + }, + { + "epoch": 28.74, + "learning_rate": 1.942520775623269e-05, + "loss": 4.3886, + "step": 41500 + }, + { + "epoch": 29.0, + "eval_loss": 4.294938564300537, + "eval_runtime": 76.8852, + "eval_samples_per_second": 6404.662, + "eval_steps_per_second": 2.094, + "step": 41876 + }, + { + "epoch": 29.09, + "learning_rate": 1.9418282548476453e-05, + "loss": 4.3851, + "step": 42000 + }, + { + "epoch": 29.43, + "learning_rate": 1.9411357340720224e-05, + "loss": 4.3808, + "step": 42500 + }, + { + "epoch": 29.78, + "learning_rate": 1.940443213296399e-05, + "loss": 4.3776, + "step": 43000 + }, + { + "epoch": 30.0, + "eval_loss": 4.287764549255371, + "eval_runtime": 74.0431, + "eval_samples_per_second": 6650.502, + "eval_steps_per_second": 2.174, + "step": 43320 + }, + { + "epoch": 30.12, + "learning_rate": 1.9397506925207758e-05, + "loss": 4.3721, + "step": 43500 + }, + { + "epoch": 30.47, + "learning_rate": 1.9390581717451524e-05, + "loss": 4.3705, + "step": 44000 + }, + { + "epoch": 30.82, + "learning_rate": 1.938365650969529e-05, + "loss": 4.3686, + "step": 44500 + }, + { + "epoch": 31.0, + "eval_loss": 4.277154922485352, + "eval_runtime": 74.74, + "eval_samples_per_second": 6588.493, + "eval_steps_per_second": 2.154, + "step": 44764 + }, + { + "epoch": 31.16, + "learning_rate": 1.937673130193906e-05, + "loss": 4.3631, + "step": 45000 + }, + { + "epoch": 31.51, + "learning_rate": 1.936980609418283e-05, + "loss": 4.363, + "step": 45500 + }, + { + "epoch": 31.86, + "learning_rate": 1.9362880886426595e-05, + "loss": 4.3572, + "step": 46000 + }, + { + "epoch": 32.0, + "eval_loss": 4.265904903411865, + "eval_runtime": 74.1213, + "eval_samples_per_second": 6643.49, + "eval_steps_per_second": 2.172, + "step": 46208 + }, + { + "epoch": 32.2, + "learning_rate": 1.9355955678670362e-05, + "loss": 4.3549, + "step": 46500 + }, + { + "epoch": 32.55, + "learning_rate": 1.934903047091413e-05, + "loss": 4.352, + "step": 47000 + }, + { + "epoch": 32.89, + "learning_rate": 1.9342105263157896e-05, + "loss": 4.3518, + "step": 47500 + }, + { + "epoch": 33.0, + "eval_loss": 4.2521071434021, + "eval_runtime": 73.7943, + "eval_samples_per_second": 6672.929, + "eval_steps_per_second": 2.182, + "step": 47652 + }, + { + "epoch": 33.24, + "learning_rate": 1.9335180055401663e-05, + "loss": 4.345, + "step": 48000 + }, + { + "epoch": 33.59, + "learning_rate": 1.932825484764543e-05, + "loss": 4.344, + "step": 48500 + }, + { + "epoch": 33.93, + "learning_rate": 1.93213296398892e-05, + "loss": 4.3377, + "step": 49000 + }, + { + "epoch": 34.0, + "eval_loss": 4.250110626220703, + "eval_runtime": 76.5664, + "eval_samples_per_second": 6431.332, + "eval_steps_per_second": 2.103, + "step": 49096 + }, + { + "epoch": 34.28, + "learning_rate": 1.9314404432132964e-05, + "loss": 4.333, + "step": 49500 + }, + { + "epoch": 34.63, + "learning_rate": 1.9307479224376734e-05, + "loss": 4.3307, + "step": 50000 + }, + { + "epoch": 34.97, + "learning_rate": 1.93005540166205e-05, + "loss": 4.3292, + "step": 50500 + }, + { + "epoch": 35.0, + "eval_loss": 4.243873596191406, + "eval_runtime": 75.8252, + "eval_samples_per_second": 6494.199, + "eval_steps_per_second": 2.123, + "step": 50540 + }, + { + "epoch": 35.32, + "learning_rate": 1.9293628808864268e-05, + "loss": 4.3287, + "step": 51000 + }, + { + "epoch": 35.66, + "learning_rate": 1.9286703601108035e-05, + "loss": 4.3236, + "step": 51500 + }, + { + "epoch": 36.0, + "eval_loss": 4.227120399475098, + "eval_runtime": 75.2969, + "eval_samples_per_second": 6539.764, + "eval_steps_per_second": 2.138, + "step": 51984 + }, + { + "epoch": 36.01, + "learning_rate": 1.9279778393351802e-05, + "loss": 4.3221, + "step": 52000 + }, + { + "epoch": 36.36, + "learning_rate": 1.927285318559557e-05, + "loss": 4.3191, + "step": 52500 + }, + { + "epoch": 36.7, + "learning_rate": 1.9265927977839336e-05, + "loss": 4.3147, + "step": 53000 + }, + { + "epoch": 37.0, + "eval_loss": 4.228574752807617, + "eval_runtime": 73.7741, + "eval_samples_per_second": 6674.754, + "eval_steps_per_second": 2.182, + "step": 53428 + }, + { + "epoch": 37.05, + "learning_rate": 1.9259002770083103e-05, + "loss": 4.3139, + "step": 53500 + }, + { + "epoch": 37.4, + "learning_rate": 1.9252077562326873e-05, + "loss": 4.3107, + "step": 54000 + }, + { + "epoch": 37.74, + "learning_rate": 1.924515235457064e-05, + "loss": 4.3054, + "step": 54500 + }, + { + "epoch": 38.0, + "eval_loss": 4.218676567077637, + "eval_runtime": 73.9641, + "eval_samples_per_second": 6657.61, + "eval_steps_per_second": 2.177, + "step": 54872 + }, + { + "epoch": 38.09, + "learning_rate": 1.9238227146814407e-05, + "loss": 4.303, + "step": 55000 + }, + { + "epoch": 38.43, + "learning_rate": 1.9231301939058174e-05, + "loss": 4.3031, + "step": 55500 + }, + { + "epoch": 38.78, + "learning_rate": 1.922437673130194e-05, + "loss": 4.3004, + "step": 56000 + }, + { + "epoch": 39.0, + "eval_loss": 4.211672306060791, + "eval_runtime": 73.1433, + "eval_samples_per_second": 6732.321, + "eval_steps_per_second": 2.201, + "step": 56316 + }, + { + "epoch": 39.13, + "learning_rate": 1.9217451523545708e-05, + "loss": 4.2952, + "step": 56500 + }, + { + "epoch": 39.47, + "learning_rate": 1.9210526315789474e-05, + "loss": 4.291, + "step": 57000 + }, + { + "epoch": 39.82, + "learning_rate": 1.920360110803324e-05, + "loss": 4.2922, + "step": 57500 + }, + { + "epoch": 40.0, + "eval_loss": 4.2032246589660645, + "eval_runtime": 74.4105, + "eval_samples_per_second": 6617.665, + "eval_steps_per_second": 2.164, + "step": 57760 + }, + { + "epoch": 40.17, + "learning_rate": 1.9196675900277012e-05, + "loss": 4.2891, + "step": 58000 + }, + { + "epoch": 40.51, + "learning_rate": 1.9189750692520775e-05, + "loss": 4.2848, + "step": 58500 + }, + { + "epoch": 40.86, + "learning_rate": 1.9182825484764546e-05, + "loss": 4.2841, + "step": 59000 + }, + { + "epoch": 41.0, + "eval_loss": 4.192102432250977, + "eval_runtime": 75.6355, + "eval_samples_per_second": 6510.485, + "eval_steps_per_second": 2.129, + "step": 59204 + }, + { + "epoch": 41.2, + "learning_rate": 1.9175900277008312e-05, + "loss": 4.2807, + "step": 59500 + }, + { + "epoch": 41.55, + "learning_rate": 1.916897506925208e-05, + "loss": 4.2781, + "step": 60000 + }, + { + "epoch": 41.9, + "learning_rate": 1.9162049861495846e-05, + "loss": 4.2765, + "step": 60500 + }, + { + "epoch": 42.0, + "eval_loss": 4.186478614807129, + "eval_runtime": 74.8014, + "eval_samples_per_second": 6583.085, + "eval_steps_per_second": 2.152, + "step": 60648 + }, + { + "epoch": 42.24, + "learning_rate": 1.9155124653739613e-05, + "loss": 4.2787, + "step": 61000 + }, + { + "epoch": 42.59, + "learning_rate": 1.9148199445983384e-05, + "loss": 4.2747, + "step": 61500 + }, + { + "epoch": 42.94, + "learning_rate": 1.9141274238227147e-05, + "loss": 4.2699, + "step": 62000 + }, + { + "epoch": 43.0, + "eval_loss": 4.183748245239258, + "eval_runtime": 74.0325, + "eval_samples_per_second": 6651.456, + "eval_steps_per_second": 2.175, + "step": 62092 + }, + { + "epoch": 43.28, + "learning_rate": 1.9134349030470914e-05, + "loss": 4.2688, + "step": 62500 + }, + { + "epoch": 43.63, + "learning_rate": 1.9127423822714684e-05, + "loss": 4.2662, + "step": 63000 + }, + { + "epoch": 43.98, + "learning_rate": 1.9120498614958448e-05, + "loss": 4.2634, + "step": 63500 + }, + { + "epoch": 44.0, + "eval_loss": 4.1764092445373535, + "eval_runtime": 74.4165, + "eval_samples_per_second": 6617.138, + "eval_steps_per_second": 2.163, + "step": 63536 + }, + { + "epoch": 44.32, + "learning_rate": 1.9113573407202218e-05, + "loss": 4.2594, + "step": 64000 + }, + { + "epoch": 44.67, + "learning_rate": 1.9106648199445985e-05, + "loss": 4.2598, + "step": 64500 + }, + { + "epoch": 45.0, + "eval_loss": 4.171951770782471, + "eval_runtime": 73.3834, + "eval_samples_per_second": 6710.29, + "eval_steps_per_second": 2.194, + "step": 64980 + }, + { + "epoch": 45.01, + "learning_rate": 1.9099722991689752e-05, + "loss": 4.256, + "step": 65000 + }, + { + "epoch": 45.36, + "learning_rate": 1.909279778393352e-05, + "loss": 4.2541, + "step": 65500 + }, + { + "epoch": 45.71, + "learning_rate": 1.9085872576177286e-05, + "loss": 4.2499, + "step": 66000 + }, + { + "epoch": 46.0, + "eval_loss": 4.162701606750488, + "eval_runtime": 73.3897, + "eval_samples_per_second": 6709.716, + "eval_steps_per_second": 2.194, + "step": 66424 + }, + { + "epoch": 46.05, + "learning_rate": 1.9078947368421056e-05, + "loss": 4.2489, + "step": 66500 + }, + { + "epoch": 46.4, + "learning_rate": 1.9072022160664823e-05, + "loss": 4.2488, + "step": 67000 + }, + { + "epoch": 46.75, + "learning_rate": 1.906509695290859e-05, + "loss": 4.2437, + "step": 67500 + }, + { + "epoch": 47.0, + "eval_loss": 4.157140254974365, + "eval_runtime": 73.7335, + "eval_samples_per_second": 6678.427, + "eval_steps_per_second": 2.184, + "step": 67868 + }, + { + "epoch": 47.09, + "learning_rate": 1.9058171745152357e-05, + "loss": 4.2438, + "step": 68000 + }, + { + "epoch": 47.44, + "learning_rate": 1.9051246537396124e-05, + "loss": 4.2436, + "step": 68500 + }, + { + "epoch": 47.78, + "learning_rate": 1.904432132963989e-05, + "loss": 4.24, + "step": 69000 + }, + { + "epoch": 48.0, + "eval_loss": 4.155510902404785, + "eval_runtime": 73.7665, + "eval_samples_per_second": 6675.438, + "eval_steps_per_second": 2.183, + "step": 69312 + }, + { + "epoch": 48.13, + "learning_rate": 1.9037396121883658e-05, + "loss": 4.2359, + "step": 69500 + }, + { + "epoch": 48.48, + "learning_rate": 1.9030470914127425e-05, + "loss": 4.2356, + "step": 70000 + }, + { + "epoch": 48.82, + "learning_rate": 1.9023545706371195e-05, + "loss": 4.2334, + "step": 70500 + }, + { + "epoch": 49.0, + "eval_loss": 4.148888111114502, + "eval_runtime": 76.009, + "eval_samples_per_second": 6478.494, + "eval_steps_per_second": 2.118, + "step": 70756 + }, + { + "epoch": 49.17, + "learning_rate": 1.901662049861496e-05, + "loss": 4.2319, + "step": 71000 + }, + { + "epoch": 49.52, + "learning_rate": 1.900969529085873e-05, + "loss": 4.2288, + "step": 71500 + }, + { + "epoch": 49.86, + "learning_rate": 1.9002770083102496e-05, + "loss": 4.2286, + "step": 72000 + }, + { + "epoch": 50.0, + "eval_loss": 4.136005401611328, + "eval_runtime": 74.0583, + "eval_samples_per_second": 6649.141, + "eval_steps_per_second": 2.174, + "step": 72200 + }, + { + "epoch": 50.21, + "learning_rate": 1.8995844875346263e-05, + "loss": 4.2234, + "step": 72500 + }, + { + "epoch": 50.55, + "learning_rate": 1.898891966759003e-05, + "loss": 4.2226, + "step": 73000 + }, + { + "epoch": 50.9, + "learning_rate": 1.8981994459833796e-05, + "loss": 4.2213, + "step": 73500 + }, + { + "epoch": 51.0, + "eval_loss": 4.137174606323242, + "eval_runtime": 74.5156, + "eval_samples_per_second": 6608.338, + "eval_steps_per_second": 2.161, + "step": 73644 + }, + { + "epoch": 51.25, + "learning_rate": 1.8975069252077563e-05, + "loss": 4.2186, + "step": 74000 + }, + { + "epoch": 51.59, + "learning_rate": 1.896814404432133e-05, + "loss": 4.2182, + "step": 74500 + }, + { + "epoch": 51.94, + "learning_rate": 1.8961218836565097e-05, + "loss": 4.2152, + "step": 75000 + }, + { + "epoch": 52.0, + "eval_loss": 4.127338886260986, + "eval_runtime": 76.201, + "eval_samples_per_second": 6462.173, + "eval_steps_per_second": 2.113, + "step": 75088 + }, + { + "epoch": 52.29, + "learning_rate": 1.8954293628808867e-05, + "loss": 4.2122, + "step": 75500 + }, + { + "epoch": 52.63, + "learning_rate": 1.894736842105263e-05, + "loss": 4.2135, + "step": 76000 + }, + { + "epoch": 52.98, + "learning_rate": 1.89404432132964e-05, + "loss": 4.211, + "step": 76500 + }, + { + "epoch": 53.0, + "eval_loss": 4.125185489654541, + "eval_runtime": 74.874, + "eval_samples_per_second": 6576.7, + "eval_steps_per_second": 2.15, + "step": 76532 + }, + { + "epoch": 53.32, + "learning_rate": 1.8933518005540168e-05, + "loss": 4.2079, + "step": 77000 + }, + { + "epoch": 53.67, + "learning_rate": 1.8926592797783935e-05, + "loss": 4.2036, + "step": 77500 + }, + { + "epoch": 54.0, + "eval_loss": 4.116921901702881, + "eval_runtime": 74.7043, + "eval_samples_per_second": 6591.639, + "eval_steps_per_second": 2.155, + "step": 77976 + }, + { + "epoch": 54.02, + "learning_rate": 1.8919667590027702e-05, + "loss": 4.2046, + "step": 78000 + }, + { + "epoch": 54.36, + "learning_rate": 1.891274238227147e-05, + "loss": 4.201, + "step": 78500 + }, + { + "epoch": 54.71, + "learning_rate": 1.8905817174515236e-05, + "loss": 4.1969, + "step": 79000 + }, + { + "epoch": 55.0, + "eval_loss": 4.117580890655518, + "eval_runtime": 74.4844, + "eval_samples_per_second": 6611.1, + "eval_steps_per_second": 2.162, + "step": 79420 + }, + { + "epoch": 55.06, + "learning_rate": 1.8898891966759006e-05, + "loss": 4.1987, + "step": 79500 + }, + { + "epoch": 55.4, + "learning_rate": 1.889196675900277e-05, + "loss": 4.1951, + "step": 80000 + }, + { + "epoch": 55.75, + "learning_rate": 1.888504155124654e-05, + "loss": 4.1944, + "step": 80500 + }, + { + "epoch": 56.0, + "eval_loss": 4.110030651092529, + "eval_runtime": 73.4183, + "eval_samples_per_second": 6707.104, + "eval_steps_per_second": 2.193, + "step": 80864 + }, + { + "epoch": 56.09, + "learning_rate": 1.8878116343490307e-05, + "loss": 4.1924, + "step": 81000 + }, + { + "epoch": 56.44, + "learning_rate": 1.8871191135734074e-05, + "loss": 4.1913, + "step": 81500 + }, + { + "epoch": 56.79, + "learning_rate": 1.886426592797784e-05, + "loss": 4.1923, + "step": 82000 + }, + { + "epoch": 57.0, + "eval_loss": 4.108407974243164, + "eval_runtime": 74.3146, + "eval_samples_per_second": 6626.209, + "eval_steps_per_second": 2.166, + "step": 82308 + }, + { + "epoch": 57.13, + "learning_rate": 1.8857340720221608e-05, + "loss": 4.1889, + "step": 82500 + }, + { + "epoch": 57.48, + "learning_rate": 1.8850415512465378e-05, + "loss": 4.1856, + "step": 83000 + }, + { + "epoch": 57.83, + "learning_rate": 1.884349030470914e-05, + "loss": 4.1869, + "step": 83500 + }, + { + "epoch": 58.0, + "eval_loss": 4.103403568267822, + "eval_runtime": 75.7605, + "eval_samples_per_second": 6499.745, + "eval_steps_per_second": 2.125, + "step": 83752 + }, + { + "epoch": 58.17, + "learning_rate": 1.883656509695291e-05, + "loss": 4.181, + "step": 84000 + }, + { + "epoch": 58.52, + "learning_rate": 1.882963988919668e-05, + "loss": 4.1801, + "step": 84500 + }, + { + "epoch": 58.86, + "learning_rate": 1.8822714681440442e-05, + "loss": 4.1802, + "step": 85000 + }, + { + "epoch": 59.0, + "eval_loss": 4.101166248321533, + "eval_runtime": 73.6678, + "eval_samples_per_second": 6684.382, + "eval_steps_per_second": 2.185, + "step": 85196 + }, + { + "epoch": 59.21, + "learning_rate": 1.8815789473684213e-05, + "loss": 4.1776, + "step": 85500 + }, + { + "epoch": 59.56, + "learning_rate": 1.880886426592798e-05, + "loss": 4.1765, + "step": 86000 + }, + { + "epoch": 59.9, + "learning_rate": 1.8801939058171746e-05, + "loss": 4.1764, + "step": 86500 + }, + { + "epoch": 60.0, + "eval_loss": 4.089655876159668, + "eval_runtime": 74.6724, + "eval_samples_per_second": 6594.462, + "eval_steps_per_second": 2.156, + "step": 86640 + }, + { + "epoch": 60.25, + "learning_rate": 1.8795013850415513e-05, + "loss": 4.1752, + "step": 87000 + }, + { + "epoch": 60.6, + "learning_rate": 1.878808864265928e-05, + "loss": 4.1747, + "step": 87500 + }, + { + "epoch": 60.94, + "learning_rate": 1.878116343490305e-05, + "loss": 4.1668, + "step": 88000 + }, + { + "epoch": 61.0, + "eval_loss": 4.0857367515563965, + "eval_runtime": 73.6262, + "eval_samples_per_second": 6688.167, + "eval_steps_per_second": 2.187, + "step": 88084 + }, + { + "epoch": 61.29, + "learning_rate": 1.8774238227146814e-05, + "loss": 4.1695, + "step": 88500 + }, + { + "epoch": 61.63, + "learning_rate": 1.8767313019390584e-05, + "loss": 4.1645, + "step": 89000 + }, + { + "epoch": 61.98, + "learning_rate": 1.876038781163435e-05, + "loss": 4.1633, + "step": 89500 + }, + { + "epoch": 62.0, + "eval_loss": 4.080641269683838, + "eval_runtime": 74.2817, + "eval_samples_per_second": 6629.141, + "eval_steps_per_second": 2.167, + "step": 89528 + }, + { + "epoch": 62.33, + "learning_rate": 1.8753462603878118e-05, + "loss": 4.1629, + "step": 90000 + }, + { + "epoch": 62.67, + "learning_rate": 1.8746537396121885e-05, + "loss": 4.1631, + "step": 90500 + }, + { + "epoch": 63.0, + "eval_loss": 4.07755708694458, + "eval_runtime": 73.9609, + "eval_samples_per_second": 6657.9, + "eval_steps_per_second": 2.177, + "step": 90972 + }, + { + "epoch": 63.02, + "learning_rate": 1.8739612188365652e-05, + "loss": 4.1607, + "step": 91000 + }, + { + "epoch": 63.37, + "learning_rate": 1.873268698060942e-05, + "loss": 4.1596, + "step": 91500 + }, + { + "epoch": 63.71, + "learning_rate": 1.872576177285319e-05, + "loss": 4.1559, + "step": 92000 + }, + { + "epoch": 64.0, + "eval_loss": 4.075015068054199, + "eval_runtime": 73.801, + "eval_samples_per_second": 6672.317, + "eval_steps_per_second": 2.182, + "step": 92416 + }, + { + "epoch": 64.06, + "learning_rate": 1.8718836565096953e-05, + "loss": 4.155, + "step": 92500 + }, + { + "epoch": 64.4, + "learning_rate": 1.8711911357340723e-05, + "loss": 4.1567, + "step": 93000 + }, + { + "epoch": 64.75, + "learning_rate": 1.870498614958449e-05, + "loss": 4.1529, + "step": 93500 + }, + { + "epoch": 65.0, + "eval_loss": 4.0660247802734375, + "eval_runtime": 73.35, + "eval_samples_per_second": 6713.346, + "eval_steps_per_second": 2.195, + "step": 93860 + }, + { + "epoch": 65.1, + "learning_rate": 1.8698060941828257e-05, + "loss": 4.1521, + "step": 94000 + }, + { + "epoch": 65.44, + "learning_rate": 1.8691135734072024e-05, + "loss": 4.1485, + "step": 94500 + }, + { + "epoch": 65.79, + "learning_rate": 1.868421052631579e-05, + "loss": 4.1485, + "step": 95000 + }, + { + "epoch": 66.0, + "eval_loss": 4.064459800720215, + "eval_runtime": 73.345, + "eval_samples_per_second": 6713.802, + "eval_steps_per_second": 2.195, + "step": 95304 + }, + { + "epoch": 66.14, + "learning_rate": 1.8677285318559558e-05, + "loss": 4.1469, + "step": 95500 + }, + { + "epoch": 66.48, + "learning_rate": 1.8670360110803325e-05, + "loss": 4.1459, + "step": 96000 + }, + { + "epoch": 66.83, + "learning_rate": 1.866343490304709e-05, + "loss": 4.1431, + "step": 96500 + }, + { + "epoch": 67.0, + "eval_loss": 4.058371067047119, + "eval_runtime": 72.3552, + "eval_samples_per_second": 6805.646, + "eval_steps_per_second": 2.225, + "step": 96748 + }, + { + "epoch": 67.17, + "learning_rate": 1.8656509695290862e-05, + "loss": 4.143, + "step": 97000 + }, + { + "epoch": 67.52, + "learning_rate": 1.8649584487534625e-05, + "loss": 4.1405, + "step": 97500 + }, + { + "epoch": 67.87, + "learning_rate": 1.8642659279778396e-05, + "loss": 4.1404, + "step": 98000 + }, + { + "epoch": 68.0, + "eval_loss": 4.054144382476807, + "eval_runtime": 73.8095, + "eval_samples_per_second": 6671.555, + "eval_steps_per_second": 2.181, + "step": 98192 + }, + { + "epoch": 68.21, + "learning_rate": 1.8635734072022163e-05, + "loss": 4.1381, + "step": 98500 + }, + { + "epoch": 68.56, + "learning_rate": 1.862880886426593e-05, + "loss": 4.1343, + "step": 99000 + }, + { + "epoch": 68.91, + "learning_rate": 1.8621883656509697e-05, + "loss": 4.1338, + "step": 99500 + }, + { + "epoch": 69.0, + "eval_loss": 4.050791263580322, + "eval_runtime": 75.7075, + "eval_samples_per_second": 6504.296, + "eval_steps_per_second": 2.127, + "step": 99636 + }, + { + "epoch": 69.25, + "learning_rate": 1.8614958448753463e-05, + "loss": 4.1313, + "step": 100000 + }, + { + "epoch": 69.6, + "learning_rate": 1.860803324099723e-05, + "loss": 4.132, + "step": 100500 + }, + { + "epoch": 69.94, + "learning_rate": 1.8601108033240997e-05, + "loss": 4.1352, + "step": 101000 + }, + { + "epoch": 70.0, + "eval_loss": 4.043735980987549, + "eval_runtime": 74.0516, + "eval_samples_per_second": 6649.741, + "eval_steps_per_second": 2.174, + "step": 101080 + }, + { + "epoch": 70.29, + "learning_rate": 1.8594182825484764e-05, + "loss": 4.1307, + "step": 101500 + }, + { + "epoch": 70.64, + "learning_rate": 1.8587257617728535e-05, + "loss": 4.1268, + "step": 102000 + }, + { + "epoch": 70.98, + "learning_rate": 1.85803324099723e-05, + "loss": 4.1307, + "step": 102500 + }, + { + "epoch": 71.0, + "eval_loss": 4.04426908493042, + "eval_runtime": 72.7883, + "eval_samples_per_second": 6765.157, + "eval_steps_per_second": 2.212, + "step": 102524 + }, + { + "epoch": 71.33, + "learning_rate": 1.857340720221607e-05, + "loss": 4.1277, + "step": 103000 + }, + { + "epoch": 71.68, + "learning_rate": 1.8566481994459835e-05, + "loss": 4.1241, + "step": 103500 + }, + { + "epoch": 72.0, + "eval_loss": 4.041337966918945, + "eval_runtime": 75.6211, + "eval_samples_per_second": 6511.731, + "eval_steps_per_second": 2.129, + "step": 103968 + }, + { + "epoch": 72.02, + "learning_rate": 1.8559556786703602e-05, + "loss": 4.1256, + "step": 104000 + }, + { + "epoch": 72.37, + "learning_rate": 1.8552631578947373e-05, + "loss": 4.1215, + "step": 104500 + }, + { + "epoch": 72.71, + "learning_rate": 1.8545706371191136e-05, + "loss": 4.1227, + "step": 105000 + }, + { + "epoch": 73.0, + "eval_loss": 4.042394161224365, + "eval_runtime": 75.0493, + "eval_samples_per_second": 6561.341, + "eval_steps_per_second": 2.145, + "step": 105412 + }, + { + "epoch": 73.06, + "learning_rate": 1.8538781163434903e-05, + "loss": 4.118, + "step": 105500 + }, + { + "epoch": 73.41, + "learning_rate": 1.8531855955678673e-05, + "loss": 4.1166, + "step": 106000 + }, + { + "epoch": 73.75, + "learning_rate": 1.8524930747922437e-05, + "loss": 4.1186, + "step": 106500 + }, + { + "epoch": 74.0, + "eval_loss": 4.030467510223389, + "eval_runtime": 73.3813, + "eval_samples_per_second": 6710.483, + "eval_steps_per_second": 2.194, + "step": 106856 + }, + { + "epoch": 74.1, + "learning_rate": 1.8518005540166207e-05, + "loss": 4.1165, + "step": 107000 + }, + { + "epoch": 74.45, + "learning_rate": 1.8511080332409974e-05, + "loss": 4.113, + "step": 107500 + }, + { + "epoch": 74.79, + "learning_rate": 1.850415512465374e-05, + "loss": 4.1118, + "step": 108000 + }, + { + "epoch": 75.0, + "eval_loss": 4.033474922180176, + "eval_runtime": 72.6909, + "eval_samples_per_second": 6774.219, + "eval_steps_per_second": 2.215, + "step": 108300 + }, + { + "epoch": 75.14, + "learning_rate": 1.8497229916897508e-05, + "loss": 4.1129, + "step": 108500 + }, + { + "epoch": 75.48, + "learning_rate": 1.8490304709141275e-05, + "loss": 4.1103, + "step": 109000 + }, + { + "epoch": 75.83, + "learning_rate": 1.8483379501385045e-05, + "loss": 4.1123, + "step": 109500 + }, + { + "epoch": 76.0, + "eval_loss": 4.0271382331848145, + "eval_runtime": 73.3585, + "eval_samples_per_second": 6712.566, + "eval_steps_per_second": 2.195, + "step": 109744 + }, + { + "epoch": 76.18, + "learning_rate": 1.847645429362881e-05, + "loss": 4.1101, + "step": 110000 + }, + { + "epoch": 76.52, + "learning_rate": 1.846952908587258e-05, + "loss": 4.1042, + "step": 110500 + }, + { + "epoch": 76.87, + "learning_rate": 1.8462603878116346e-05, + "loss": 4.1071, + "step": 111000 + }, + { + "epoch": 77.0, + "eval_loss": 4.022892951965332, + "eval_runtime": 74.3109, + "eval_samples_per_second": 6626.539, + "eval_steps_per_second": 2.167, + "step": 111188 + }, + { + "epoch": 77.22, + "learning_rate": 1.8455678670360113e-05, + "loss": 4.1012, + "step": 111500 + }, + { + "epoch": 77.56, + "learning_rate": 1.844875346260388e-05, + "loss": 4.101, + "step": 112000 + }, + { + "epoch": 77.91, + "learning_rate": 1.8441828254847647e-05, + "loss": 4.1028, + "step": 112500 + }, + { + "epoch": 78.0, + "eval_loss": 4.022953987121582, + "eval_runtime": 75.1521, + "eval_samples_per_second": 6552.363, + "eval_steps_per_second": 2.142, + "step": 112632 + }, + { + "epoch": 78.25, + "learning_rate": 1.8434903047091414e-05, + "loss": 4.1002, + "step": 113000 + }, + { + "epoch": 78.6, + "learning_rate": 1.842797783933518e-05, + "loss": 4.1001, + "step": 113500 + }, + { + "epoch": 78.95, + "learning_rate": 1.8421052631578947e-05, + "loss": 4.1002, + "step": 114000 + }, + { + "epoch": 79.0, + "eval_loss": 4.016762733459473, + "eval_runtime": 74.5194, + "eval_samples_per_second": 6607.994, + "eval_steps_per_second": 2.161, + "step": 114076 + }, + { + "epoch": 79.29, + "learning_rate": 1.8414127423822718e-05, + "loss": 4.0983, + "step": 114500 + }, + { + "epoch": 79.64, + "learning_rate": 1.8407202216066485e-05, + "loss": 4.0972, + "step": 115000 + }, + { + "epoch": 79.99, + "learning_rate": 1.840027700831025e-05, + "loss": 4.0958, + "step": 115500 + }, + { + "epoch": 80.0, + "eval_loss": 4.014660835266113, + "eval_runtime": 74.3599, + "eval_samples_per_second": 6622.175, + "eval_steps_per_second": 2.165, + "step": 115520 + }, + { + "epoch": 80.33, + "learning_rate": 1.839335180055402e-05, + "loss": 4.0944, + "step": 116000 + }, + { + "epoch": 80.68, + "learning_rate": 1.8386426592797785e-05, + "loss": 4.0919, + "step": 116500 + }, + { + "epoch": 81.0, + "eval_loss": 4.004740238189697, + "eval_runtime": 73.8401, + "eval_samples_per_second": 6668.784, + "eval_steps_per_second": 2.18, + "step": 116964 + }, + { + "epoch": 81.02, + "learning_rate": 1.8379501385041552e-05, + "loss": 4.091, + "step": 117000 + }, + { + "epoch": 81.37, + "learning_rate": 1.837257617728532e-05, + "loss": 4.0913, + "step": 117500 + }, + { + "epoch": 81.72, + "learning_rate": 1.8365650969529086e-05, + "loss": 4.0891, + "step": 118000 + }, + { + "epoch": 82.0, + "eval_loss": 4.007106781005859, + "eval_runtime": 73.8477, + "eval_samples_per_second": 6668.102, + "eval_steps_per_second": 2.18, + "step": 118408 + }, + { + "epoch": 82.06, + "learning_rate": 1.8358725761772856e-05, + "loss": 4.0869, + "step": 118500 + }, + { + "epoch": 82.41, + "learning_rate": 1.835180055401662e-05, + "loss": 4.0896, + "step": 119000 + }, + { + "epoch": 82.76, + "learning_rate": 1.834487534626039e-05, + "loss": 4.0902, + "step": 119500 + }, + { + "epoch": 83.0, + "eval_loss": 4.002437591552734, + "eval_runtime": 73.8311, + "eval_samples_per_second": 6669.604, + "eval_steps_per_second": 2.181, + "step": 119852 + }, + { + "epoch": 83.1, + "learning_rate": 1.8337950138504157e-05, + "loss": 4.0872, + "step": 120000 + }, + { + "epoch": 83.45, + "learning_rate": 1.8331024930747924e-05, + "loss": 4.084, + "step": 120500 + }, + { + "epoch": 83.8, + "learning_rate": 1.832409972299169e-05, + "loss": 4.0846, + "step": 121000 + }, + { + "epoch": 84.0, + "eval_loss": 3.9989962577819824, + "eval_runtime": 74.1606, + "eval_samples_per_second": 6639.967, + "eval_steps_per_second": 2.171, + "step": 121296 + }, + { + "epoch": 84.14, + "learning_rate": 1.8317174515235458e-05, + "loss": 4.0813, + "step": 121500 + }, + { + "epoch": 84.49, + "learning_rate": 1.8310249307479225e-05, + "loss": 4.0794, + "step": 122000 + }, + { + "epoch": 84.83, + "learning_rate": 1.8303324099722992e-05, + "loss": 4.0785, + "step": 122500 + }, + { + "epoch": 85.0, + "eval_loss": 3.9976210594177246, + "eval_runtime": 74.1372, + "eval_samples_per_second": 6642.06, + "eval_steps_per_second": 2.172, + "step": 122740 + }, + { + "epoch": 85.18, + "learning_rate": 1.829639889196676e-05, + "loss": 4.0776, + "step": 123000 + }, + { + "epoch": 85.53, + "learning_rate": 1.828947368421053e-05, + "loss": 4.079, + "step": 123500 + }, + { + "epoch": 85.87, + "learning_rate": 1.8282548476454296e-05, + "loss": 4.0788, + "step": 124000 + }, + { + "epoch": 86.0, + "eval_loss": 3.9947969913482666, + "eval_runtime": 74.7164, + "eval_samples_per_second": 6590.579, + "eval_steps_per_second": 2.155, + "step": 124184 + }, + { + "epoch": 86.22, + "learning_rate": 1.8275623268698063e-05, + "loss": 4.0777, + "step": 124500 + }, + { + "epoch": 86.57, + "learning_rate": 1.826869806094183e-05, + "loss": 4.0768, + "step": 125000 + }, + { + "epoch": 86.91, + "learning_rate": 1.8261772853185597e-05, + "loss": 4.0759, + "step": 125500 + }, + { + "epoch": 87.0, + "eval_loss": 3.9896225929260254, + "eval_runtime": 73.8648, + "eval_samples_per_second": 6666.556, + "eval_steps_per_second": 2.18, + "step": 125628 + }, + { + "epoch": 87.26, + "learning_rate": 1.8254847645429364e-05, + "loss": 4.0709, + "step": 126000 + }, + { + "epoch": 87.6, + "learning_rate": 1.824792243767313e-05, + "loss": 4.0711, + "step": 126500 + }, + { + "epoch": 87.95, + "learning_rate": 1.82409972299169e-05, + "loss": 4.0728, + "step": 127000 + }, + { + "epoch": 88.0, + "eval_loss": 3.988676071166992, + "eval_runtime": 73.3696, + "eval_samples_per_second": 6711.549, + "eval_steps_per_second": 2.194, + "step": 127072 + }, + { + "epoch": 88.3, + "learning_rate": 1.8234072022160668e-05, + "loss": 4.0693, + "step": 127500 + }, + { + "epoch": 88.64, + "learning_rate": 1.822714681440443e-05, + "loss": 4.0658, + "step": 128000 + }, + { + "epoch": 88.99, + "learning_rate": 1.82202216066482e-05, + "loss": 4.0668, + "step": 128500 + }, + { + "epoch": 89.0, + "eval_loss": 3.9832513332366943, + "eval_runtime": 73.9518, + "eval_samples_per_second": 6658.718, + "eval_steps_per_second": 2.177, + "step": 128516 + }, + { + "epoch": 89.34, + "learning_rate": 1.821329639889197e-05, + "loss": 4.0669, + "step": 129000 + }, + { + "epoch": 89.68, + "learning_rate": 1.8206371191135735e-05, + "loss": 4.0669, + "step": 129500 + }, + { + "epoch": 90.0, + "eval_loss": 3.9873828887939453, + "eval_runtime": 75.3146, + "eval_samples_per_second": 6538.224, + "eval_steps_per_second": 2.138, + "step": 129960 + }, + { + "epoch": 90.03, + "learning_rate": 1.8199445983379502e-05, + "loss": 4.0619, + "step": 130000 + }, + { + "epoch": 90.37, + "learning_rate": 1.819252077562327e-05, + "loss": 4.0611, + "step": 130500 + }, + { + "epoch": 90.72, + "learning_rate": 1.818559556786704e-05, + "loss": 4.0625, + "step": 131000 + }, + { + "epoch": 91.0, + "eval_loss": 3.9783737659454346, + "eval_runtime": 75.884, + "eval_samples_per_second": 6489.166, + "eval_steps_per_second": 2.122, + "step": 131404 + }, + { + "epoch": 91.07, + "learning_rate": 1.8178670360110803e-05, + "loss": 4.0609, + "step": 131500 + }, + { + "epoch": 91.41, + "learning_rate": 1.8171745152354573e-05, + "loss": 4.0605, + "step": 132000 + }, + { + "epoch": 91.76, + "learning_rate": 1.816481994459834e-05, + "loss": 4.0571, + "step": 132500 + }, + { + "epoch": 92.0, + "eval_loss": 3.976148843765259, + "eval_runtime": 74.7074, + "eval_samples_per_second": 6591.369, + "eval_steps_per_second": 2.155, + "step": 132848 + }, + { + "epoch": 92.11, + "learning_rate": 1.8157894736842107e-05, + "loss": 4.0565, + "step": 133000 + }, + { + "epoch": 92.45, + "learning_rate": 1.8150969529085874e-05, + "loss": 4.057, + "step": 133500 + }, + { + "epoch": 92.8, + "learning_rate": 1.814404432132964e-05, + "loss": 4.0572, + "step": 134000 + }, + { + "epoch": 93.0, + "eval_loss": 3.9746015071868896, + "eval_runtime": 73.6659, + "eval_samples_per_second": 6684.561, + "eval_steps_per_second": 2.186, + "step": 134292 + }, + { + "epoch": 93.14, + "learning_rate": 1.8137119113573408e-05, + "loss": 4.0551, + "step": 134500 + }, + { + "epoch": 93.49, + "learning_rate": 1.8130193905817175e-05, + "loss": 4.0562, + "step": 135000 + }, + { + "epoch": 93.84, + "learning_rate": 1.8123268698060942e-05, + "loss": 4.0533, + "step": 135500 + }, + { + "epoch": 94.0, + "eval_loss": 3.971306562423706, + "eval_runtime": 74.5978, + "eval_samples_per_second": 6601.056, + "eval_steps_per_second": 2.158, + "step": 135736 + }, + { + "epoch": 94.18, + "learning_rate": 1.8116343490304712e-05, + "loss": 4.0537, + "step": 136000 + }, + { + "epoch": 94.53, + "learning_rate": 1.810941828254848e-05, + "loss": 4.0532, + "step": 136500 + }, + { + "epoch": 94.88, + "learning_rate": 1.8102493074792246e-05, + "loss": 4.0508, + "step": 137000 + }, + { + "epoch": 95.0, + "eval_loss": 3.971055507659912, + "eval_runtime": 75.83, + "eval_samples_per_second": 6493.789, + "eval_steps_per_second": 2.123, + "step": 137180 + }, + { + "epoch": 95.22, + "learning_rate": 1.8095567867036013e-05, + "loss": 4.0504, + "step": 137500 + }, + { + "epoch": 95.57, + "learning_rate": 1.808864265927978e-05, + "loss": 4.0466, + "step": 138000 + }, + { + "epoch": 95.91, + "learning_rate": 1.8081717451523547e-05, + "loss": 4.0487, + "step": 138500 + }, + { + "epoch": 96.0, + "eval_loss": 3.964989185333252, + "eval_runtime": 75.3545, + "eval_samples_per_second": 6534.761, + "eval_steps_per_second": 2.137, + "step": 138624 + }, + { + "epoch": 96.26, + "learning_rate": 1.8074792243767314e-05, + "loss": 4.0443, + "step": 139000 + }, + { + "epoch": 96.61, + "learning_rate": 1.806786703601108e-05, + "loss": 4.044, + "step": 139500 + }, + { + "epoch": 96.95, + "learning_rate": 1.806094182825485e-05, + "loss": 4.0446, + "step": 140000 + }, + { + "epoch": 97.0, + "eval_loss": 3.9668915271759033, + "eval_runtime": 74.5714, + "eval_samples_per_second": 6603.39, + "eval_steps_per_second": 2.159, + "step": 140068 + }, + { + "epoch": 97.3, + "learning_rate": 1.8054016620498614e-05, + "loss": 4.046, + "step": 140500 + }, + { + "epoch": 97.65, + "learning_rate": 1.8047091412742385e-05, + "loss": 4.0446, + "step": 141000 + }, + { + "epoch": 97.99, + "learning_rate": 1.804016620498615e-05, + "loss": 4.0432, + "step": 141500 + }, + { + "epoch": 98.0, + "eval_loss": 3.958078145980835, + "eval_runtime": 76.6585, + "eval_samples_per_second": 6423.607, + "eval_steps_per_second": 2.1, + "step": 141512 + }, + { + "epoch": 98.34, + "learning_rate": 1.803324099722992e-05, + "loss": 4.043, + "step": 142000 + }, + { + "epoch": 98.68, + "learning_rate": 1.8026315789473685e-05, + "loss": 4.0413, + "step": 142500 + }, + { + "epoch": 99.0, + "eval_loss": 3.9584155082702637, + "eval_runtime": 74.9791, + "eval_samples_per_second": 6567.486, + "eval_steps_per_second": 2.147, + "step": 142956 + }, + { + "epoch": 99.03, + "learning_rate": 1.8019390581717452e-05, + "loss": 4.0386, + "step": 143000 + }, + { + "epoch": 99.38, + "learning_rate": 1.801246537396122e-05, + "loss": 4.0369, + "step": 143500 + }, + { + "epoch": 99.72, + "learning_rate": 1.8005540166204986e-05, + "loss": 4.0354, + "step": 144000 + }, + { + "epoch": 100.0, + "eval_loss": 3.9502811431884766, + "eval_runtime": 75.7506, + "eval_samples_per_second": 6500.598, + "eval_steps_per_second": 2.125, + "step": 144400 + }, + { + "epoch": 100.07, + "learning_rate": 1.7998614958448753e-05, + "loss": 4.0367, + "step": 144500 + }, + { + "epoch": 100.42, + "learning_rate": 1.7991689750692523e-05, + "loss": 4.0353, + "step": 145000 + }, + { + "epoch": 100.76, + "learning_rate": 1.798476454293629e-05, + "loss": 4.0357, + "step": 145500 + }, + { + "epoch": 101.0, + "eval_loss": 3.948287010192871, + "eval_runtime": 75.3101, + "eval_samples_per_second": 6538.622, + "eval_steps_per_second": 2.138, + "step": 145844 + }, + { + "epoch": 101.11, + "learning_rate": 1.7977839335180057e-05, + "loss": 4.0344, + "step": 146000 + }, + { + "epoch": 101.45, + "learning_rate": 1.7970914127423824e-05, + "loss": 4.0308, + "step": 146500 + }, + { + "epoch": 101.8, + "learning_rate": 1.796398891966759e-05, + "loss": 4.0321, + "step": 147000 + }, + { + "epoch": 102.0, + "eval_loss": 3.9529218673706055, + "eval_runtime": 74.5513, + "eval_samples_per_second": 6605.174, + "eval_steps_per_second": 2.16, + "step": 147288 + }, + { + "epoch": 102.15, + "learning_rate": 1.7957063711911358e-05, + "loss": 4.0305, + "step": 147500 + }, + { + "epoch": 102.49, + "learning_rate": 1.7950138504155125e-05, + "loss": 4.0291, + "step": 148000 + }, + { + "epoch": 102.84, + "learning_rate": 1.7943213296398895e-05, + "loss": 4.0292, + "step": 148500 + }, + { + "epoch": 103.0, + "eval_loss": 3.9466750621795654, + "eval_runtime": 76.3415, + "eval_samples_per_second": 6450.282, + "eval_steps_per_second": 2.109, + "step": 148732 + }, + { + "epoch": 103.19, + "learning_rate": 1.7936288088642662e-05, + "loss": 4.0281, + "step": 149000 + }, + { + "epoch": 103.53, + "learning_rate": 1.7929362880886426e-05, + "loss": 4.0281, + "step": 149500 + }, + { + "epoch": 103.88, + "learning_rate": 1.7922437673130196e-05, + "loss": 4.0271, + "step": 150000 + }, + { + "epoch": 104.0, + "eval_loss": 3.9457340240478516, + "eval_runtime": 77.413, + "eval_samples_per_second": 6361.001, + "eval_steps_per_second": 2.08, + "step": 150176 + }, + { + "epoch": 104.22, + "learning_rate": 1.7915512465373963e-05, + "loss": 4.0259, + "step": 150500 + }, + { + "epoch": 104.57, + "learning_rate": 1.790858725761773e-05, + "loss": 4.0247, + "step": 151000 + }, + { + "epoch": 104.92, + "learning_rate": 1.7901662049861497e-05, + "loss": 4.0245, + "step": 151500 + }, + { + "epoch": 105.0, + "eval_loss": 3.944822311401367, + "eval_runtime": 79.2928, + "eval_samples_per_second": 6210.2, + "eval_steps_per_second": 2.03, + "step": 151620 + }, + { + "epoch": 105.26, + "learning_rate": 1.7894736842105264e-05, + "loss": 4.0233, + "step": 152000 + }, + { + "epoch": 105.61, + "learning_rate": 1.7887811634349034e-05, + "loss": 4.0232, + "step": 152500 + }, + { + "epoch": 105.96, + "learning_rate": 1.7880886426592798e-05, + "loss": 4.0204, + "step": 153000 + }, + { + "epoch": 106.0, + "eval_loss": 3.9429657459259033, + "eval_runtime": 76.6917, + "eval_samples_per_second": 6420.824, + "eval_steps_per_second": 2.099, + "step": 153064 + }, + { + "epoch": 106.3, + "learning_rate": 1.7873961218836568e-05, + "loss": 4.0201, + "step": 153500 + }, + { + "epoch": 106.65, + "learning_rate": 1.7867036011080335e-05, + "loss": 4.0191, + "step": 154000 + }, + { + "epoch": 106.99, + "learning_rate": 1.7860110803324102e-05, + "loss": 4.0218, + "step": 154500 + }, + { + "epoch": 107.0, + "eval_loss": 3.9408297538757324, + "eval_runtime": 75.1357, + "eval_samples_per_second": 6553.796, + "eval_steps_per_second": 2.143, + "step": 154508 + }, + { + "epoch": 107.34, + "learning_rate": 1.785318559556787e-05, + "loss": 4.0232, + "step": 155000 + }, + { + "epoch": 107.69, + "learning_rate": 1.7846260387811636e-05, + "loss": 4.018, + "step": 155500 + }, + { + "epoch": 108.0, + "eval_loss": 3.9399232864379883, + "eval_runtime": 74.43, + "eval_samples_per_second": 6615.938, + "eval_steps_per_second": 2.163, + "step": 155952 + }, + { + "epoch": 108.03, + "learning_rate": 1.7839335180055402e-05, + "loss": 4.0175, + "step": 156000 + }, + { + "epoch": 108.38, + "learning_rate": 1.783240997229917e-05, + "loss": 4.0165, + "step": 156500 + }, + { + "epoch": 108.73, + "learning_rate": 1.7825484764542936e-05, + "loss": 4.0137, + "step": 157000 + }, + { + "epoch": 109.0, + "eval_loss": 3.935981035232544, + "eval_runtime": 75.1558, + "eval_samples_per_second": 6552.043, + "eval_steps_per_second": 2.142, + "step": 157396 + }, + { + "epoch": 109.07, + "learning_rate": 1.7818559556786707e-05, + "loss": 4.0159, + "step": 157500 + }, + { + "epoch": 109.42, + "learning_rate": 1.7811634349030474e-05, + "loss": 4.0177, + "step": 158000 + }, + { + "epoch": 109.76, + "learning_rate": 1.780470914127424e-05, + "loss": 4.0114, + "step": 158500 + }, + { + "epoch": 110.0, + "eval_loss": 3.934030771255493, + "eval_runtime": 75.3274, + "eval_samples_per_second": 6537.118, + "eval_steps_per_second": 2.137, + "step": 158840 + }, + { + "epoch": 110.11, + "learning_rate": 1.7797783933518007e-05, + "loss": 4.011, + "step": 159000 + }, + { + "epoch": 110.46, + "learning_rate": 1.7790858725761774e-05, + "loss": 4.0116, + "step": 159500 + }, + { + "epoch": 110.8, + "learning_rate": 1.778393351800554e-05, + "loss": 4.0122, + "step": 160000 + }, + { + "epoch": 111.0, + "eval_loss": 3.9282586574554443, + "eval_runtime": 75.7551, + "eval_samples_per_second": 6500.211, + "eval_steps_per_second": 2.125, + "step": 160284 + }, + { + "epoch": 111.15, + "learning_rate": 1.7777008310249308e-05, + "loss": 4.0088, + "step": 160500 + }, + { + "epoch": 111.5, + "learning_rate": 1.7770083102493075e-05, + "loss": 4.0097, + "step": 161000 + }, + { + "epoch": 111.84, + "learning_rate": 1.7763157894736845e-05, + "loss": 4.0093, + "step": 161500 + }, + { + "epoch": 112.0, + "eval_loss": 3.9263861179351807, + "eval_runtime": 77.7083, + "eval_samples_per_second": 6336.824, + "eval_steps_per_second": 2.072, + "step": 161728 + }, + { + "epoch": 112.19, + "learning_rate": 1.775623268698061e-05, + "loss": 4.0096, + "step": 162000 + }, + { + "epoch": 112.53, + "learning_rate": 1.774930747922438e-05, + "loss": 4.0042, + "step": 162500 + }, + { + "epoch": 112.88, + "learning_rate": 1.7742382271468146e-05, + "loss": 4.0068, + "step": 163000 + }, + { + "epoch": 113.0, + "eval_loss": 3.9213218688964844, + "eval_runtime": 76.9094, + "eval_samples_per_second": 6402.654, + "eval_steps_per_second": 2.093, + "step": 163172 + }, + { + "epoch": 113.23, + "learning_rate": 1.7735457063711913e-05, + "loss": 4.005, + "step": 163500 + }, + { + "epoch": 113.57, + "learning_rate": 1.772853185595568e-05, + "loss": 4.0043, + "step": 164000 + }, + { + "epoch": 113.92, + "learning_rate": 1.7721606648199447e-05, + "loss": 4.002, + "step": 164500 + }, + { + "epoch": 114.0, + "eval_loss": 3.9210433959960938, + "eval_runtime": 73.884, + "eval_samples_per_second": 6664.824, + "eval_steps_per_second": 2.179, + "step": 164616 + }, + { + "epoch": 114.27, + "learning_rate": 1.7714681440443214e-05, + "loss": 4.0039, + "step": 165000 + }, + { + "epoch": 114.61, + "learning_rate": 1.770775623268698e-05, + "loss": 4.0012, + "step": 165500 + }, + { + "epoch": 114.96, + "learning_rate": 1.7700831024930748e-05, + "loss": 4.0013, + "step": 166000 + }, + { + "epoch": 115.0, + "eval_loss": 3.9188640117645264, + "eval_runtime": 74.0122, + "eval_samples_per_second": 6653.285, + "eval_steps_per_second": 2.175, + "step": 166060 + }, + { + "epoch": 115.3, + "learning_rate": 1.7693905817174518e-05, + "loss": 4.0011, + "step": 166500 + }, + { + "epoch": 115.65, + "learning_rate": 1.7686980609418285e-05, + "loss": 4.0031, + "step": 167000 + }, + { + "epoch": 116.0, + "learning_rate": 1.7680055401662052e-05, + "loss": 3.9978, + "step": 167500 + }, + { + "epoch": 116.0, + "eval_loss": 3.920518398284912, + "eval_runtime": 74.6861, + "eval_samples_per_second": 6593.245, + "eval_steps_per_second": 2.156, + "step": 167504 + }, + { + "epoch": 116.34, + "learning_rate": 1.767313019390582e-05, + "loss": 3.9936, + "step": 168000 + }, + { + "epoch": 116.69, + "learning_rate": 1.7666204986149586e-05, + "loss": 3.9978, + "step": 168500 + }, + { + "epoch": 117.0, + "eval_loss": 3.918830394744873, + "eval_runtime": 74.6329, + "eval_samples_per_second": 6597.945, + "eval_steps_per_second": 2.157, + "step": 168948 + }, + { + "epoch": 117.04, + "learning_rate": 1.7659279778393353e-05, + "loss": 3.9979, + "step": 169000 + }, + { + "epoch": 117.38, + "learning_rate": 1.765235457063712e-05, + "loss": 3.9972, + "step": 169500 + }, + { + "epoch": 117.73, + "learning_rate": 1.764542936288089e-05, + "loss": 3.9966, + "step": 170000 + }, + { + "epoch": 118.0, + "eval_loss": 3.914811611175537, + "eval_runtime": 74.0729, + "eval_samples_per_second": 6647.825, + "eval_steps_per_second": 2.174, + "step": 170392 + }, + { + "epoch": 118.07, + "learning_rate": 1.7638504155124657e-05, + "loss": 3.9952, + "step": 170500 + }, + { + "epoch": 118.42, + "learning_rate": 1.763157894736842e-05, + "loss": 3.9942, + "step": 171000 + }, + { + "epoch": 118.77, + "learning_rate": 1.762465373961219e-05, + "loss": 3.9923, + "step": 171500 + }, + { + "epoch": 119.0, + "eval_loss": 3.915539503097534, + "eval_runtime": 77.3659, + "eval_samples_per_second": 6364.87, + "eval_steps_per_second": 2.081, + "step": 171836 + }, + { + "epoch": 119.11, + "learning_rate": 1.7617728531855957e-05, + "loss": 3.9941, + "step": 172000 + }, + { + "epoch": 119.46, + "learning_rate": 1.7610803324099724e-05, + "loss": 3.9925, + "step": 172500 + }, + { + "epoch": 119.81, + "learning_rate": 1.760387811634349e-05, + "loss": 3.9901, + "step": 173000 + }, + { + "epoch": 120.0, + "eval_loss": 3.9078481197357178, + "eval_runtime": 73.0002, + "eval_samples_per_second": 6745.514, + "eval_steps_per_second": 2.205, + "step": 173280 + }, + { + "epoch": 120.15, + "learning_rate": 1.7596952908587258e-05, + "loss": 3.9908, + "step": 173500 + }, + { + "epoch": 120.5, + "learning_rate": 1.759002770083103e-05, + "loss": 3.9875, + "step": 174000 + }, + { + "epoch": 120.84, + "learning_rate": 1.7583102493074792e-05, + "loss": 3.989, + "step": 174500 + }, + { + "epoch": 121.0, + "eval_loss": 3.9121310710906982, + "eval_runtime": 73.6203, + "eval_samples_per_second": 6688.695, + "eval_steps_per_second": 2.187, + "step": 174724 + }, + { + "epoch": 121.19, + "learning_rate": 1.7576177285318562e-05, + "loss": 3.987, + "step": 175000 + }, + { + "epoch": 121.54, + "learning_rate": 1.756925207756233e-05, + "loss": 3.991, + "step": 175500 + }, + { + "epoch": 121.88, + "learning_rate": 1.7562326869806096e-05, + "loss": 3.9876, + "step": 176000 + }, + { + "epoch": 122.0, + "eval_loss": 3.9086544513702393, + "eval_runtime": 74.7783, + "eval_samples_per_second": 6585.118, + "eval_steps_per_second": 2.153, + "step": 176168 + }, + { + "epoch": 122.23, + "learning_rate": 1.7555401662049863e-05, + "loss": 3.9863, + "step": 176500 + }, + { + "epoch": 122.58, + "learning_rate": 1.754847645429363e-05, + "loss": 3.9866, + "step": 177000 + }, + { + "epoch": 122.92, + "learning_rate": 1.7541551246537397e-05, + "loss": 3.9856, + "step": 177500 + }, + { + "epoch": 123.0, + "eval_loss": 3.9068329334259033, + "eval_runtime": 74.5275, + "eval_samples_per_second": 6607.276, + "eval_steps_per_second": 2.16, + "step": 177612 + }, + { + "epoch": 123.27, + "learning_rate": 1.7534626038781164e-05, + "loss": 3.9857, + "step": 178000 + }, + { + "epoch": 123.61, + "learning_rate": 1.752770083102493e-05, + "loss": 3.9853, + "step": 178500 + }, + { + "epoch": 123.96, + "learning_rate": 1.75207756232687e-05, + "loss": 3.9834, + "step": 179000 + }, + { + "epoch": 124.0, + "eval_loss": 3.90472412109375, + "eval_runtime": 74.8567, + "eval_samples_per_second": 6578.224, + "eval_steps_per_second": 2.151, + "step": 179056 + }, + { + "epoch": 124.31, + "learning_rate": 1.7513850415512468e-05, + "loss": 3.9833, + "step": 179500 + }, + { + "epoch": 124.65, + "learning_rate": 1.7506925207756235e-05, + "loss": 3.9812, + "step": 180000 + }, + { + "epoch": 125.0, + "learning_rate": 1.7500000000000002e-05, + "loss": 3.9786, + "step": 180500 + }, + { + "epoch": 125.0, + "eval_loss": 3.89973783493042, + "eval_runtime": 73.8136, + "eval_samples_per_second": 6671.181, + "eval_steps_per_second": 2.181, + "step": 180500 + }, + { + "epoch": 125.35, + "learning_rate": 1.749307479224377e-05, + "loss": 3.9793, + "step": 181000 + }, + { + "epoch": 125.69, + "learning_rate": 1.7486149584487536e-05, + "loss": 3.9788, + "step": 181500 + }, + { + "epoch": 126.0, + "eval_loss": 3.900991201400757, + "eval_runtime": 76.4593, + "eval_samples_per_second": 6440.341, + "eval_steps_per_second": 2.106, + "step": 181944 + }, + { + "epoch": 126.04, + "learning_rate": 1.7479224376731303e-05, + "loss": 3.98, + "step": 182000 + }, + { + "epoch": 126.39, + "learning_rate": 1.747229916897507e-05, + "loss": 3.979, + "step": 182500 + }, + { + "epoch": 126.73, + "learning_rate": 1.746537396121884e-05, + "loss": 3.9782, + "step": 183000 + }, + { + "epoch": 127.0, + "eval_loss": 3.898531436920166, + "eval_runtime": 73.137, + "eval_samples_per_second": 6732.899, + "eval_steps_per_second": 2.201, + "step": 183388 + }, + { + "epoch": 127.08, + "learning_rate": 1.7458448753462603e-05, + "loss": 3.9772, + "step": 183500 + }, + { + "epoch": 127.42, + "learning_rate": 1.7451523545706374e-05, + "loss": 3.9759, + "step": 184000 + }, + { + "epoch": 127.77, + "learning_rate": 1.744459833795014e-05, + "loss": 3.9733, + "step": 184500 + }, + { + "epoch": 128.0, + "eval_loss": 3.892979145050049, + "eval_runtime": 74.6554, + "eval_samples_per_second": 6595.957, + "eval_steps_per_second": 2.157, + "step": 184832 + }, + { + "epoch": 128.12, + "learning_rate": 1.7437673130193908e-05, + "loss": 3.9761, + "step": 185000 + }, + { + "epoch": 128.46, + "learning_rate": 1.7430747922437674e-05, + "loss": 3.9733, + "step": 185500 + }, + { + "epoch": 128.81, + "learning_rate": 1.742382271468144e-05, + "loss": 3.9765, + "step": 186000 + }, + { + "epoch": 129.0, + "eval_loss": 3.8959548473358154, + "eval_runtime": 72.9991, + "eval_samples_per_second": 6745.614, + "eval_steps_per_second": 2.206, + "step": 186276 + }, + { + "epoch": 129.16, + "learning_rate": 1.741689750692521e-05, + "loss": 3.9713, + "step": 186500 + }, + { + "epoch": 129.5, + "learning_rate": 1.7409972299168975e-05, + "loss": 3.9714, + "step": 187000 + }, + { + "epoch": 129.85, + "learning_rate": 1.7403047091412742e-05, + "loss": 3.9733, + "step": 187500 + }, + { + "epoch": 130.0, + "eval_loss": 3.8933827877044678, + "eval_runtime": 75.3723, + "eval_samples_per_second": 6533.221, + "eval_steps_per_second": 2.136, + "step": 187720 + }, + { + "epoch": 130.19, + "learning_rate": 1.7396121883656512e-05, + "loss": 3.9725, + "step": 188000 + }, + { + "epoch": 130.54, + "learning_rate": 1.738919667590028e-05, + "loss": 3.9698, + "step": 188500 + }, + { + "epoch": 130.89, + "learning_rate": 1.7382271468144046e-05, + "loss": 3.9695, + "step": 189000 + }, + { + "epoch": 131.0, + "eval_loss": 3.888564348220825, + "eval_runtime": 76.8687, + "eval_samples_per_second": 6406.041, + "eval_steps_per_second": 2.094, + "step": 189164 + }, + { + "epoch": 131.23, + "learning_rate": 1.7375346260387813e-05, + "loss": 3.9677, + "step": 189500 + }, + { + "epoch": 131.58, + "learning_rate": 1.736842105263158e-05, + "loss": 3.9709, + "step": 190000 + }, + { + "epoch": 131.93, + "learning_rate": 1.7361495844875347e-05, + "loss": 3.9708, + "step": 190500 + }, + { + "epoch": 132.0, + "eval_loss": 3.889085054397583, + "eval_runtime": 73.3155, + "eval_samples_per_second": 6716.506, + "eval_steps_per_second": 2.196, + "step": 190608 + }, + { + "epoch": 132.27, + "learning_rate": 1.7354570637119114e-05, + "loss": 3.9635, + "step": 191000 + }, + { + "epoch": 132.62, + "learning_rate": 1.7347645429362884e-05, + "loss": 3.9658, + "step": 191500 + }, + { + "epoch": 132.96, + "learning_rate": 1.734072022160665e-05, + "loss": 3.9673, + "step": 192000 + }, + { + "epoch": 133.0, + "eval_loss": 3.885761022567749, + "eval_runtime": 72.7481, + "eval_samples_per_second": 6768.893, + "eval_steps_per_second": 2.213, + "step": 192052 + }, + { + "epoch": 133.31, + "learning_rate": 1.7333795013850415e-05, + "loss": 3.9664, + "step": 192500 + }, + { + "epoch": 133.66, + "learning_rate": 1.7326869806094185e-05, + "loss": 3.9635, + "step": 193000 + }, + { + "epoch": 134.0, + "eval_loss": 3.8835644721984863, + "eval_runtime": 72.9182, + "eval_samples_per_second": 6753.106, + "eval_steps_per_second": 2.208, + "step": 193496 + }, + { + "epoch": 134.0, + "learning_rate": 1.7319944598337952e-05, + "loss": 3.9667, + "step": 193500 + }, + { + "epoch": 134.35, + "learning_rate": 1.731301939058172e-05, + "loss": 3.9627, + "step": 194000 + }, + { + "epoch": 134.7, + "learning_rate": 1.7306094182825486e-05, + "loss": 3.962, + "step": 194500 + }, + { + "epoch": 135.0, + "eval_loss": 3.8817522525787354, + "eval_runtime": 74.832, + "eval_samples_per_second": 6580.394, + "eval_steps_per_second": 2.151, + "step": 194940 + }, + { + "epoch": 135.04, + "learning_rate": 1.7299168975069253e-05, + "loss": 3.9648, + "step": 195000 + }, + { + "epoch": 135.39, + "learning_rate": 1.7292243767313023e-05, + "loss": 3.9619, + "step": 195500 + }, + { + "epoch": 135.73, + "learning_rate": 1.7285318559556787e-05, + "loss": 3.9635, + "step": 196000 + }, + { + "epoch": 136.0, + "eval_loss": 3.8839354515075684, + "eval_runtime": 75.8115, + "eval_samples_per_second": 6495.378, + "eval_steps_per_second": 2.124, + "step": 196384 + }, + { + "epoch": 136.08, + "learning_rate": 1.7278393351800557e-05, + "loss": 3.9585, + "step": 196500 + }, + { + "epoch": 136.43, + "learning_rate": 1.7271468144044324e-05, + "loss": 3.9597, + "step": 197000 + }, + { + "epoch": 136.77, + "learning_rate": 1.7264542936288087e-05, + "loss": 3.9601, + "step": 197500 + }, + { + "epoch": 137.0, + "eval_loss": 3.8804266452789307, + "eval_runtime": 75.6839, + "eval_samples_per_second": 6506.321, + "eval_steps_per_second": 2.127, + "step": 197828 + }, + { + "epoch": 137.12, + "learning_rate": 1.7257617728531858e-05, + "loss": 3.9609, + "step": 198000 + }, + { + "epoch": 137.47, + "learning_rate": 1.7250692520775625e-05, + "loss": 3.956, + "step": 198500 + }, + { + "epoch": 137.81, + "learning_rate": 1.724376731301939e-05, + "loss": 3.9593, + "step": 199000 + }, + { + "epoch": 138.0, + "eval_loss": 3.879089117050171, + "eval_runtime": 74.4504, + "eval_samples_per_second": 6614.122, + "eval_steps_per_second": 2.163, + "step": 199272 + }, + { + "epoch": 138.16, + "learning_rate": 1.723684210526316e-05, + "loss": 3.9569, + "step": 199500 + }, + { + "epoch": 138.5, + "learning_rate": 1.7229916897506925e-05, + "loss": 3.9541, + "step": 200000 + }, + { + "epoch": 138.85, + "learning_rate": 1.7222991689750696e-05, + "loss": 3.9566, + "step": 200500 + }, + { + "epoch": 139.0, + "eval_loss": 3.8803153038024902, + "eval_runtime": 77.8029, + "eval_samples_per_second": 6329.123, + "eval_steps_per_second": 2.069, + "step": 200716 + }, + { + "epoch": 139.2, + "learning_rate": 1.7216066481994462e-05, + "loss": 3.956, + "step": 201000 + }, + { + "epoch": 139.54, + "learning_rate": 1.720914127423823e-05, + "loss": 3.9517, + "step": 201500 + }, + { + "epoch": 139.89, + "learning_rate": 1.7202216066481996e-05, + "loss": 3.9535, + "step": 202000 + }, + { + "epoch": 140.0, + "eval_loss": 3.872063159942627, + "eval_runtime": 75.6675, + "eval_samples_per_second": 6507.738, + "eval_steps_per_second": 2.128, + "step": 202160 + }, + { + "epoch": 140.24, + "learning_rate": 1.7195290858725763e-05, + "loss": 3.9534, + "step": 202500 + }, + { + "epoch": 140.58, + "learning_rate": 1.718836565096953e-05, + "loss": 3.9534, + "step": 203000 + }, + { + "epoch": 140.93, + "learning_rate": 1.7181440443213297e-05, + "loss": 3.9525, + "step": 203500 + }, + { + "epoch": 141.0, + "eval_loss": 3.8730010986328125, + "eval_runtime": 73.7321, + "eval_samples_per_second": 6678.556, + "eval_steps_per_second": 2.184, + "step": 203604 + }, + { + "epoch": 141.27, + "learning_rate": 1.7174515235457064e-05, + "loss": 3.9519, + "step": 204000 + }, + { + "epoch": 141.62, + "learning_rate": 1.7167590027700834e-05, + "loss": 3.9488, + "step": 204500 + }, + { + "epoch": 141.97, + "learning_rate": 1.7160664819944598e-05, + "loss": 3.9515, + "step": 205000 + }, + { + "epoch": 142.0, + "eval_loss": 3.870518207550049, + "eval_runtime": 74.6993, + "eval_samples_per_second": 6592.087, + "eval_steps_per_second": 2.155, + "step": 205048 + }, + { + "epoch": 142.31, + "learning_rate": 1.7153739612188368e-05, + "loss": 3.9499, + "step": 205500 + }, + { + "epoch": 142.66, + "learning_rate": 1.7146814404432135e-05, + "loss": 3.9488, + "step": 206000 + }, + { + "epoch": 143.0, + "eval_loss": 3.868440628051758, + "eval_runtime": 73.8028, + "eval_samples_per_second": 6672.162, + "eval_steps_per_second": 2.181, + "step": 206492 + }, + { + "epoch": 143.01, + "learning_rate": 1.7139889196675902e-05, + "loss": 3.9525, + "step": 206500 + }, + { + "epoch": 143.35, + "learning_rate": 1.713296398891967e-05, + "loss": 3.9484, + "step": 207000 + }, + { + "epoch": 143.7, + "learning_rate": 1.7126038781163436e-05, + "loss": 3.944, + "step": 207500 + }, + { + "epoch": 144.0, + "eval_loss": 3.8690621852874756, + "eval_runtime": 74.7082, + "eval_samples_per_second": 6591.3, + "eval_steps_per_second": 2.155, + "step": 207936 + }, + { + "epoch": 144.04, + "learning_rate": 1.7119113573407206e-05, + "loss": 3.9512, + "step": 208000 + }, + { + "epoch": 144.39, + "learning_rate": 1.711218836565097e-05, + "loss": 3.9497, + "step": 208500 + }, + { + "epoch": 144.74, + "learning_rate": 1.7105263157894737e-05, + "loss": 3.9455, + "step": 209000 + }, + { + "epoch": 145.0, + "eval_loss": 3.8670220375061035, + "eval_runtime": 74.3832, + "eval_samples_per_second": 6620.095, + "eval_steps_per_second": 2.164, + "step": 209380 + }, + { + "epoch": 145.08, + "learning_rate": 1.7098337950138507e-05, + "loss": 3.9445, + "step": 209500 + }, + { + "epoch": 145.43, + "learning_rate": 1.709141274238227e-05, + "loss": 3.945, + "step": 210000 + }, + { + "epoch": 145.78, + "learning_rate": 1.708448753462604e-05, + "loss": 3.9439, + "step": 210500 + }, + { + "epoch": 146.0, + "eval_loss": 3.865227699279785, + "eval_runtime": 76.1017, + "eval_samples_per_second": 6470.6, + "eval_steps_per_second": 2.116, + "step": 210824 + }, + { + "epoch": 146.12, + "learning_rate": 1.7077562326869808e-05, + "loss": 3.9444, + "step": 211000 + }, + { + "epoch": 146.47, + "learning_rate": 1.7070637119113575e-05, + "loss": 3.9402, + "step": 211500 + }, + { + "epoch": 146.81, + "learning_rate": 1.706371191135734e-05, + "loss": 3.9431, + "step": 212000 + }, + { + "epoch": 147.0, + "eval_loss": 3.8667104244232178, + "eval_runtime": 74.0634, + "eval_samples_per_second": 6648.686, + "eval_steps_per_second": 2.174, + "step": 212268 + }, + { + "epoch": 147.16, + "learning_rate": 1.705678670360111e-05, + "loss": 3.9421, + "step": 212500 + }, + { + "epoch": 147.51, + "learning_rate": 1.704986149584488e-05, + "loss": 3.9406, + "step": 213000 + }, + { + "epoch": 147.85, + "learning_rate": 1.7042936288088646e-05, + "loss": 3.9422, + "step": 213500 + }, + { + "epoch": 148.0, + "eval_loss": 3.8651583194732666, + "eval_runtime": 74.2586, + "eval_samples_per_second": 6631.208, + "eval_steps_per_second": 2.168, + "step": 213712 + }, + { + "epoch": 148.2, + "learning_rate": 1.703601108033241e-05, + "loss": 3.9379, + "step": 214000 + }, + { + "epoch": 148.55, + "learning_rate": 1.702908587257618e-05, + "loss": 3.9402, + "step": 214500 + }, + { + "epoch": 148.89, + "learning_rate": 1.7022160664819946e-05, + "loss": 3.9389, + "step": 215000 + }, + { + "epoch": 149.0, + "eval_loss": 3.8652150630950928, + "eval_runtime": 74.5415, + "eval_samples_per_second": 6606.034, + "eval_steps_per_second": 2.16, + "step": 215156 + }, + { + "epoch": 149.24, + "learning_rate": 1.7015235457063713e-05, + "loss": 3.9387, + "step": 215500 + }, + { + "epoch": 149.58, + "learning_rate": 1.700831024930748e-05, + "loss": 3.9394, + "step": 216000 + }, + { + "epoch": 149.93, + "learning_rate": 1.7001385041551247e-05, + "loss": 3.9387, + "step": 216500 + }, + { + "epoch": 150.0, + "eval_loss": 3.8585476875305176, + "eval_runtime": 75.2231, + "eval_samples_per_second": 6546.184, + "eval_steps_per_second": 2.14, + "step": 216600 + }, + { + "epoch": 150.28, + "learning_rate": 1.6994459833795017e-05, + "loss": 3.9363, + "step": 217000 + }, + { + "epoch": 150.62, + "learning_rate": 1.698753462603878e-05, + "loss": 3.9406, + "step": 217500 + }, + { + "epoch": 150.97, + "learning_rate": 1.698060941828255e-05, + "loss": 3.9375, + "step": 218000 + }, + { + "epoch": 151.0, + "eval_loss": 3.8584671020507812, + "eval_runtime": 75.4049, + "eval_samples_per_second": 6530.4, + "eval_steps_per_second": 2.135, + "step": 218044 + }, + { + "epoch": 151.32, + "learning_rate": 1.6973684210526318e-05, + "loss": 3.9365, + "step": 218500 + }, + { + "epoch": 151.66, + "learning_rate": 1.6966759002770085e-05, + "loss": 3.9365, + "step": 219000 + }, + { + "epoch": 152.0, + "eval_loss": 3.8540356159210205, + "eval_runtime": 72.8254, + "eval_samples_per_second": 6761.71, + "eval_steps_per_second": 2.211, + "step": 219488 + }, + { + "epoch": 152.01, + "learning_rate": 1.6959833795013852e-05, + "loss": 3.9354, + "step": 219500 + }, + { + "epoch": 152.35, + "learning_rate": 1.695290858725762e-05, + "loss": 3.9347, + "step": 220000 + }, + { + "epoch": 152.7, + "learning_rate": 1.6945983379501386e-05, + "loss": 3.9313, + "step": 220500 + }, + { + "epoch": 153.0, + "eval_loss": 3.853323459625244, + "eval_runtime": 74.3001, + "eval_samples_per_second": 6627.5, + "eval_steps_per_second": 2.167, + "step": 220932 + }, + { + "epoch": 153.05, + "learning_rate": 1.6939058171745153e-05, + "loss": 3.9335, + "step": 221000 + }, + { + "epoch": 153.39, + "learning_rate": 1.693213296398892e-05, + "loss": 3.932, + "step": 221500 + }, + { + "epoch": 153.74, + "learning_rate": 1.692520775623269e-05, + "loss": 3.9287, + "step": 222000 + }, + { + "epoch": 154.0, + "eval_loss": 3.853574275970459, + "eval_runtime": 74.3871, + "eval_samples_per_second": 6619.754, + "eval_steps_per_second": 2.164, + "step": 222376 + }, + { + "epoch": 154.09, + "learning_rate": 1.6918282548476454e-05, + "loss": 3.935, + "step": 222500 + }, + { + "epoch": 154.43, + "learning_rate": 1.6911357340720224e-05, + "loss": 3.9323, + "step": 223000 + }, + { + "epoch": 154.78, + "learning_rate": 1.690443213296399e-05, + "loss": 3.9304, + "step": 223500 + }, + { + "epoch": 155.0, + "eval_loss": 3.854647636413574, + "eval_runtime": 75.0693, + "eval_samples_per_second": 6559.592, + "eval_steps_per_second": 2.145, + "step": 223820 + }, + { + "epoch": 155.12, + "learning_rate": 1.6897506925207758e-05, + "loss": 3.9285, + "step": 224000 + }, + { + "epoch": 155.47, + "learning_rate": 1.6890581717451525e-05, + "loss": 3.927, + "step": 224500 + }, + { + "epoch": 155.82, + "learning_rate": 1.688365650969529e-05, + "loss": 3.9304, + "step": 225000 + }, + { + "epoch": 156.0, + "eval_loss": 3.8501617908477783, + "eval_runtime": 75.8197, + "eval_samples_per_second": 6494.673, + "eval_steps_per_second": 2.123, + "step": 225264 + }, + { + "epoch": 156.16, + "learning_rate": 1.687673130193906e-05, + "loss": 3.9299, + "step": 225500 + }, + { + "epoch": 156.51, + "learning_rate": 1.686980609418283e-05, + "loss": 3.9282, + "step": 226000 + }, + { + "epoch": 156.86, + "learning_rate": 1.6862880886426592e-05, + "loss": 3.9257, + "step": 226500 + }, + { + "epoch": 157.0, + "eval_loss": 3.8476805686950684, + "eval_runtime": 73.4445, + "eval_samples_per_second": 6704.713, + "eval_steps_per_second": 2.192, + "step": 226708 + }, + { + "epoch": 157.2, + "learning_rate": 1.6855955678670363e-05, + "loss": 3.9263, + "step": 227000 + }, + { + "epoch": 157.55, + "learning_rate": 1.684903047091413e-05, + "loss": 3.9247, + "step": 227500 + }, + { + "epoch": 157.89, + "learning_rate": 1.6842105263157896e-05, + "loss": 3.9242, + "step": 228000 + }, + { + "epoch": 158.0, + "eval_loss": 3.8495914936065674, + "eval_runtime": 74.4187, + "eval_samples_per_second": 6616.942, + "eval_steps_per_second": 2.163, + "step": 228152 + }, + { + "epoch": 158.24, + "learning_rate": 1.6835180055401663e-05, + "loss": 3.9235, + "step": 228500 + }, + { + "epoch": 158.59, + "learning_rate": 1.682825484764543e-05, + "loss": 3.9275, + "step": 229000 + }, + { + "epoch": 158.93, + "learning_rate": 1.68213296398892e-05, + "loss": 3.9219, + "step": 229500 + }, + { + "epoch": 159.0, + "eval_loss": 3.850391387939453, + "eval_runtime": 74.2125, + "eval_samples_per_second": 6635.322, + "eval_steps_per_second": 2.169, + "step": 229596 + }, + { + "epoch": 159.28, + "learning_rate": 1.6814404432132964e-05, + "loss": 3.9203, + "step": 230000 + }, + { + "epoch": 159.63, + "learning_rate": 1.680747922437673e-05, + "loss": 3.9239, + "step": 230500 + }, + { + "epoch": 159.97, + "learning_rate": 1.68005540166205e-05, + "loss": 3.9236, + "step": 231000 + }, + { + "epoch": 160.0, + "eval_loss": 3.8478267192840576, + "eval_runtime": 74.9917, + "eval_samples_per_second": 6566.379, + "eval_steps_per_second": 2.147, + "step": 231040 + }, + { + "epoch": 160.32, + "learning_rate": 1.6793628808864265e-05, + "loss": 3.9238, + "step": 231500 + }, + { + "epoch": 160.66, + "learning_rate": 1.6786703601108035e-05, + "loss": 3.9222, + "step": 232000 + }, + { + "epoch": 161.0, + "eval_loss": 3.8385064601898193, + "eval_runtime": 74.3809, + "eval_samples_per_second": 6620.298, + "eval_steps_per_second": 2.165, + "step": 232484 + }, + { + "epoch": 161.01, + "learning_rate": 1.6779778393351802e-05, + "loss": 3.92, + "step": 232500 + }, + { + "epoch": 161.36, + "learning_rate": 1.677285318559557e-05, + "loss": 3.9184, + "step": 233000 + }, + { + "epoch": 161.7, + "learning_rate": 1.6765927977839336e-05, + "loss": 3.9231, + "step": 233500 + }, + { + "epoch": 162.0, + "eval_loss": 3.844747543334961, + "eval_runtime": 73.4727, + "eval_samples_per_second": 6702.138, + "eval_steps_per_second": 2.191, + "step": 233928 + }, + { + "epoch": 162.05, + "learning_rate": 1.6759002770083103e-05, + "loss": 3.9186, + "step": 234000 + }, + { + "epoch": 162.4, + "learning_rate": 1.6752077562326873e-05, + "loss": 3.9207, + "step": 234500 + }, + { + "epoch": 162.74, + "learning_rate": 1.674515235457064e-05, + "loss": 3.9161, + "step": 235000 + }, + { + "epoch": 163.0, + "eval_loss": 3.839341640472412, + "eval_runtime": 74.6904, + "eval_samples_per_second": 6592.87, + "eval_steps_per_second": 2.156, + "step": 235372 + }, + { + "epoch": 163.09, + "learning_rate": 1.6738227146814404e-05, + "loss": 3.9223, + "step": 235500 + }, + { + "epoch": 163.43, + "learning_rate": 1.6731301939058174e-05, + "loss": 3.9151, + "step": 236000 + }, + { + "epoch": 163.78, + "learning_rate": 1.672437673130194e-05, + "loss": 3.918, + "step": 236500 + }, + { + "epoch": 164.0, + "eval_loss": 3.8450632095336914, + "eval_runtime": 74.8435, + "eval_samples_per_second": 6579.385, + "eval_steps_per_second": 2.151, + "step": 236816 + }, + { + "epoch": 164.13, + "learning_rate": 1.6717451523545708e-05, + "loss": 3.9184, + "step": 237000 + }, + { + "epoch": 164.47, + "learning_rate": 1.6710526315789475e-05, + "loss": 3.9175, + "step": 237500 + }, + { + "epoch": 164.82, + "learning_rate": 1.670360110803324e-05, + "loss": 3.9153, + "step": 238000 + }, + { + "epoch": 165.0, + "eval_loss": 3.8364455699920654, + "eval_runtime": 75.2179, + "eval_samples_per_second": 6546.636, + "eval_steps_per_second": 2.14, + "step": 238260 + }, + { + "epoch": 165.17, + "learning_rate": 1.6696675900277012e-05, + "loss": 3.9151, + "step": 238500 + }, + { + "epoch": 165.51, + "learning_rate": 1.6689750692520775e-05, + "loss": 3.9139, + "step": 239000 + }, + { + "epoch": 165.86, + "learning_rate": 1.6682825484764546e-05, + "loss": 3.9158, + "step": 239500 + }, + { + "epoch": 166.0, + "eval_loss": 3.8352112770080566, + "eval_runtime": 74.8807, + "eval_samples_per_second": 6576.117, + "eval_steps_per_second": 2.15, + "step": 239704 + }, + { + "epoch": 166.2, + "learning_rate": 1.6675900277008313e-05, + "loss": 3.9151, + "step": 240000 + }, + { + "epoch": 166.55, + "learning_rate": 1.666897506925208e-05, + "loss": 3.9146, + "step": 240500 + }, + { + "epoch": 166.9, + "learning_rate": 1.6662049861495847e-05, + "loss": 3.9138, + "step": 241000 + }, + { + "epoch": 167.0, + "eval_loss": 3.835671901702881, + "eval_runtime": 74.7206, + "eval_samples_per_second": 6590.203, + "eval_steps_per_second": 2.155, + "step": 241148 + }, + { + "epoch": 167.24, + "learning_rate": 1.6655124653739613e-05, + "loss": 3.9147, + "step": 241500 + }, + { + "epoch": 167.59, + "learning_rate": 1.664819944598338e-05, + "loss": 3.9128, + "step": 242000 + }, + { + "epoch": 167.94, + "learning_rate": 1.6641274238227147e-05, + "loss": 3.9135, + "step": 242500 + }, + { + "epoch": 168.0, + "eval_loss": 3.829854726791382, + "eval_runtime": 72.8907, + "eval_samples_per_second": 6755.652, + "eval_steps_per_second": 2.209, + "step": 242592 + }, + { + "epoch": 168.28, + "learning_rate": 1.6634349030470914e-05, + "loss": 3.9116, + "step": 243000 + }, + { + "epoch": 168.63, + "learning_rate": 1.6627423822714685e-05, + "loss": 3.9104, + "step": 243500 + }, + { + "epoch": 168.98, + "learning_rate": 1.6620498614958448e-05, + "loss": 3.9086, + "step": 244000 + }, + { + "epoch": 169.0, + "eval_loss": 3.83186411857605, + "eval_runtime": 73.8836, + "eval_samples_per_second": 6664.86, + "eval_steps_per_second": 2.179, + "step": 244036 + }, + { + "epoch": 169.32, + "learning_rate": 1.661357340720222e-05, + "loss": 3.9102, + "step": 244500 + }, + { + "epoch": 169.67, + "learning_rate": 1.6606648199445985e-05, + "loss": 3.9093, + "step": 245000 + }, + { + "epoch": 170.0, + "eval_loss": 3.831186532974243, + "eval_runtime": 75.6023, + "eval_samples_per_second": 6513.346, + "eval_steps_per_second": 2.13, + "step": 245480 + }, + { + "epoch": 170.01, + "learning_rate": 1.6599722991689752e-05, + "loss": 3.9092, + "step": 245500 + }, + { + "epoch": 170.36, + "learning_rate": 1.659279778393352e-05, + "loss": 3.9083, + "step": 246000 + }, + { + "epoch": 170.71, + "learning_rate": 1.6585872576177286e-05, + "loss": 3.9087, + "step": 246500 + }, + { + "epoch": 171.0, + "eval_loss": 3.8296992778778076, + "eval_runtime": 74.0581, + "eval_samples_per_second": 6649.158, + "eval_steps_per_second": 2.174, + "step": 246924 + }, + { + "epoch": 171.05, + "learning_rate": 1.6578947368421053e-05, + "loss": 3.9097, + "step": 247000 + }, + { + "epoch": 171.4, + "learning_rate": 1.6572022160664823e-05, + "loss": 3.9103, + "step": 247500 + }, + { + "epoch": 171.75, + "learning_rate": 1.6565096952908587e-05, + "loss": 3.9078, + "step": 248000 + }, + { + "epoch": 172.0, + "eval_loss": 3.8280537128448486, + "eval_runtime": 74.6178, + "eval_samples_per_second": 6599.284, + "eval_steps_per_second": 2.158, + "step": 248368 + }, + { + "epoch": 172.09, + "learning_rate": 1.6558171745152357e-05, + "loss": 3.9065, + "step": 248500 + }, + { + "epoch": 172.44, + "learning_rate": 1.6551246537396124e-05, + "loss": 3.9052, + "step": 249000 + }, + { + "epoch": 172.78, + "learning_rate": 1.654432132963989e-05, + "loss": 3.9052, + "step": 249500 + }, + { + "epoch": 173.0, + "eval_loss": 3.828386068344116, + "eval_runtime": 74.5947, + "eval_samples_per_second": 6601.323, + "eval_steps_per_second": 2.158, + "step": 249812 + }, + { + "epoch": 173.13, + "learning_rate": 1.6537396121883658e-05, + "loss": 3.9037, + "step": 250000 + }, + { + "epoch": 173.48, + "learning_rate": 1.6530470914127425e-05, + "loss": 3.9046, + "step": 250500 + }, + { + "epoch": 173.82, + "learning_rate": 1.6523545706371195e-05, + "loss": 3.9059, + "step": 251000 + }, + { + "epoch": 174.0, + "eval_loss": 3.82259464263916, + "eval_runtime": 75.1329, + "eval_samples_per_second": 6554.041, + "eval_steps_per_second": 2.143, + "step": 251256 + }, + { + "epoch": 174.17, + "learning_rate": 1.651662049861496e-05, + "loss": 3.9044, + "step": 251500 + }, + { + "epoch": 174.52, + "learning_rate": 1.6509695290858726e-05, + "loss": 3.9041, + "step": 252000 + }, + { + "epoch": 174.86, + "learning_rate": 1.6502770083102496e-05, + "loss": 3.9006, + "step": 252500 + }, + { + "epoch": 175.0, + "eval_loss": 3.8257510662078857, + "eval_runtime": 75.0788, + "eval_samples_per_second": 6558.759, + "eval_steps_per_second": 2.144, + "step": 252700 + }, + { + "epoch": 175.21, + "learning_rate": 1.649584487534626e-05, + "loss": 3.9025, + "step": 253000 + }, + { + "epoch": 175.55, + "learning_rate": 1.648891966759003e-05, + "loss": 3.9015, + "step": 253500 + }, + { + "epoch": 175.9, + "learning_rate": 1.6481994459833797e-05, + "loss": 3.9046, + "step": 254000 + }, + { + "epoch": 176.0, + "eval_loss": 3.8246407508850098, + "eval_runtime": 76.1781, + "eval_samples_per_second": 6464.119, + "eval_steps_per_second": 2.113, + "step": 254144 + }, + { + "epoch": 176.25, + "learning_rate": 1.6475069252077564e-05, + "loss": 3.9034, + "step": 254500 + }, + { + "epoch": 176.59, + "learning_rate": 1.646814404432133e-05, + "loss": 3.8996, + "step": 255000 + }, + { + "epoch": 176.94, + "learning_rate": 1.6461218836565097e-05, + "loss": 3.9027, + "step": 255500 + }, + { + "epoch": 177.0, + "eval_loss": 3.81931471824646, + "eval_runtime": 74.6152, + "eval_samples_per_second": 6599.512, + "eval_steps_per_second": 2.158, + "step": 255588 + }, + { + "epoch": 177.29, + "learning_rate": 1.6454293628808868e-05, + "loss": 3.9021, + "step": 256000 + }, + { + "epoch": 177.63, + "learning_rate": 1.644736842105263e-05, + "loss": 3.8989, + "step": 256500 + }, + { + "epoch": 177.98, + "learning_rate": 1.6440443213296398e-05, + "loss": 3.9024, + "step": 257000 + }, + { + "epoch": 178.0, + "eval_loss": 3.819705009460449, + "eval_runtime": 75.4024, + "eval_samples_per_second": 6530.618, + "eval_steps_per_second": 2.135, + "step": 257032 + }, + { + "epoch": 178.32, + "learning_rate": 1.643351800554017e-05, + "loss": 3.9012, + "step": 257500 + }, + { + "epoch": 178.67, + "learning_rate": 1.6426592797783935e-05, + "loss": 3.9007, + "step": 258000 + }, + { + "epoch": 179.0, + "eval_loss": 3.817988634109497, + "eval_runtime": 74.2631, + "eval_samples_per_second": 6630.806, + "eval_steps_per_second": 2.168, + "step": 258476 + }, + { + "epoch": 179.02, + "learning_rate": 1.6419667590027702e-05, + "loss": 3.8962, + "step": 258500 + }, + { + "epoch": 179.36, + "learning_rate": 1.641274238227147e-05, + "loss": 3.9005, + "step": 259000 + }, + { + "epoch": 179.71, + "learning_rate": 1.6405817174515236e-05, + "loss": 3.897, + "step": 259500 + }, + { + "epoch": 180.0, + "eval_loss": 3.8208701610565186, + "eval_runtime": 75.7163, + "eval_samples_per_second": 6503.537, + "eval_steps_per_second": 2.126, + "step": 259920 + }, + { + "epoch": 180.06, + "learning_rate": 1.6398891966759006e-05, + "loss": 3.9019, + "step": 260000 + }, + { + "epoch": 180.4, + "learning_rate": 1.639196675900277e-05, + "loss": 3.8984, + "step": 260500 + }, + { + "epoch": 180.75, + "learning_rate": 1.638504155124654e-05, + "loss": 3.8967, + "step": 261000 + }, + { + "epoch": 181.0, + "eval_loss": 3.814295768737793, + "eval_runtime": 74.2326, + "eval_samples_per_second": 6633.527, + "eval_steps_per_second": 2.169, + "step": 261364 + }, + { + "epoch": 181.09, + "learning_rate": 1.6378116343490307e-05, + "loss": 3.8996, + "step": 261500 + }, + { + "epoch": 181.44, + "learning_rate": 1.6371191135734074e-05, + "loss": 3.8951, + "step": 262000 + }, + { + "epoch": 181.79, + "learning_rate": 1.636426592797784e-05, + "loss": 3.8978, + "step": 262500 + }, + { + "epoch": 182.0, + "eval_loss": 3.816028118133545, + "eval_runtime": 74.7425, + "eval_samples_per_second": 6588.271, + "eval_steps_per_second": 2.154, + "step": 262808 + }, + { + "epoch": 182.13, + "learning_rate": 1.6357340720221608e-05, + "loss": 3.896, + "step": 263000 + }, + { + "epoch": 182.48, + "learning_rate": 1.6350415512465375e-05, + "loss": 3.8943, + "step": 263500 + }, + { + "epoch": 182.83, + "learning_rate": 1.6343490304709142e-05, + "loss": 3.894, + "step": 264000 + }, + { + "epoch": 183.0, + "eval_loss": 3.8133652210235596, + "eval_runtime": 76.198, + "eval_samples_per_second": 6462.426, + "eval_steps_per_second": 2.113, + "step": 264252 + }, + { + "epoch": 183.17, + "learning_rate": 1.633656509695291e-05, + "loss": 3.8947, + "step": 264500 + }, + { + "epoch": 183.52, + "learning_rate": 1.632963988919668e-05, + "loss": 3.8949, + "step": 265000 + }, + { + "epoch": 183.86, + "learning_rate": 1.6322714681440443e-05, + "loss": 3.8919, + "step": 265500 + }, + { + "epoch": 184.0, + "eval_loss": 3.8209118843078613, + "eval_runtime": 74.4007, + "eval_samples_per_second": 6618.538, + "eval_steps_per_second": 2.164, + "step": 265696 + }, + { + "epoch": 184.21, + "learning_rate": 1.6315789473684213e-05, + "loss": 3.8918, + "step": 266000 + }, + { + "epoch": 184.56, + "learning_rate": 1.630886426592798e-05, + "loss": 3.893, + "step": 266500 + }, + { + "epoch": 184.9, + "learning_rate": 1.6301939058171747e-05, + "loss": 3.8915, + "step": 267000 + }, + { + "epoch": 185.0, + "eval_loss": 3.812711477279663, + "eval_runtime": 74.8088, + "eval_samples_per_second": 6582.437, + "eval_steps_per_second": 2.152, + "step": 267140 + }, + { + "epoch": 185.25, + "learning_rate": 1.6295013850415514e-05, + "loss": 3.8911, + "step": 267500 + }, + { + "epoch": 185.6, + "learning_rate": 1.628808864265928e-05, + "loss": 3.8878, + "step": 268000 + }, + { + "epoch": 185.94, + "learning_rate": 1.6281163434903047e-05, + "loss": 3.8909, + "step": 268500 + }, + { + "epoch": 186.0, + "eval_loss": 3.8139772415161133, + "eval_runtime": 73.8157, + "eval_samples_per_second": 6670.991, + "eval_steps_per_second": 2.181, + "step": 268584 + }, + { + "epoch": 186.29, + "learning_rate": 1.6274238227146814e-05, + "loss": 3.8906, + "step": 269000 + }, + { + "epoch": 186.63, + "learning_rate": 1.626731301939058e-05, + "loss": 3.8891, + "step": 269500 + }, + { + "epoch": 186.98, + "learning_rate": 1.626038781163435e-05, + "loss": 3.8894, + "step": 270000 + }, + { + "epoch": 187.0, + "eval_loss": 3.811793327331543, + "eval_runtime": 76.5559, + "eval_samples_per_second": 6432.215, + "eval_steps_per_second": 2.103, + "step": 270028 + }, + { + "epoch": 187.33, + "learning_rate": 1.625346260387812e-05, + "loss": 3.8857, + "step": 270500 + }, + { + "epoch": 187.67, + "learning_rate": 1.6246537396121885e-05, + "loss": 3.886, + "step": 271000 + }, + { + "epoch": 188.0, + "eval_loss": 3.812798500061035, + "eval_runtime": 73.9286, + "eval_samples_per_second": 6660.807, + "eval_steps_per_second": 2.178, + "step": 271472 + }, + { + "epoch": 188.02, + "learning_rate": 1.6239612188365652e-05, + "loss": 3.8875, + "step": 271500 + }, + { + "epoch": 188.37, + "learning_rate": 1.623268698060942e-05, + "loss": 3.8906, + "step": 272000 + }, + { + "epoch": 188.71, + "learning_rate": 1.622576177285319e-05, + "loss": 3.8868, + "step": 272500 + }, + { + "epoch": 189.0, + "eval_loss": 3.808310031890869, + "eval_runtime": 74.6, + "eval_samples_per_second": 6600.861, + "eval_steps_per_second": 2.158, + "step": 272916 + }, + { + "epoch": 189.06, + "learning_rate": 1.6218836565096953e-05, + "loss": 3.8846, + "step": 273000 + }, + { + "epoch": 189.4, + "learning_rate": 1.621191135734072e-05, + "loss": 3.8873, + "step": 273500 + }, + { + "epoch": 189.75, + "learning_rate": 1.620498614958449e-05, + "loss": 3.8827, + "step": 274000 + }, + { + "epoch": 190.0, + "eval_loss": 3.8040313720703125, + "eval_runtime": 73.574, + "eval_samples_per_second": 6692.91, + "eval_steps_per_second": 2.188, + "step": 274360 + }, + { + "epoch": 190.1, + "learning_rate": 1.6198060941828254e-05, + "loss": 3.888, + "step": 274500 + }, + { + "epoch": 190.44, + "learning_rate": 1.6191135734072024e-05, + "loss": 3.8879, + "step": 275000 + }, + { + "epoch": 190.79, + "learning_rate": 1.618421052631579e-05, + "loss": 3.8854, + "step": 275500 + }, + { + "epoch": 191.0, + "eval_loss": 3.8076326847076416, + "eval_runtime": 73.5075, + "eval_samples_per_second": 6698.963, + "eval_steps_per_second": 2.19, + "step": 275804 + }, + { + "epoch": 191.14, + "learning_rate": 1.6177285318559558e-05, + "loss": 3.8866, + "step": 276000 + }, + { + "epoch": 191.48, + "learning_rate": 1.6170360110803325e-05, + "loss": 3.8838, + "step": 276500 + }, + { + "epoch": 191.83, + "learning_rate": 1.6163434903047092e-05, + "loss": 3.8814, + "step": 277000 + }, + { + "epoch": 192.0, + "eval_loss": 3.8077051639556885, + "eval_runtime": 74.0447, + "eval_samples_per_second": 6650.364, + "eval_steps_per_second": 2.174, + "step": 277248 + }, + { + "epoch": 192.17, + "learning_rate": 1.6156509695290862e-05, + "loss": 3.8826, + "step": 277500 + }, + { + "epoch": 192.52, + "learning_rate": 1.6149584487534626e-05, + "loss": 3.8837, + "step": 278000 + }, + { + "epoch": 192.87, + "learning_rate": 1.6142659279778396e-05, + "loss": 3.8825, + "step": 278500 + }, + { + "epoch": 193.0, + "eval_loss": 3.8101003170013428, + "eval_runtime": 74.4578, + "eval_samples_per_second": 6613.463, + "eval_steps_per_second": 2.162, + "step": 278692 + }, + { + "epoch": 193.21, + "learning_rate": 1.6135734072022163e-05, + "loss": 3.8808, + "step": 279000 + }, + { + "epoch": 193.56, + "learning_rate": 1.612880886426593e-05, + "loss": 3.8832, + "step": 279500 + }, + { + "epoch": 193.91, + "learning_rate": 1.6121883656509697e-05, + "loss": 3.8824, + "step": 280000 + }, + { + "epoch": 194.0, + "eval_loss": 3.8035974502563477, + "eval_runtime": 73.2562, + "eval_samples_per_second": 6721.947, + "eval_steps_per_second": 2.198, + "step": 280136 + }, + { + "epoch": 194.25, + "learning_rate": 1.6114958448753464e-05, + "loss": 3.8806, + "step": 280500 + }, + { + "epoch": 194.6, + "learning_rate": 1.610803324099723e-05, + "loss": 3.8783, + "step": 281000 + }, + { + "epoch": 194.94, + "learning_rate": 1.6101108033240998e-05, + "loss": 3.8784, + "step": 281500 + }, + { + "epoch": 195.0, + "eval_loss": 3.8016698360443115, + "eval_runtime": 75.1162, + "eval_samples_per_second": 6555.495, + "eval_steps_per_second": 2.143, + "step": 281580 + }, + { + "epoch": 195.29, + "learning_rate": 1.6094182825484764e-05, + "loss": 3.8792, + "step": 282000 + }, + { + "epoch": 195.64, + "learning_rate": 1.6087257617728535e-05, + "loss": 3.8828, + "step": 282500 + }, + { + "epoch": 195.98, + "learning_rate": 1.60803324099723e-05, + "loss": 3.8796, + "step": 283000 + }, + { + "epoch": 196.0, + "eval_loss": 3.8056745529174805, + "eval_runtime": 74.4209, + "eval_samples_per_second": 6616.747, + "eval_steps_per_second": 2.163, + "step": 283024 + }, + { + "epoch": 196.33, + "learning_rate": 1.607340720221607e-05, + "loss": 3.8769, + "step": 283500 + }, + { + "epoch": 196.68, + "learning_rate": 1.6066481994459835e-05, + "loss": 3.8801, + "step": 284000 + }, + { + "epoch": 197.0, + "eval_loss": 3.8007116317749023, + "eval_runtime": 77.2912, + "eval_samples_per_second": 6371.024, + "eval_steps_per_second": 2.083, + "step": 284468 + }, + { + "epoch": 197.02, + "learning_rate": 1.6059556786703602e-05, + "loss": 3.8765, + "step": 284500 + }, + { + "epoch": 197.37, + "learning_rate": 1.605263157894737e-05, + "loss": 3.8789, + "step": 285000 + }, + { + "epoch": 197.71, + "learning_rate": 1.6045706371191136e-05, + "loss": 3.8787, + "step": 285500 + }, + { + "epoch": 198.0, + "eval_loss": 3.7981278896331787, + "eval_runtime": 74.6743, + "eval_samples_per_second": 6594.29, + "eval_steps_per_second": 2.156, + "step": 285912 + }, + { + "epoch": 198.06, + "learning_rate": 1.6038781163434903e-05, + "loss": 3.8782, + "step": 286000 + }, + { + "epoch": 198.41, + "learning_rate": 1.6031855955678673e-05, + "loss": 3.8791, + "step": 286500 + }, + { + "epoch": 198.75, + "learning_rate": 1.6024930747922437e-05, + "loss": 3.876, + "step": 287000 + }, + { + "epoch": 199.0, + "eval_loss": 3.800157070159912, + "eval_runtime": 74.4204, + "eval_samples_per_second": 6616.785, + "eval_steps_per_second": 2.163, + "step": 287356 + }, + { + "epoch": 199.1, + "learning_rate": 1.6018005540166207e-05, + "loss": 3.8771, + "step": 287500 + }, + { + "epoch": 199.45, + "learning_rate": 1.6011080332409974e-05, + "loss": 3.8753, + "step": 288000 + }, + { + "epoch": 199.79, + "learning_rate": 1.600415512465374e-05, + "loss": 3.8739, + "step": 288500 + }, + { + "epoch": 200.0, + "eval_loss": 3.801537036895752, + "eval_runtime": 73.7536, + "eval_samples_per_second": 6676.611, + "eval_steps_per_second": 2.183, + "step": 288800 + }, + { + "epoch": 200.14, + "learning_rate": 1.5997229916897508e-05, + "loss": 3.8727, + "step": 289000 + }, + { + "epoch": 200.48, + "learning_rate": 1.5990304709141275e-05, + "loss": 3.8757, + "step": 289500 + }, + { + "epoch": 200.83, + "learning_rate": 1.5983379501385042e-05, + "loss": 3.8718, + "step": 290000 + }, + { + "epoch": 201.0, + "eval_loss": 3.7952911853790283, + "eval_runtime": 79.0161, + "eval_samples_per_second": 6231.949, + "eval_steps_per_second": 2.038, + "step": 290244 + }, + { + "epoch": 201.18, + "learning_rate": 1.597645429362881e-05, + "loss": 3.8703, + "step": 290500 + }, + { + "epoch": 201.52, + "learning_rate": 1.5969529085872576e-05, + "loss": 3.8742, + "step": 291000 + }, + { + "epoch": 201.87, + "learning_rate": 1.5962603878116346e-05, + "loss": 3.8728, + "step": 291500 + }, + { + "epoch": 202.0, + "eval_loss": 3.794914484024048, + "eval_runtime": 73.3296, + "eval_samples_per_second": 6715.218, + "eval_steps_per_second": 2.196, + "step": 291688 + }, + { + "epoch": 202.22, + "learning_rate": 1.5955678670360113e-05, + "loss": 3.8716, + "step": 292000 + }, + { + "epoch": 202.56, + "learning_rate": 1.594875346260388e-05, + "loss": 3.8721, + "step": 292500 + }, + { + "epoch": 202.91, + "learning_rate": 1.5941828254847647e-05, + "loss": 3.871, + "step": 293000 + }, + { + "epoch": 203.0, + "eval_loss": 3.791077136993408, + "eval_runtime": 75.7759, + "eval_samples_per_second": 6498.426, + "eval_steps_per_second": 2.125, + "step": 293132 + }, + { + "epoch": 203.25, + "learning_rate": 1.5934903047091414e-05, + "loss": 3.8718, + "step": 293500 + }, + { + "epoch": 203.6, + "learning_rate": 1.592797783933518e-05, + "loss": 3.8714, + "step": 294000 + }, + { + "epoch": 203.95, + "learning_rate": 1.5921052631578948e-05, + "loss": 3.8686, + "step": 294500 + }, + { + "epoch": 204.0, + "eval_loss": 3.791466474533081, + "eval_runtime": 76.174, + "eval_samples_per_second": 6464.462, + "eval_steps_per_second": 2.114, + "step": 294576 + }, + { + "epoch": 204.29, + "learning_rate": 1.5914127423822714e-05, + "loss": 3.8699, + "step": 295000 + }, + { + "epoch": 204.64, + "learning_rate": 1.5907202216066485e-05, + "loss": 3.8704, + "step": 295500 + }, + { + "epoch": 204.99, + "learning_rate": 1.590027700831025e-05, + "loss": 3.8727, + "step": 296000 + }, + { + "epoch": 205.0, + "eval_loss": 3.7933382987976074, + "eval_runtime": 75.2857, + "eval_samples_per_second": 6540.737, + "eval_steps_per_second": 2.139, + "step": 296020 + }, + { + "epoch": 205.33, + "learning_rate": 1.589335180055402e-05, + "loss": 3.8698, + "step": 296500 + }, + { + "epoch": 205.68, + "learning_rate": 1.5886426592797786e-05, + "loss": 3.8697, + "step": 297000 + }, + { + "epoch": 206.0, + "eval_loss": 3.792696952819824, + "eval_runtime": 77.0496, + "eval_samples_per_second": 6390.998, + "eval_steps_per_second": 2.09, + "step": 297464 + }, + { + "epoch": 206.02, + "learning_rate": 1.5879501385041552e-05, + "loss": 3.8676, + "step": 297500 + }, + { + "epoch": 206.37, + "learning_rate": 1.587257617728532e-05, + "loss": 3.8685, + "step": 298000 + }, + { + "epoch": 206.72, + "learning_rate": 1.5865650969529086e-05, + "loss": 3.8684, + "step": 298500 + }, + { + "epoch": 207.0, + "eval_loss": 3.792511224746704, + "eval_runtime": 72.6673, + "eval_samples_per_second": 6776.414, + "eval_steps_per_second": 2.216, + "step": 298908 + }, + { + "epoch": 207.06, + "learning_rate": 1.5858725761772857e-05, + "loss": 3.8679, + "step": 299000 + }, + { + "epoch": 207.41, + "learning_rate": 1.585180055401662e-05, + "loss": 3.8687, + "step": 299500 + }, + { + "epoch": 207.76, + "learning_rate": 1.584487534626039e-05, + "loss": 3.8681, + "step": 300000 + } + ], + "max_steps": 1444000, + "num_train_epochs": 1000, + "total_flos": 1.410332775288172e+18, + "trial_name": null, + "trial_params": null +}