diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,7 +1,7 @@ { - "best_metric": 2.000014066696167, - "best_model_checkpoint": "./model_tweets_2020_Q2_full/checkpoint-2400000", - "epoch": 2.618552224296455, + "best_metric": 1.9251400232315063, + "best_model_checkpoint": "./model_tweets_2020_Q2_full/checkpoint-1216000", + "epoch": 5.052631578947368, "eval_steps": 8000, "global_step": 2400000, "is_hyper_param_search": false, @@ -9,3320 +9,3320 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.01, - "eval_loss": 2.1043460369110107, - "eval_runtime": 841.4118, - "eval_samples_per_second": 917.292, - "eval_steps_per_second": 57.331, + "epoch": 0.02, + "eval_loss": 2.222038507461548, + "eval_runtime": 443.0178, + "eval_samples_per_second": 902.898, + "eval_steps_per_second": 56.431, "step": 8000 }, { - "epoch": 0.02, + "epoch": 0.03, "learning_rate": 4.0726666666666665e-07, - "loss": 2.2608, + "loss": 2.4154, "step": 16000 }, { - "epoch": 0.02, - "eval_loss": 2.093374013900757, - "eval_runtime": 839.5651, - "eval_samples_per_second": 919.309, - "eval_steps_per_second": 57.457, + "epoch": 0.03, + "eval_loss": 2.142679214477539, + "eval_runtime": 443.0058, + "eval_samples_per_second": 902.923, + "eval_steps_per_second": 56.433, "step": 16000 }, { - "epoch": 0.03, - "eval_loss": 2.0861904621124268, - "eval_runtime": 840.327, - "eval_samples_per_second": 918.476, - "eval_steps_per_second": 57.405, + "epoch": 0.05, + "eval_loss": 2.1027772426605225, + "eval_runtime": 442.7033, + "eval_samples_per_second": 903.54, + "eval_steps_per_second": 56.471, "step": 24000 }, { - "epoch": 0.03, + "epoch": 0.07, "learning_rate": 4.0453333333333336e-07, - "loss": 2.2409, + "loss": 2.2273, "step": 32000 }, { - "epoch": 0.03, - "eval_loss": 2.080547332763672, - "eval_runtime": 841.1888, - "eval_samples_per_second": 917.535, - "eval_steps_per_second": 57.346, + "epoch": 0.07, + "eval_loss": 2.0823824405670166, + "eval_runtime": 443.6086, + "eval_samples_per_second": 901.696, + "eval_steps_per_second": 56.356, "step": 32000 }, { - "epoch": 0.04, - "eval_loss": 2.079263210296631, - "eval_runtime": 841.5773, - "eval_samples_per_second": 917.111, - "eval_steps_per_second": 57.32, + "epoch": 0.08, + "eval_loss": 2.0644640922546387, + "eval_runtime": 443.4406, + "eval_samples_per_second": 902.037, + "eval_steps_per_second": 56.377, "step": 40000 }, { - "epoch": 0.05, + "epoch": 0.1, "learning_rate": 4.018e-07, - "loss": 2.2278, + "loss": 2.1774, "step": 48000 }, { - "epoch": 0.05, - "eval_loss": 2.071790933609009, - "eval_runtime": 841.5993, - "eval_samples_per_second": 917.087, - "eval_steps_per_second": 57.318, + "epoch": 0.1, + "eval_loss": 2.0478036403656006, + "eval_runtime": 444.714, + "eval_samples_per_second": 899.454, + "eval_steps_per_second": 56.216, "step": 48000 }, { - "epoch": 0.06, - "eval_loss": 2.0752639770507812, - "eval_runtime": 841.4195, - "eval_samples_per_second": 917.283, - "eval_steps_per_second": 57.33, + "epoch": 0.12, + "eval_loss": 2.0327365398406982, + "eval_runtime": 445.1474, + "eval_samples_per_second": 898.579, + "eval_steps_per_second": 56.161, "step": 56000 }, { - "epoch": 0.07, + "epoch": 0.13, "learning_rate": 3.9906666666666667e-07, - "loss": 2.2059, + "loss": 2.1569, "step": 64000 }, { - "epoch": 0.07, - "eval_loss": 2.0668046474456787, - "eval_runtime": 844.6529, - "eval_samples_per_second": 913.772, - "eval_steps_per_second": 57.111, + "epoch": 0.13, + "eval_loss": 2.0247883796691895, + "eval_runtime": 443.3798, + "eval_samples_per_second": 902.161, + "eval_steps_per_second": 56.385, "step": 64000 }, { - "epoch": 0.08, - "eval_loss": 2.0657169818878174, - "eval_runtime": 844.5291, - "eval_samples_per_second": 913.906, - "eval_steps_per_second": 57.119, + "epoch": 0.15, + "eval_loss": 2.020857334136963, + "eval_runtime": 443.9842, + "eval_samples_per_second": 900.933, + "eval_steps_per_second": 56.308, "step": 72000 }, { - "epoch": 0.09, + "epoch": 0.17, "learning_rate": 3.963333333333333e-07, - "loss": 2.1997, + "loss": 2.1439, "step": 80000 }, { - "epoch": 0.09, - "eval_loss": 2.062004566192627, - "eval_runtime": 845.6772, - "eval_samples_per_second": 912.665, - "eval_steps_per_second": 57.042, + "epoch": 0.17, + "eval_loss": 2.004868507385254, + "eval_runtime": 446.7998, + "eval_samples_per_second": 895.256, + "eval_steps_per_second": 55.953, "step": 80000 }, { - "epoch": 0.1, - "eval_loss": 2.0553247928619385, - "eval_runtime": 846.4058, - "eval_samples_per_second": 911.879, - "eval_steps_per_second": 56.993, + "epoch": 0.19, + "eval_loss": 2.0112717151641846, + "eval_runtime": 445.8029, + "eval_samples_per_second": 897.257, + "eval_steps_per_second": 56.079, "step": 88000 }, { - "epoch": 0.1, + "epoch": 0.2, "learning_rate": 3.936e-07, - "loss": 2.1988, + "loss": 2.1271, "step": 96000 }, { - "epoch": 0.1, - "eval_loss": 2.0569465160369873, - "eval_runtime": 841.5807, - "eval_samples_per_second": 917.108, - "eval_steps_per_second": 57.32, + "epoch": 0.2, + "eval_loss": 2.0037786960601807, + "eval_runtime": 447.4745, + "eval_samples_per_second": 893.906, + "eval_steps_per_second": 55.869, "step": 96000 }, { - "epoch": 0.11, - "eval_loss": 2.052541971206665, - "eval_runtime": 845.3721, - "eval_samples_per_second": 912.994, - "eval_steps_per_second": 57.062, + "epoch": 0.22, + "eval_loss": 2.0065057277679443, + "eval_runtime": 449.3147, + "eval_samples_per_second": 890.245, + "eval_steps_per_second": 55.64, "step": 104000 }, { - "epoch": 0.12, + "epoch": 0.24, "learning_rate": 3.908666666666667e-07, - "loss": 2.1861, + "loss": 2.1211, "step": 112000 }, { - "epoch": 0.12, - "eval_loss": 2.05564284324646, - "eval_runtime": 847.9385, - "eval_samples_per_second": 910.231, - "eval_steps_per_second": 56.89, + "epoch": 0.24, + "eval_loss": 1.998653769493103, + "eval_runtime": 447.7781, + "eval_samples_per_second": 893.3, + "eval_steps_per_second": 55.831, "step": 112000 }, { - "epoch": 0.13, - "eval_loss": 2.04929256439209, - "eval_runtime": 842.77, - "eval_samples_per_second": 915.813, - "eval_steps_per_second": 57.239, + "epoch": 0.25, + "eval_loss": 1.9929168224334717, + "eval_runtime": 444.7126, + "eval_samples_per_second": 899.457, + "eval_steps_per_second": 56.216, "step": 120000 }, { - "epoch": 0.14, + "epoch": 0.27, "learning_rate": 3.8813333333333334e-07, - "loss": 2.1823, + "loss": 2.1194, "step": 128000 }, { - "epoch": 0.14, - "eval_loss": 2.0508854389190674, - "eval_runtime": 846.1188, - "eval_samples_per_second": 912.189, - "eval_steps_per_second": 57.012, + "epoch": 0.27, + "eval_loss": 1.9922369718551636, + "eval_runtime": 447.3304, + "eval_samples_per_second": 894.194, + "eval_steps_per_second": 55.887, "step": 128000 }, { - "epoch": 0.15, - "eval_loss": 2.0460989475250244, - "eval_runtime": 845.0745, - "eval_samples_per_second": 913.316, - "eval_steps_per_second": 57.083, + "epoch": 0.29, + "eval_loss": 1.9916871786117554, + "eval_runtime": 444.9748, + "eval_samples_per_second": 898.927, + "eval_steps_per_second": 56.183, "step": 136000 }, { - "epoch": 0.16, + "epoch": 0.3, "learning_rate": 3.854e-07, - "loss": 2.1851, + "loss": 2.1118, "step": 144000 }, { - "epoch": 0.16, - "eval_loss": 2.0476059913635254, - "eval_runtime": 844.5648, - "eval_samples_per_second": 913.867, - "eval_steps_per_second": 57.117, + "epoch": 0.3, + "eval_loss": 1.9884706735610962, + "eval_runtime": 444.0647, + "eval_samples_per_second": 900.77, + "eval_steps_per_second": 56.298, "step": 144000 }, { - "epoch": 0.17, - "eval_loss": 2.04502010345459, - "eval_runtime": 845.4437, - "eval_samples_per_second": 912.917, - "eval_steps_per_second": 57.058, + "epoch": 0.32, + "eval_loss": 1.9870291948318481, + "eval_runtime": 445.1345, + "eval_samples_per_second": 898.605, + "eval_steps_per_second": 56.163, "step": 152000 }, { - "epoch": 0.17, + "epoch": 0.34, "learning_rate": 3.8266666666666665e-07, - "loss": 2.1862, + "loss": 2.1047, "step": 160000 }, { - "epoch": 0.17, - "eval_loss": 2.046872854232788, - "eval_runtime": 843.2469, - "eval_samples_per_second": 915.295, - "eval_steps_per_second": 57.206, + "epoch": 0.34, + "eval_loss": 1.9842537641525269, + "eval_runtime": 444.6868, + "eval_samples_per_second": 899.51, + "eval_steps_per_second": 56.219, "step": 160000 }, { - "epoch": 0.18, - "eval_loss": 2.0441744327545166, - "eval_runtime": 845.6614, - "eval_samples_per_second": 912.682, - "eval_steps_per_second": 57.043, + "epoch": 0.35, + "eval_loss": 1.9826897382736206, + "eval_runtime": 446.0995, + "eval_samples_per_second": 896.661, + "eval_steps_per_second": 56.041, "step": 168000 }, { - "epoch": 0.19, + "epoch": 0.37, "learning_rate": 3.799333333333333e-07, - "loss": 2.1741, + "loss": 2.1015, "step": 176000 }, { - "epoch": 0.19, - "eval_loss": 2.0456435680389404, - "eval_runtime": 847.979, - "eval_samples_per_second": 910.188, - "eval_steps_per_second": 56.887, + "epoch": 0.37, + "eval_loss": 1.98258376121521, + "eval_runtime": 447.5655, + "eval_samples_per_second": 893.724, + "eval_steps_per_second": 55.858, "step": 176000 }, { - "epoch": 0.2, - "eval_loss": 2.0441741943359375, - "eval_runtime": 846.0243, - "eval_samples_per_second": 912.291, - "eval_steps_per_second": 57.018, + "epoch": 0.39, + "eval_loss": 1.9773671627044678, + "eval_runtime": 446.1508, + "eval_samples_per_second": 896.558, + "eval_steps_per_second": 56.035, "step": 184000 }, { - "epoch": 0.21, + "epoch": 0.4, "learning_rate": 3.772e-07, - "loss": 2.181, + "loss": 2.1042, "step": 192000 }, { - "epoch": 0.21, - "eval_loss": 2.040196418762207, - "eval_runtime": 851.9304, - "eval_samples_per_second": 905.966, - "eval_steps_per_second": 56.623, + "epoch": 0.4, + "eval_loss": 1.9771127700805664, + "eval_runtime": 446.0195, + "eval_samples_per_second": 896.822, + "eval_steps_per_second": 56.051, "step": 192000 }, { - "epoch": 0.22, - "eval_loss": 2.0422918796539307, - "eval_runtime": 847.5458, - "eval_samples_per_second": 910.653, - "eval_steps_per_second": 56.916, + "epoch": 0.42, + "eval_loss": 1.9769738912582397, + "eval_runtime": 447.4748, + "eval_samples_per_second": 893.905, + "eval_steps_per_second": 55.869, "step": 200000 }, { - "epoch": 0.23, + "epoch": 0.44, "learning_rate": 3.7446666666666667e-07, - "loss": 2.1692, + "loss": 2.0919, "step": 208000 }, { - "epoch": 0.23, - "eval_loss": 2.041342258453369, - "eval_runtime": 847.0952, - "eval_samples_per_second": 911.137, - "eval_steps_per_second": 56.946, + "epoch": 0.44, + "eval_loss": 1.975176453590393, + "eval_runtime": 445.3011, + "eval_samples_per_second": 898.269, + "eval_steps_per_second": 56.142, "step": 208000 }, { - "epoch": 0.24, - "eval_loss": 2.0448198318481445, - "eval_runtime": 846.694, - "eval_samples_per_second": 911.569, - "eval_steps_per_second": 56.973, + "epoch": 0.45, + "eval_loss": 1.9774501323699951, + "eval_runtime": 445.3222, + "eval_samples_per_second": 898.226, + "eval_steps_per_second": 56.139, "step": 216000 }, { - "epoch": 0.24, + "epoch": 0.47, "learning_rate": 3.7173333333333333e-07, - "loss": 2.1678, + "loss": 2.0953, "step": 224000 }, { - "epoch": 0.24, - "eval_loss": 2.0417792797088623, - "eval_runtime": 846.2034, - "eval_samples_per_second": 912.098, - "eval_steps_per_second": 57.006, + "epoch": 0.47, + "eval_loss": 1.9684234857559204, + "eval_runtime": 445.4157, + "eval_samples_per_second": 898.038, + "eval_steps_per_second": 56.127, "step": 224000 }, { - "epoch": 0.25, - "eval_loss": 2.041692018508911, - "eval_runtime": 848.6147, - "eval_samples_per_second": 909.506, - "eval_steps_per_second": 56.844, + "epoch": 0.49, + "eval_loss": 1.97478187084198, + "eval_runtime": 446.0926, + "eval_samples_per_second": 896.675, + "eval_steps_per_second": 56.042, "step": 232000 }, { - "epoch": 0.26, + "epoch": 0.51, "learning_rate": 3.69e-07, - "loss": 2.1756, + "loss": 2.0848, "step": 240000 }, { - "epoch": 0.26, - "eval_loss": 2.034193754196167, - "eval_runtime": 847.0585, - "eval_samples_per_second": 911.177, - "eval_steps_per_second": 56.949, + "epoch": 0.51, + "eval_loss": 1.9713643789291382, + "eval_runtime": 445.9733, + "eval_samples_per_second": 896.915, + "eval_steps_per_second": 56.057, "step": 240000 }, { - "epoch": 0.27, - "eval_loss": 2.037684202194214, - "eval_runtime": 846.2239, - "eval_samples_per_second": 912.075, - "eval_steps_per_second": 57.005, + "epoch": 0.52, + "eval_loss": 1.9780809879302979, + "eval_runtime": 446.8053, + "eval_samples_per_second": 895.245, + "eval_steps_per_second": 55.953, "step": 248000 }, { - "epoch": 0.28, + "epoch": 0.54, "learning_rate": 3.6626666666666664e-07, - "loss": 2.1752, + "loss": 2.0882, "step": 256000 }, { - "epoch": 0.28, - "eval_loss": 2.0381019115448, - "eval_runtime": 846.7119, - "eval_samples_per_second": 911.55, - "eval_steps_per_second": 56.972, + "epoch": 0.54, + "eval_loss": 1.9709409475326538, + "eval_runtime": 446.8628, + "eval_samples_per_second": 895.129, + "eval_steps_per_second": 55.946, "step": 256000 }, { - "epoch": 0.29, - "eval_loss": 2.035405158996582, - "eval_runtime": 851.4653, - "eval_samples_per_second": 906.461, - "eval_steps_per_second": 56.654, + "epoch": 0.56, + "eval_loss": 1.9659632444381714, + "eval_runtime": 446.6679, + "eval_samples_per_second": 895.52, + "eval_steps_per_second": 55.97, "step": 264000 }, { - "epoch": 0.3, + "epoch": 0.57, "learning_rate": 3.6353333333333335e-07, - "loss": 2.1673, + "loss": 2.0922, "step": 272000 }, { - "epoch": 0.3, - "eval_loss": 2.038097858428955, - "eval_runtime": 846.955, - "eval_samples_per_second": 911.288, - "eval_steps_per_second": 56.956, + "epoch": 0.57, + "eval_loss": 1.9651156663894653, + "eval_runtime": 447.5675, + "eval_samples_per_second": 893.72, + "eval_steps_per_second": 55.857, "step": 272000 }, { - "epoch": 0.31, - "eval_loss": 2.0375349521636963, - "eval_runtime": 846.7581, - "eval_samples_per_second": 911.5, - "eval_steps_per_second": 56.969, + "epoch": 0.59, + "eval_loss": 1.9677698612213135, + "eval_runtime": 447.5478, + "eval_samples_per_second": 893.759, + "eval_steps_per_second": 55.86, "step": 280000 }, { - "epoch": 0.31, + "epoch": 0.61, "learning_rate": 3.608e-07, - "loss": 2.1585, + "loss": 2.0938, "step": 288000 }, { - "epoch": 0.31, - "eval_loss": 2.033590078353882, - "eval_runtime": 848.1336, - "eval_samples_per_second": 910.022, - "eval_steps_per_second": 56.877, + "epoch": 0.61, + "eval_loss": 1.9667097330093384, + "eval_runtime": 445.5237, + "eval_samples_per_second": 897.82, + "eval_steps_per_second": 56.114, "step": 288000 }, { - "epoch": 0.32, - "eval_loss": 2.0344314575195312, - "eval_runtime": 847.2304, - "eval_samples_per_second": 910.992, - "eval_steps_per_second": 56.937, + "epoch": 0.62, + "eval_loss": 1.9630117416381836, + "eval_runtime": 447.5815, + "eval_samples_per_second": 893.692, + "eval_steps_per_second": 55.856, "step": 296000 }, { - "epoch": 0.33, + "epoch": 0.64, "learning_rate": 3.5806666666666666e-07, - "loss": 2.1703, + "loss": 2.095, "step": 304000 }, { - "epoch": 0.33, - "eval_loss": 2.034810781478882, - "eval_runtime": 846.3544, - "eval_samples_per_second": 911.935, - "eval_steps_per_second": 56.996, + "epoch": 0.64, + "eval_loss": 1.964175820350647, + "eval_runtime": 446.8848, + "eval_samples_per_second": 895.085, + "eval_steps_per_second": 55.943, "step": 304000 }, { - "epoch": 0.34, - "eval_loss": 2.0329954624176025, - "eval_runtime": 847.3997, - "eval_samples_per_second": 910.81, - "eval_steps_per_second": 56.926, + "epoch": 0.66, + "eval_loss": 1.9624465703964233, + "eval_runtime": 446.5827, + "eval_samples_per_second": 895.691, + "eval_steps_per_second": 55.981, "step": 312000 }, { - "epoch": 0.35, + "epoch": 0.67, "learning_rate": 3.553333333333333e-07, - "loss": 2.1667, + "loss": 2.0908, "step": 320000 }, { - "epoch": 0.35, - "eval_loss": 2.0352213382720947, - "eval_runtime": 846.3586, - "eval_samples_per_second": 911.93, - "eval_steps_per_second": 56.996, + "epoch": 0.67, + "eval_loss": 1.9602919816970825, + "eval_runtime": 446.0632, + "eval_samples_per_second": 896.734, + "eval_steps_per_second": 56.046, "step": 320000 }, { - "epoch": 0.36, - "eval_loss": 2.0359089374542236, - "eval_runtime": 848.8487, - "eval_samples_per_second": 909.255, - "eval_steps_per_second": 56.829, + "epoch": 0.69, + "eval_loss": 1.9648582935333252, + "eval_runtime": 446.2312, + "eval_samples_per_second": 896.396, + "eval_steps_per_second": 56.025, "step": 328000 }, { - "epoch": 0.37, + "epoch": 0.71, "learning_rate": 3.5259999999999997e-07, - "loss": 2.1649, + "loss": 2.0927, "step": 336000 }, { - "epoch": 0.37, - "eval_loss": 2.031733512878418, - "eval_runtime": 848.6246, - "eval_samples_per_second": 909.495, - "eval_steps_per_second": 56.844, + "epoch": 0.71, + "eval_loss": 1.9641398191452026, + "eval_runtime": 449.2021, + "eval_samples_per_second": 890.468, + "eval_steps_per_second": 55.654, "step": 336000 }, { - "epoch": 0.38, - "eval_loss": 2.0314059257507324, - "eval_runtime": 851.4761, - "eval_samples_per_second": 906.449, - "eval_steps_per_second": 56.653, + "epoch": 0.72, + "eval_loss": 1.9602792263031006, + "eval_runtime": 447.5196, + "eval_samples_per_second": 893.816, + "eval_steps_per_second": 55.863, "step": 344000 }, { - "epoch": 0.38, + "epoch": 0.74, "learning_rate": 3.498666666666667e-07, - "loss": 2.1564, + "loss": 2.0931, "step": 352000 }, { - "epoch": 0.38, - "eval_loss": 2.030597686767578, - "eval_runtime": 850.4287, - "eval_samples_per_second": 907.566, - "eval_steps_per_second": 56.723, + "epoch": 0.74, + "eval_loss": 1.9589898586273193, + "eval_runtime": 447.4097, + "eval_samples_per_second": 894.035, + "eval_steps_per_second": 55.877, "step": 352000 }, { - "epoch": 0.39, - "eval_loss": 2.029878616333008, - "eval_runtime": 850.6967, - "eval_samples_per_second": 907.28, - "eval_steps_per_second": 56.705, + "epoch": 0.76, + "eval_loss": 1.964429259300232, + "eval_runtime": 447.2712, + "eval_samples_per_second": 894.312, + "eval_steps_per_second": 55.895, "step": 360000 }, { - "epoch": 0.4, + "epoch": 0.77, "learning_rate": 3.4713333333333333e-07, - "loss": 2.161, + "loss": 2.087, "step": 368000 }, { - "epoch": 0.4, - "eval_loss": 2.0317320823669434, - "eval_runtime": 851.1347, - "eval_samples_per_second": 906.813, - "eval_steps_per_second": 56.676, + "epoch": 0.77, + "eval_loss": 1.963483452796936, + "eval_runtime": 447.9748, + "eval_samples_per_second": 892.907, + "eval_steps_per_second": 55.807, "step": 368000 }, { - "epoch": 0.41, - "eval_loss": 2.032505989074707, - "eval_runtime": 854.4271, - "eval_samples_per_second": 903.319, - "eval_steps_per_second": 56.458, + "epoch": 0.79, + "eval_loss": 1.9613640308380127, + "eval_runtime": 448.1094, + "eval_samples_per_second": 892.639, + "eval_steps_per_second": 55.79, "step": 376000 }, { - "epoch": 0.42, + "epoch": 0.81, "learning_rate": 3.444e-07, - "loss": 2.1551, + "loss": 2.0792, "step": 384000 }, { - "epoch": 0.42, - "eval_loss": 2.0273916721343994, - "eval_runtime": 850.6274, - "eval_samples_per_second": 907.354, - "eval_steps_per_second": 56.71, + "epoch": 0.81, + "eval_loss": 1.959149956703186, + "eval_runtime": 447.4535, + "eval_samples_per_second": 893.948, + "eval_steps_per_second": 55.872, "step": 384000 }, { - "epoch": 0.43, - "eval_loss": 2.0281741619110107, - "eval_runtime": 850.1523, - "eval_samples_per_second": 907.861, - "eval_steps_per_second": 56.742, + "epoch": 0.83, + "eval_loss": 1.9575395584106445, + "eval_runtime": 448.4642, + "eval_samples_per_second": 891.933, + "eval_steps_per_second": 55.746, "step": 392000 }, { - "epoch": 0.44, + "epoch": 0.84, "learning_rate": 3.416666666666667e-07, - "loss": 2.1602, + "loss": 2.0899, "step": 400000 }, { - "epoch": 0.44, - "eval_loss": 2.0300543308258057, - "eval_runtime": 852.8839, - "eval_samples_per_second": 904.953, - "eval_steps_per_second": 56.56, + "epoch": 0.84, + "eval_loss": 1.959159016609192, + "eval_runtime": 446.7586, + "eval_samples_per_second": 895.338, + "eval_steps_per_second": 55.959, "step": 400000 }, { - "epoch": 0.45, - "eval_loss": 2.0302786827087402, - "eval_runtime": 854.7636, - "eval_samples_per_second": 902.963, - "eval_steps_per_second": 56.435, + "epoch": 0.86, + "eval_loss": 1.9618536233901978, + "eval_runtime": 449.0448, + "eval_samples_per_second": 890.78, + "eval_steps_per_second": 55.674, "step": 408000 }, { - "epoch": 0.45, + "epoch": 0.88, "learning_rate": 3.3893333333333335e-07, - "loss": 2.1581, + "loss": 2.0812, "step": 416000 }, { - "epoch": 0.45, - "eval_loss": 2.026031732559204, - "eval_runtime": 852.2087, - "eval_samples_per_second": 905.67, - "eval_steps_per_second": 56.605, + "epoch": 0.88, + "eval_loss": 1.9582098722457886, + "eval_runtime": 448.2032, + "eval_samples_per_second": 892.452, + "eval_steps_per_second": 55.778, "step": 416000 }, { - "epoch": 0.46, - "eval_loss": 2.0248208045959473, - "eval_runtime": 850.4117, - "eval_samples_per_second": 907.584, - "eval_steps_per_second": 56.724, + "epoch": 0.89, + "eval_loss": 1.958033800125122, + "eval_runtime": 448.1857, + "eval_samples_per_second": 892.487, + "eval_steps_per_second": 55.78, "step": 424000 }, { - "epoch": 0.47, + "epoch": 0.91, "learning_rate": 3.3619999999999995e-07, - "loss": 2.1494, + "loss": 2.0948, "step": 432000 }, { - "epoch": 0.47, - "eval_loss": 2.026501178741455, - "eval_runtime": 848.7671, - "eval_samples_per_second": 909.343, - "eval_steps_per_second": 56.834, + "epoch": 0.91, + "eval_loss": 1.9587032794952393, + "eval_runtime": 447.736, + "eval_samples_per_second": 893.384, + "eval_steps_per_second": 55.836, "step": 432000 }, { - "epoch": 0.48, - "eval_loss": 2.0246880054473877, - "eval_runtime": 849.7267, - "eval_samples_per_second": 908.316, - "eval_steps_per_second": 56.77, + "epoch": 0.93, + "eval_loss": 1.9593387842178345, + "eval_runtime": 448.6332, + "eval_samples_per_second": 891.597, + "eval_steps_per_second": 55.725, "step": 440000 }, { - "epoch": 0.49, + "epoch": 0.94, "learning_rate": 3.3346666666666666e-07, - "loss": 2.1508, + "loss": 2.0895, "step": 448000 }, { - "epoch": 0.49, - "eval_loss": 2.0231027603149414, - "eval_runtime": 849.0484, - "eval_samples_per_second": 909.041, - "eval_steps_per_second": 56.815, + "epoch": 0.94, + "eval_loss": 1.960838794708252, + "eval_runtime": 448.8111, + "eval_samples_per_second": 891.244, + "eval_steps_per_second": 55.703, "step": 448000 }, { - "epoch": 0.5, - "eval_loss": 2.0276315212249756, - "eval_runtime": 849.4168, - "eval_samples_per_second": 908.647, - "eval_steps_per_second": 56.791, + "epoch": 0.96, + "eval_loss": 1.956569790840149, + "eval_runtime": 448.6177, + "eval_samples_per_second": 891.628, + "eval_steps_per_second": 55.727, "step": 456000 }, { - "epoch": 0.51, + "epoch": 0.98, "learning_rate": 3.307333333333333e-07, - "loss": 2.153, + "loss": 2.0756, "step": 464000 }, { - "epoch": 0.51, - "eval_loss": 2.0275754928588867, - "eval_runtime": 848.4629, - "eval_samples_per_second": 909.669, - "eval_steps_per_second": 56.855, + "epoch": 0.98, + "eval_loss": 1.952539086341858, + "eval_runtime": 448.9343, + "eval_samples_per_second": 890.999, + "eval_steps_per_second": 55.687, "step": 464000 }, { - "epoch": 0.51, - "eval_loss": 2.0241763591766357, - "eval_runtime": 849.6091, - "eval_samples_per_second": 908.441, - "eval_steps_per_second": 56.778, + "epoch": 0.99, + "eval_loss": 1.9540966749191284, + "eval_runtime": 449.2991, + "eval_samples_per_second": 890.275, + "eval_steps_per_second": 55.642, "step": 472000 }, { - "epoch": 0.52, + "epoch": 1.01, "learning_rate": 3.28e-07, - "loss": 2.1489, + "loss": 2.0842, "step": 480000 }, { - "epoch": 0.52, - "eval_loss": 2.0259480476379395, - "eval_runtime": 849.4664, - "eval_samples_per_second": 908.594, - "eval_steps_per_second": 56.787, + "epoch": 1.01, + "eval_loss": 1.9600876569747925, + "eval_runtime": 449.599, + "eval_samples_per_second": 889.682, + "eval_steps_per_second": 55.605, "step": 480000 }, { - "epoch": 0.53, - "eval_loss": 2.025740623474121, - "eval_runtime": 850.1732, - "eval_samples_per_second": 907.839, - "eval_steps_per_second": 56.74, + "epoch": 1.03, + "eval_loss": 1.9564368724822998, + "eval_runtime": 452.8797, + "eval_samples_per_second": 883.237, + "eval_steps_per_second": 55.202, "step": 488000 }, { - "epoch": 0.54, + "epoch": 1.04, "learning_rate": 3.252666666666667e-07, - "loss": 2.1468, + "loss": 2.0935, "step": 496000 }, { - "epoch": 0.54, - "eval_loss": 2.027461528778076, - "eval_runtime": 850.2923, - "eval_samples_per_second": 907.711, - "eval_steps_per_second": 56.732, + "epoch": 1.04, + "eval_loss": 1.9522088766098022, + "eval_runtime": 452.7891, + "eval_samples_per_second": 883.414, + "eval_steps_per_second": 55.213, "step": 496000 }, { - "epoch": 0.55, - "eval_loss": 2.030271053314209, - "eval_runtime": 851.4114, - "eval_samples_per_second": 906.518, - "eval_steps_per_second": 56.658, + "epoch": 1.06, + "eval_loss": 1.95320463180542, + "eval_runtime": 454.7973, + "eval_samples_per_second": 879.513, + "eval_steps_per_second": 54.97, "step": 504000 }, { - "epoch": 0.56, + "epoch": 1.08, "learning_rate": 3.2253333333333334e-07, - "loss": 2.1446, + "loss": 2.0836, "step": 512000 }, { - "epoch": 0.56, - "eval_loss": 2.0248193740844727, - "eval_runtime": 852.1182, - "eval_samples_per_second": 905.766, - "eval_steps_per_second": 56.611, + "epoch": 1.08, + "eval_loss": 1.9537328481674194, + "eval_runtime": 451.2722, + "eval_samples_per_second": 886.383, + "eval_steps_per_second": 55.399, "step": 512000 }, { - "epoch": 0.57, - "eval_loss": 2.0285604000091553, - "eval_runtime": 849.8013, - "eval_samples_per_second": 908.236, - "eval_steps_per_second": 56.765, + "epoch": 1.09, + "eval_loss": 1.9553171396255493, + "eval_runtime": 453.3773, + "eval_samples_per_second": 882.267, + "eval_steps_per_second": 55.142, "step": 520000 }, { - "epoch": 0.58, + "epoch": 1.11, "learning_rate": 3.198e-07, - "loss": 2.1409, + "loss": 2.0876, "step": 528000 }, { - "epoch": 0.58, - "eval_loss": 2.0211498737335205, - "eval_runtime": 855.0597, - "eval_samples_per_second": 902.65, - "eval_steps_per_second": 56.416, + "epoch": 1.11, + "eval_loss": 1.946892499923706, + "eval_runtime": 451.8379, + "eval_samples_per_second": 885.273, + "eval_steps_per_second": 55.33, "step": 528000 }, { - "epoch": 0.58, - "eval_loss": 2.0204012393951416, - "eval_runtime": 856.0145, - "eval_samples_per_second": 901.644, - "eval_steps_per_second": 56.353, + "epoch": 1.13, + "eval_loss": 1.9497493505477905, + "eval_runtime": 452.793, + "eval_samples_per_second": 883.406, + "eval_steps_per_second": 55.213, "step": 536000 }, { - "epoch": 0.59, + "epoch": 1.15, "learning_rate": 3.1706666666666665e-07, - "loss": 2.1536, + "loss": 2.0778, "step": 544000 }, { - "epoch": 0.59, - "eval_loss": 2.0198850631713867, - "eval_runtime": 856.7067, - "eval_samples_per_second": 900.915, - "eval_steps_per_second": 56.307, + "epoch": 1.15, + "eval_loss": 1.9541878700256348, + "eval_runtime": 449.7452, + "eval_samples_per_second": 889.392, + "eval_steps_per_second": 55.587, "step": 544000 }, { - "epoch": 0.6, - "eval_loss": 2.0281307697296143, - "eval_runtime": 867.0343, - "eval_samples_per_second": 890.184, - "eval_steps_per_second": 55.637, + "epoch": 1.16, + "eval_loss": 1.951553463935852, + "eval_runtime": 451.9652, + "eval_samples_per_second": 885.024, + "eval_steps_per_second": 55.314, "step": 552000 }, { - "epoch": 0.61, + "epoch": 1.18, "learning_rate": 3.1433333333333336e-07, - "loss": 2.1416, + "loss": 2.0829, "step": 560000 }, { - "epoch": 0.61, - "eval_loss": 2.0237483978271484, - "eval_runtime": 866.1166, - "eval_samples_per_second": 891.127, - "eval_steps_per_second": 55.696, + "epoch": 1.18, + "eval_loss": 1.9506231546401978, + "eval_runtime": 448.9862, + "eval_samples_per_second": 890.896, + "eval_steps_per_second": 55.681, "step": 560000 }, { - "epoch": 0.62, - "eval_loss": 2.0231337547302246, - "eval_runtime": 863.3507, - "eval_samples_per_second": 893.982, - "eval_steps_per_second": 55.874, + "epoch": 1.2, + "eval_loss": 1.9504673480987549, + "eval_runtime": 448.899, + "eval_samples_per_second": 891.069, + "eval_steps_per_second": 55.692, "step": 568000 }, { - "epoch": 0.63, + "epoch": 1.21, "learning_rate": 3.116e-07, - "loss": 2.1502, + "loss": 2.0864, "step": 576000 }, { - "epoch": 0.63, - "eval_loss": 2.0205323696136475, - "eval_runtime": 857.8171, - "eval_samples_per_second": 899.749, - "eval_steps_per_second": 56.235, + "epoch": 1.21, + "eval_loss": 1.9530742168426514, + "eval_runtime": 450.2299, + "eval_samples_per_second": 888.435, + "eval_steps_per_second": 55.527, "step": 576000 }, { - "epoch": 0.64, - "eval_loss": 2.021655559539795, - "eval_runtime": 853.6943, - "eval_samples_per_second": 904.094, - "eval_steps_per_second": 56.506, + "epoch": 1.23, + "eval_loss": 1.9455041885375977, + "eval_runtime": 449.7657, + "eval_samples_per_second": 889.352, + "eval_steps_per_second": 55.584, "step": 584000 }, { - "epoch": 0.65, + "epoch": 1.25, "learning_rate": 3.0886666666666667e-07, - "loss": 2.1424, + "loss": 2.0893, "step": 592000 }, { - "epoch": 0.65, - "eval_loss": 2.024162769317627, - "eval_runtime": 861.2895, - "eval_samples_per_second": 896.121, - "eval_steps_per_second": 56.008, + "epoch": 1.25, + "eval_loss": 1.9471276998519897, + "eval_runtime": 450.0648, + "eval_samples_per_second": 888.761, + "eval_steps_per_second": 55.548, "step": 592000 }, { - "epoch": 0.65, - "eval_loss": 2.0237643718719482, - "eval_runtime": 859.5317, - "eval_samples_per_second": 897.954, - "eval_steps_per_second": 56.122, + "epoch": 1.26, + "eval_loss": 1.9539045095443726, + "eval_runtime": 449.915, + "eval_samples_per_second": 889.057, + "eval_steps_per_second": 55.566, "step": 600000 }, { - "epoch": 0.66, + "epoch": 1.28, "learning_rate": 3.061333333333333e-07, - "loss": 2.1469, + "loss": 2.0808, "step": 608000 }, { - "epoch": 0.66, - "eval_loss": 2.0191547870635986, - "eval_runtime": 855.9495, - "eval_samples_per_second": 901.712, - "eval_steps_per_second": 56.357, + "epoch": 1.28, + "eval_loss": 1.9455146789550781, + "eval_runtime": 450.3195, + "eval_samples_per_second": 888.258, + "eval_steps_per_second": 55.516, "step": 608000 }, { - "epoch": 0.67, - "eval_loss": 2.024866819381714, - "eval_runtime": 857.0469, - "eval_samples_per_second": 900.557, - "eval_steps_per_second": 56.285, + "epoch": 1.3, + "eval_loss": 1.9497132301330566, + "eval_runtime": 452.443, + "eval_samples_per_second": 884.089, + "eval_steps_per_second": 55.256, "step": 616000 }, { - "epoch": 0.68, + "epoch": 1.31, "learning_rate": 3.034e-07, - "loss": 2.145, + "loss": 2.0838, "step": 624000 }, { - "epoch": 0.68, - "eval_loss": 2.0195770263671875, - "eval_runtime": 858.8544, - "eval_samples_per_second": 898.662, - "eval_steps_per_second": 56.167, + "epoch": 1.31, + "eval_loss": 1.946612000465393, + "eval_runtime": 452.7896, + "eval_samples_per_second": 883.413, + "eval_steps_per_second": 55.213, "step": 624000 }, { - "epoch": 0.69, - "eval_loss": 2.022365093231201, - "eval_runtime": 854.0414, - "eval_samples_per_second": 903.727, - "eval_steps_per_second": 56.483, + "epoch": 1.33, + "eval_loss": 1.9498077630996704, + "eval_runtime": 453.0599, + "eval_samples_per_second": 882.886, + "eval_steps_per_second": 55.18, "step": 632000 }, { - "epoch": 0.7, + "epoch": 1.35, "learning_rate": 3.0066666666666663e-07, - "loss": 2.1503, + "loss": 2.0812, "step": 640000 }, { - "epoch": 0.7, - "eval_loss": 2.0216493606567383, - "eval_runtime": 854.8203, - "eval_samples_per_second": 902.903, - "eval_steps_per_second": 56.432, + "epoch": 1.35, + "eval_loss": 1.9510040283203125, + "eval_runtime": 451.3079, + "eval_samples_per_second": 886.313, + "eval_steps_per_second": 55.395, "step": 640000 }, { - "epoch": 0.71, - "eval_loss": 2.022836208343506, - "eval_runtime": 857.6145, - "eval_samples_per_second": 899.962, - "eval_steps_per_second": 56.248, + "epoch": 1.36, + "eval_loss": 1.9526185989379883, + "eval_runtime": 453.3541, + "eval_samples_per_second": 882.313, + "eval_steps_per_second": 55.145, "step": 648000 }, { - "epoch": 0.72, + "epoch": 1.38, "learning_rate": 2.9793333333333334e-07, - "loss": 2.1355, + "loss": 2.0793, "step": 656000 }, { - "epoch": 0.72, - "eval_loss": 2.019666910171509, - "eval_runtime": 859.7029, - "eval_samples_per_second": 897.775, - "eval_steps_per_second": 56.111, + "epoch": 1.38, + "eval_loss": 1.9471242427825928, + "eval_runtime": 453.3637, + "eval_samples_per_second": 882.294, + "eval_steps_per_second": 55.143, "step": 656000 }, { - "epoch": 0.72, - "eval_loss": 2.0240182876586914, - "eval_runtime": 858.0715, - "eval_samples_per_second": 899.482, - "eval_steps_per_second": 56.218, + "epoch": 1.4, + "eval_loss": 1.9469006061553955, + "eval_runtime": 450.6009, + "eval_samples_per_second": 887.703, + "eval_steps_per_second": 55.481, "step": 664000 }, { - "epoch": 0.73, + "epoch": 1.41, "learning_rate": 2.952e-07, - "loss": 2.1392, + "loss": 2.0789, "step": 672000 }, { - "epoch": 0.73, - "eval_loss": 2.0232093334198, - "eval_runtime": 856.593, - "eval_samples_per_second": 901.035, - "eval_steps_per_second": 56.315, + "epoch": 1.41, + "eval_loss": 1.9454900026321411, + "eval_runtime": 447.7405, + "eval_samples_per_second": 893.375, + "eval_steps_per_second": 55.836, "step": 672000 }, { - "epoch": 0.74, - "eval_loss": 2.020932912826538, - "eval_runtime": 858.8309, - "eval_samples_per_second": 898.687, - "eval_steps_per_second": 56.168, + "epoch": 1.43, + "eval_loss": 1.9469287395477295, + "eval_runtime": 447.6789, + "eval_samples_per_second": 893.498, + "eval_steps_per_second": 55.844, "step": 680000 }, { - "epoch": 0.75, + "epoch": 1.45, "learning_rate": 2.9246666666666665e-07, - "loss": 2.1378, + "loss": 2.0883, "step": 688000 }, { - "epoch": 0.75, - "eval_loss": 2.0219063758850098, - "eval_runtime": 860.0126, - "eval_samples_per_second": 897.452, - "eval_steps_per_second": 56.091, + "epoch": 1.45, + "eval_loss": 1.94387948513031, + "eval_runtime": 448.8787, + "eval_samples_per_second": 891.109, + "eval_steps_per_second": 55.694, "step": 688000 }, { - "epoch": 0.76, - "eval_loss": 2.019192695617676, - "eval_runtime": 861.8149, - "eval_samples_per_second": 895.575, - "eval_steps_per_second": 55.974, + "epoch": 1.47, + "eval_loss": 1.9438937902450562, + "eval_runtime": 448.7929, + "eval_samples_per_second": 891.28, + "eval_steps_per_second": 55.705, "step": 696000 }, { - "epoch": 0.77, + "epoch": 1.48, "learning_rate": 2.897333333333333e-07, - "loss": 2.1446, + "loss": 2.09, "step": 704000 }, { - "epoch": 0.77, - "eval_loss": 2.0194740295410156, - "eval_runtime": 857.8914, - "eval_samples_per_second": 899.671, - "eval_steps_per_second": 56.23, + "epoch": 1.48, + "eval_loss": 1.9415992498397827, + "eval_runtime": 449.4189, + "eval_samples_per_second": 890.038, + "eval_steps_per_second": 55.627, "step": 704000 }, { - "epoch": 0.78, - "eval_loss": 2.01971173286438, - "eval_runtime": 857.8638, - "eval_samples_per_second": 899.7, - "eval_steps_per_second": 56.232, + "epoch": 1.5, + "eval_loss": 1.9492276906967163, + "eval_runtime": 448.9056, + "eval_samples_per_second": 891.056, + "eval_steps_per_second": 55.691, "step": 712000 }, { - "epoch": 0.79, + "epoch": 1.52, "learning_rate": 2.8699999999999996e-07, - "loss": 2.1351, + "loss": 2.0845, "step": 720000 }, { - "epoch": 0.79, - "eval_loss": 2.0183634757995605, - "eval_runtime": 857.8713, - "eval_samples_per_second": 899.692, - "eval_steps_per_second": 56.231, + "epoch": 1.52, + "eval_loss": 1.943044900894165, + "eval_runtime": 449.7144, + "eval_samples_per_second": 889.453, + "eval_steps_per_second": 55.591, "step": 720000 }, { - "epoch": 0.79, - "eval_loss": 2.0162270069122314, - "eval_runtime": 857.9238, - "eval_samples_per_second": 899.637, - "eval_steps_per_second": 56.228, + "epoch": 1.53, + "eval_loss": 1.948350191116333, + "eval_runtime": 450.4171, + "eval_samples_per_second": 888.066, + "eval_steps_per_second": 55.504, "step": 728000 }, { - "epoch": 0.8, + "epoch": 1.55, "learning_rate": 2.8426666666666667e-07, - "loss": 2.1437, + "loss": 2.0742, "step": 736000 }, { - "epoch": 0.8, - "eval_loss": 2.015068531036377, - "eval_runtime": 857.7851, - "eval_samples_per_second": 899.783, - "eval_steps_per_second": 56.237, + "epoch": 1.55, + "eval_loss": 1.9455984830856323, + "eval_runtime": 449.7084, + "eval_samples_per_second": 889.465, + "eval_steps_per_second": 55.592, "step": 736000 }, { - "epoch": 0.81, - "eval_loss": 2.0202245712280273, - "eval_runtime": 857.6732, - "eval_samples_per_second": 899.9, - "eval_steps_per_second": 56.244, + "epoch": 1.57, + "eval_loss": 1.9379788637161255, + "eval_runtime": 450.7564, + "eval_samples_per_second": 887.397, + "eval_steps_per_second": 55.462, "step": 744000 }, { - "epoch": 0.82, + "epoch": 1.58, "learning_rate": 2.815333333333333e-07, - "loss": 2.1249, + "loss": 2.0839, "step": 752000 }, { - "epoch": 0.82, - "eval_loss": 2.0169003009796143, - "eval_runtime": 860.8823, - "eval_samples_per_second": 896.545, - "eval_steps_per_second": 56.034, + "epoch": 1.58, + "eval_loss": 1.9418160915374756, + "eval_runtime": 448.5558, + "eval_samples_per_second": 891.751, + "eval_steps_per_second": 55.734, "step": 752000 }, { - "epoch": 0.83, - "eval_loss": 2.018857002258301, - "eval_runtime": 856.9399, - "eval_samples_per_second": 900.67, - "eval_steps_per_second": 56.292, + "epoch": 1.6, + "eval_loss": 1.943367600440979, + "eval_runtime": 449.0202, + "eval_samples_per_second": 890.829, + "eval_steps_per_second": 55.677, "step": 760000 }, { - "epoch": 0.84, + "epoch": 1.62, "learning_rate": 2.7880000000000003e-07, - "loss": 2.1355, + "loss": 2.0806, "step": 768000 }, { - "epoch": 0.84, - "eval_loss": 2.022115707397461, - "eval_runtime": 860.0914, - "eval_samples_per_second": 897.37, - "eval_steps_per_second": 56.086, + "epoch": 1.62, + "eval_loss": 1.9450091123580933, + "eval_runtime": 450.2134, + "eval_samples_per_second": 888.468, + "eval_steps_per_second": 55.529, "step": 768000 }, { - "epoch": 0.85, - "eval_loss": 2.0194284915924072, - "eval_runtime": 858.1451, - "eval_samples_per_second": 899.405, - "eval_steps_per_second": 56.213, + "epoch": 1.63, + "eval_loss": 1.9426227807998657, + "eval_runtime": 450.7888, + "eval_samples_per_second": 887.334, + "eval_steps_per_second": 55.458, "step": 776000 }, { - "epoch": 0.86, + "epoch": 1.65, "learning_rate": 2.7606666666666664e-07, - "loss": 2.1387, + "loss": 2.0805, "step": 784000 }, { - "epoch": 0.86, - "eval_loss": 2.018942356109619, - "eval_runtime": 862.7177, - "eval_samples_per_second": 894.638, - "eval_steps_per_second": 55.915, + "epoch": 1.65, + "eval_loss": 1.944136619567871, + "eval_runtime": 450.1142, + "eval_samples_per_second": 888.663, + "eval_steps_per_second": 55.541, "step": 784000 }, { - "epoch": 0.86, - "eval_loss": 2.016535520553589, - "eval_runtime": 858.1148, - "eval_samples_per_second": 899.437, - "eval_steps_per_second": 56.215, + "epoch": 1.67, + "eval_loss": 1.945942997932434, + "eval_runtime": 449.1687, + "eval_samples_per_second": 890.534, + "eval_steps_per_second": 55.658, "step": 792000 }, { - "epoch": 0.87, + "epoch": 1.68, "learning_rate": 2.733333333333333e-07, - "loss": 2.1334, + "loss": 2.0833, "step": 800000 }, { - "epoch": 0.87, - "eval_loss": 2.0169451236724854, - "eval_runtime": 860.3041, - "eval_samples_per_second": 897.148, - "eval_steps_per_second": 56.072, + "epoch": 1.68, + "eval_loss": 1.9434839487075806, + "eval_runtime": 450.7395, + "eval_samples_per_second": 887.43, + "eval_steps_per_second": 55.464, "step": 800000 }, { - "epoch": 0.88, - "eval_loss": 2.0188918113708496, - "eval_runtime": 861.004, - "eval_samples_per_second": 896.419, - "eval_steps_per_second": 56.026, + "epoch": 1.7, + "eval_loss": 1.9455143213272095, + "eval_runtime": 449.9182, + "eval_samples_per_second": 889.051, + "eval_steps_per_second": 55.566, "step": 808000 }, { - "epoch": 0.89, + "epoch": 1.72, "learning_rate": 2.706e-07, - "loss": 2.137, + "loss": 2.0763, "step": 816000 }, { - "epoch": 0.89, - "eval_loss": 2.016237258911133, - "eval_runtime": 862.2544, - "eval_samples_per_second": 895.119, - "eval_steps_per_second": 55.945, + "epoch": 1.72, + "eval_loss": 1.9420539140701294, + "eval_runtime": 452.5501, + "eval_samples_per_second": 883.88, + "eval_steps_per_second": 55.242, "step": 816000 }, { - "epoch": 0.9, - "eval_loss": 2.0168325901031494, - "eval_runtime": 860.8877, - "eval_samples_per_second": 896.54, - "eval_steps_per_second": 56.034, + "epoch": 1.73, + "eval_loss": 1.9438146352767944, + "eval_runtime": 450.6041, + "eval_samples_per_second": 887.697, + "eval_steps_per_second": 55.481, "step": 824000 }, { - "epoch": 0.91, + "epoch": 1.75, "learning_rate": 2.6786666666666666e-07, - "loss": 2.1331, + "loss": 2.0758, "step": 832000 }, { - "epoch": 0.91, - "eval_loss": 2.0192737579345703, - "eval_runtime": 859.4597, - "eval_samples_per_second": 898.029, - "eval_steps_per_second": 56.127, + "epoch": 1.75, + "eval_loss": 1.937113642692566, + "eval_runtime": 452.047, + "eval_samples_per_second": 884.864, + "eval_steps_per_second": 55.304, "step": 832000 }, { - "epoch": 0.92, - "eval_loss": 2.016619920730591, - "eval_runtime": 863.1851, - "eval_samples_per_second": 894.153, - "eval_steps_per_second": 55.885, + "epoch": 1.77, + "eval_loss": 1.9432255029678345, + "eval_runtime": 451.3314, + "eval_samples_per_second": 886.267, + "eval_steps_per_second": 55.392, "step": 840000 }, { - "epoch": 0.93, + "epoch": 1.79, "learning_rate": 2.651333333333333e-07, - "loss": 2.1293, + "loss": 2.0888, "step": 848000 }, { - "epoch": 0.93, - "eval_loss": 2.013720989227295, - "eval_runtime": 863.4541, - "eval_samples_per_second": 893.875, - "eval_steps_per_second": 55.867, + "epoch": 1.79, + "eval_loss": 1.941444993019104, + "eval_runtime": 449.4111, + "eval_samples_per_second": 890.054, + "eval_steps_per_second": 55.628, "step": 848000 }, { - "epoch": 0.93, - "eval_loss": 2.018291711807251, - "eval_runtime": 877.742, - "eval_samples_per_second": 879.324, - "eval_steps_per_second": 54.958, + "epoch": 1.8, + "eval_loss": 1.9443625211715698, + "eval_runtime": 449.8026, + "eval_samples_per_second": 889.279, + "eval_steps_per_second": 55.58, "step": 856000 }, { - "epoch": 0.94, + "epoch": 1.82, "learning_rate": 2.624e-07, - "loss": 2.1358, + "loss": 2.0786, "step": 864000 }, { - "epoch": 0.94, - "eval_loss": 2.018421173095703, - "eval_runtime": 873.6563, - "eval_samples_per_second": 883.437, - "eval_steps_per_second": 55.215, + "epoch": 1.82, + "eval_loss": 1.9407739639282227, + "eval_runtime": 450.2493, + "eval_samples_per_second": 888.397, + "eval_steps_per_second": 55.525, "step": 864000 }, { - "epoch": 0.95, - "eval_loss": 2.017104387283325, - "eval_runtime": 874.261, - "eval_samples_per_second": 882.826, - "eval_steps_per_second": 55.177, + "epoch": 1.84, + "eval_loss": 1.9396659135818481, + "eval_runtime": 452.6466, + "eval_samples_per_second": 883.692, + "eval_steps_per_second": 55.231, "step": 872000 }, { - "epoch": 0.96, + "epoch": 1.85, "learning_rate": 2.596666666666667e-07, - "loss": 2.1296, + "loss": 2.079, "step": 880000 }, { - "epoch": 0.96, - "eval_loss": 2.0179190635681152, - "eval_runtime": 874.7051, - "eval_samples_per_second": 882.377, - "eval_steps_per_second": 55.149, + "epoch": 1.85, + "eval_loss": 1.9406061172485352, + "eval_runtime": 451.5534, + "eval_samples_per_second": 885.831, + "eval_steps_per_second": 55.364, "step": 880000 }, { - "epoch": 0.97, - "eval_loss": 2.015188455581665, - "eval_runtime": 875.6595, - "eval_samples_per_second": 881.416, - "eval_steps_per_second": 55.089, + "epoch": 1.87, + "eval_loss": 1.944212555885315, + "eval_runtime": 452.1338, + "eval_samples_per_second": 884.694, + "eval_steps_per_second": 55.293, "step": 888000 }, { - "epoch": 0.98, + "epoch": 1.89, "learning_rate": 2.5693333333333333e-07, - "loss": 2.1319, + "loss": 2.0817, "step": 896000 }, { - "epoch": 0.98, - "eval_loss": 2.0173678398132324, - "eval_runtime": 877.4749, - "eval_samples_per_second": 879.592, - "eval_steps_per_second": 54.975, + "epoch": 1.89, + "eval_loss": 1.9403553009033203, + "eval_runtime": 450.9134, + "eval_samples_per_second": 887.088, + "eval_steps_per_second": 55.443, "step": 896000 }, { - "epoch": 0.99, - "eval_loss": 2.020580291748047, - "eval_runtime": 874.219, - "eval_samples_per_second": 882.868, - "eval_steps_per_second": 55.18, + "epoch": 1.9, + "eval_loss": 1.945005178451538, + "eval_runtime": 453.0225, + "eval_samples_per_second": 882.958, + "eval_steps_per_second": 55.185, "step": 904000 }, { - "epoch": 1.0, + "epoch": 1.92, "learning_rate": 2.542e-07, - "loss": 2.1344, + "loss": 2.0792, "step": 912000 }, { - "epoch": 1.0, - "eval_loss": 2.0178616046905518, - "eval_runtime": 871.4372, - "eval_samples_per_second": 885.686, - "eval_steps_per_second": 55.356, + "epoch": 1.92, + "eval_loss": 1.9379777908325195, + "eval_runtime": 450.9233, + "eval_samples_per_second": 887.069, + "eval_steps_per_second": 55.442, "step": 912000 }, { - "epoch": 1.0, - "eval_loss": 2.0153729915618896, - "eval_runtime": 874.8229, - "eval_samples_per_second": 882.259, - "eval_steps_per_second": 55.141, + "epoch": 1.94, + "eval_loss": 1.938461422920227, + "eval_runtime": 451.8903, + "eval_samples_per_second": 885.171, + "eval_steps_per_second": 55.323, "step": 920000 }, { - "epoch": 1.01, + "epoch": 1.95, "learning_rate": 2.5146666666666664e-07, - "loss": 2.1352, + "loss": 2.0741, "step": 928000 }, { - "epoch": 1.01, - "eval_loss": 2.018483877182007, - "eval_runtime": 876.0163, - "eval_samples_per_second": 881.057, - "eval_steps_per_second": 55.066, + "epoch": 1.95, + "eval_loss": 1.9448977708816528, + "eval_runtime": 452.6642, + "eval_samples_per_second": 883.657, + "eval_steps_per_second": 55.229, "step": 928000 }, { - "epoch": 1.02, - "eval_loss": 2.016976833343506, - "eval_runtime": 878.2619, - "eval_samples_per_second": 878.804, - "eval_steps_per_second": 54.926, + "epoch": 1.97, + "eval_loss": 1.9413866996765137, + "eval_runtime": 452.86, + "eval_samples_per_second": 883.275, + "eval_steps_per_second": 55.205, "step": 936000 }, { - "epoch": 1.03, + "epoch": 1.99, "learning_rate": 2.4873333333333335e-07, - "loss": 2.1336, + "loss": 2.0832, "step": 944000 }, { - "epoch": 1.03, - "eval_loss": 2.016388416290283, - "eval_runtime": 877.6593, - "eval_samples_per_second": 879.407, - "eval_steps_per_second": 54.963, + "epoch": 1.99, + "eval_loss": 1.9401600360870361, + "eval_runtime": 451.3974, + "eval_samples_per_second": 886.137, + "eval_steps_per_second": 55.384, "step": 944000 }, { - "epoch": 1.04, - "eval_loss": 2.013742208480835, - "eval_runtime": 871.0407, - "eval_samples_per_second": 886.09, - "eval_steps_per_second": 55.381, + "epoch": 2.0, + "eval_loss": 1.940971851348877, + "eval_runtime": 452.3973, + "eval_samples_per_second": 884.178, + "eval_steps_per_second": 55.261, "step": 952000 }, { - "epoch": 1.05, + "epoch": 2.02, "learning_rate": 2.46e-07, - "loss": 2.1315, + "loss": 2.0695, "step": 960000 }, { - "epoch": 1.05, - "eval_loss": 2.0176327228546143, - "eval_runtime": 877.004, - "eval_samples_per_second": 880.064, - "eval_steps_per_second": 55.004, + "epoch": 2.02, + "eval_loss": 1.9370893239974976, + "eval_runtime": 453.3847, + "eval_samples_per_second": 882.253, + "eval_steps_per_second": 55.141, "step": 960000 }, { - "epoch": 1.06, - "eval_loss": 2.0155346393585205, - "eval_runtime": 872.5922, - "eval_samples_per_second": 884.514, - "eval_steps_per_second": 55.282, + "epoch": 2.04, + "eval_loss": 1.934204339981079, + "eval_runtime": 452.8123, + "eval_samples_per_second": 883.368, + "eval_steps_per_second": 55.211, "step": 968000 }, { - "epoch": 1.06, + "epoch": 2.05, "learning_rate": 2.4326666666666666e-07, - "loss": 2.1255, + "loss": 2.0813, "step": 976000 }, { - "epoch": 1.06, - "eval_loss": 2.014533281326294, - "eval_runtime": 871.4139, - "eval_samples_per_second": 885.71, - "eval_steps_per_second": 55.357, + "epoch": 2.05, + "eval_loss": 1.9375728368759155, + "eval_runtime": 454.1486, + "eval_samples_per_second": 880.769, + "eval_steps_per_second": 55.048, "step": 976000 }, { - "epoch": 1.07, - "eval_loss": 2.023314952850342, - "eval_runtime": 879.3224, - "eval_samples_per_second": 877.744, - "eval_steps_per_second": 54.859, + "epoch": 2.07, + "eval_loss": 1.939716100692749, + "eval_runtime": 451.3741, + "eval_samples_per_second": 886.183, + "eval_steps_per_second": 55.386, "step": 984000 }, { - "epoch": 1.08, + "epoch": 2.09, "learning_rate": 2.405333333333333e-07, - "loss": 2.1249, + "loss": 2.0804, "step": 992000 }, { - "epoch": 1.08, - "eval_loss": 2.0147762298583984, - "eval_runtime": 866.8225, - "eval_samples_per_second": 890.401, - "eval_steps_per_second": 55.65, + "epoch": 2.09, + "eval_loss": 1.9394439458847046, + "eval_runtime": 453.2345, + "eval_samples_per_second": 882.545, + "eval_steps_per_second": 55.159, "step": 992000 }, { - "epoch": 1.09, - "eval_loss": 2.016249895095825, - "eval_runtime": 867.6683, - "eval_samples_per_second": 889.533, - "eval_steps_per_second": 55.596, + "epoch": 2.11, + "eval_loss": 1.9370408058166504, + "eval_runtime": 453.6157, + "eval_samples_per_second": 881.804, + "eval_steps_per_second": 55.113, "step": 1000000 }, { - "epoch": 1.1, + "epoch": 2.12, "learning_rate": 2.3779999999999997e-07, - "loss": 2.123, + "loss": 2.0789, "step": 1008000 }, { - "epoch": 1.1, - "eval_loss": 2.017381191253662, - "eval_runtime": 868.2141, - "eval_samples_per_second": 888.974, - "eval_steps_per_second": 55.561, + "epoch": 2.12, + "eval_loss": 1.93497896194458, + "eval_runtime": 452.9534, + "eval_samples_per_second": 883.093, + "eval_steps_per_second": 55.193, "step": 1008000 }, { - "epoch": 1.11, - "eval_loss": 2.015009880065918, - "eval_runtime": 865.5792, - "eval_samples_per_second": 891.68, - "eval_steps_per_second": 55.73, + "epoch": 2.14, + "eval_loss": 1.9327107667922974, + "eval_runtime": 453.3718, + "eval_samples_per_second": 882.278, + "eval_steps_per_second": 55.142, "step": 1016000 }, { - "epoch": 1.12, + "epoch": 2.16, "learning_rate": 2.3506666666666668e-07, - "loss": 2.1263, + "loss": 2.0754, "step": 1024000 }, { - "epoch": 1.12, - "eval_loss": 2.0160863399505615, - "eval_runtime": 869.2474, - "eval_samples_per_second": 887.917, - "eval_steps_per_second": 55.495, + "epoch": 2.16, + "eval_loss": 1.9420740604400635, + "eval_runtime": 454.3554, + "eval_samples_per_second": 880.368, + "eval_steps_per_second": 55.023, "step": 1024000 }, { - "epoch": 1.13, - "eval_loss": 2.0128889083862305, - "eval_runtime": 866.9502, - "eval_samples_per_second": 890.27, - "eval_steps_per_second": 55.642, + "epoch": 2.17, + "eval_loss": 1.937127947807312, + "eval_runtime": 453.978, + "eval_samples_per_second": 881.1, + "eval_steps_per_second": 55.069, "step": 1032000 }, { - "epoch": 1.13, + "epoch": 2.19, "learning_rate": 2.3233333333333334e-07, - "loss": 2.1232, + "loss": 2.0774, "step": 1040000 }, { - "epoch": 1.13, - "eval_loss": 2.0166754722595215, - "eval_runtime": 901.7962, - "eval_samples_per_second": 855.87, - "eval_steps_per_second": 53.492, + "epoch": 2.19, + "eval_loss": 1.9410624504089355, + "eval_runtime": 453.3501, + "eval_samples_per_second": 882.32, + "eval_steps_per_second": 55.145, "step": 1040000 }, { - "epoch": 1.14, - "eval_loss": 2.012477397918701, - "eval_runtime": 911.6669, - "eval_samples_per_second": 846.603, - "eval_steps_per_second": 52.913, + "epoch": 2.21, + "eval_loss": 1.9337198734283447, + "eval_runtime": 454.7842, + "eval_samples_per_second": 879.538, + "eval_steps_per_second": 54.971, "step": 1048000 }, { - "epoch": 1.15, + "epoch": 2.22, "learning_rate": 2.2960000000000002e-07, - "loss": 2.1168, + "loss": 2.0766, "step": 1056000 }, { - "epoch": 1.15, - "eval_loss": 2.0113391876220703, - "eval_runtime": 912.2557, - "eval_samples_per_second": 846.057, - "eval_steps_per_second": 52.879, + "epoch": 2.22, + "eval_loss": 1.9387423992156982, + "eval_runtime": 455.5914, + "eval_samples_per_second": 877.98, + "eval_steps_per_second": 54.874, "step": 1056000 }, { - "epoch": 1.16, - "eval_loss": 2.013575792312622, - "eval_runtime": 901.3301, - "eval_samples_per_second": 856.312, - "eval_steps_per_second": 53.52, + "epoch": 2.24, + "eval_loss": 1.9334228038787842, + "eval_runtime": 454.6433, + "eval_samples_per_second": 879.811, + "eval_steps_per_second": 54.988, "step": 1064000 }, { - "epoch": 1.17, + "epoch": 2.26, "learning_rate": 2.2686666666666667e-07, - "loss": 2.1307, + "loss": 2.079, "step": 1072000 }, { - "epoch": 1.17, - "eval_loss": 2.014338254928589, - "eval_runtime": 891.2807, - "eval_samples_per_second": 865.967, - "eval_steps_per_second": 54.123, + "epoch": 2.26, + "eval_loss": 1.938550353050232, + "eval_runtime": 454.9961, + "eval_samples_per_second": 879.128, + "eval_steps_per_second": 54.946, "step": 1072000 }, { - "epoch": 1.18, - "eval_loss": 2.0166401863098145, - "eval_runtime": 886.4005, - "eval_samples_per_second": 870.735, - "eval_steps_per_second": 54.421, + "epoch": 2.27, + "eval_loss": 1.9334533214569092, + "eval_runtime": 453.0142, + "eval_samples_per_second": 882.975, + "eval_steps_per_second": 55.186, "step": 1080000 }, { - "epoch": 1.19, + "epoch": 2.29, "learning_rate": 2.2413333333333333e-07, - "loss": 2.1336, + "loss": 2.068, "step": 1088000 }, { - "epoch": 1.19, - "eval_loss": 2.0103185176849365, - "eval_runtime": 886.4458, - "eval_samples_per_second": 870.691, - "eval_steps_per_second": 54.418, + "epoch": 2.29, + "eval_loss": 1.9363125562667847, + "eval_runtime": 452.9254, + "eval_samples_per_second": 883.148, + "eval_steps_per_second": 55.197, "step": 1088000 }, { - "epoch": 1.2, - "eval_loss": 2.0129764080047607, - "eval_runtime": 890.355, - "eval_samples_per_second": 866.868, - "eval_steps_per_second": 54.18, + "epoch": 2.31, + "eval_loss": 1.9420162439346313, + "eval_runtime": 453.9753, + "eval_samples_per_second": 881.105, + "eval_steps_per_second": 55.069, "step": 1096000 }, { - "epoch": 1.2, + "epoch": 2.32, "learning_rate": 2.214e-07, - "loss": 2.1227, + "loss": 2.0786, "step": 1104000 }, { - "epoch": 1.2, - "eval_loss": 2.012451648712158, - "eval_runtime": 895.3428, - "eval_samples_per_second": 862.039, - "eval_steps_per_second": 53.878, + "epoch": 2.32, + "eval_loss": 1.9330793619155884, + "eval_runtime": 454.6549, + "eval_samples_per_second": 879.788, + "eval_steps_per_second": 54.987, "step": 1104000 }, { - "epoch": 1.21, - "eval_loss": 2.0183231830596924, - "eval_runtime": 888.3913, - "eval_samples_per_second": 868.784, - "eval_steps_per_second": 54.299, + "epoch": 2.34, + "eval_loss": 1.9327301979064941, + "eval_runtime": 455.7252, + "eval_samples_per_second": 877.722, + "eval_steps_per_second": 54.858, "step": 1112000 }, { - "epoch": 1.22, + "epoch": 2.36, "learning_rate": 2.1866666666666667e-07, - "loss": 2.1223, + "loss": 2.0734, "step": 1120000 }, { - "epoch": 1.22, - "eval_loss": 2.014848470687866, - "eval_runtime": 889.5583, - "eval_samples_per_second": 867.644, - "eval_steps_per_second": 54.228, + "epoch": 2.36, + "eval_loss": 1.939077615737915, + "eval_runtime": 456.0008, + "eval_samples_per_second": 877.191, + "eval_steps_per_second": 54.824, "step": 1120000 }, { - "epoch": 1.23, - "eval_loss": 2.0147109031677246, - "eval_runtime": 884.3146, - "eval_samples_per_second": 872.789, - "eval_steps_per_second": 54.55, + "epoch": 2.37, + "eval_loss": 1.9362618923187256, + "eval_runtime": 455.372, + "eval_samples_per_second": 878.403, + "eval_steps_per_second": 54.9, "step": 1128000 }, { - "epoch": 1.24, + "epoch": 2.39, "learning_rate": 2.1593333333333332e-07, - "loss": 2.1289, + "loss": 2.0787, "step": 1136000 }, { - "epoch": 1.24, - "eval_loss": 2.0108699798583984, - "eval_runtime": 888.3584, - "eval_samples_per_second": 868.816, - "eval_steps_per_second": 54.301, + "epoch": 2.39, + "eval_loss": 1.932115912437439, + "eval_runtime": 454.5174, + "eval_samples_per_second": 880.054, + "eval_steps_per_second": 55.003, "step": 1136000 }, { - "epoch": 1.25, - "eval_loss": 2.0163819789886475, - "eval_runtime": 887.4195, - "eval_samples_per_second": 869.735, - "eval_steps_per_second": 54.359, + "epoch": 2.41, + "eval_loss": 1.9333146810531616, + "eval_runtime": 458.093, + "eval_samples_per_second": 873.185, + "eval_steps_per_second": 54.574, "step": 1144000 }, { - "epoch": 1.26, + "epoch": 2.43, "learning_rate": 2.132e-07, - "loss": 2.1278, + "loss": 2.0731, "step": 1152000 }, { - "epoch": 1.26, - "eval_loss": 2.0163345336914062, - "eval_runtime": 886.1604, - "eval_samples_per_second": 870.971, - "eval_steps_per_second": 54.436, + "epoch": 2.43, + "eval_loss": 1.9368531703948975, + "eval_runtime": 454.4244, + "eval_samples_per_second": 880.234, + "eval_steps_per_second": 55.015, "step": 1152000 }, { - "epoch": 1.27, - "eval_loss": 2.012103319168091, - "eval_runtime": 889.5174, - "eval_samples_per_second": 867.684, - "eval_steps_per_second": 54.231, + "epoch": 2.44, + "eval_loss": 1.9357047080993652, + "eval_runtime": 456.0905, + "eval_samples_per_second": 877.019, + "eval_steps_per_second": 54.814, "step": 1160000 }, { - "epoch": 1.27, + "epoch": 2.46, "learning_rate": 2.1046666666666666e-07, - "loss": 2.1261, + "loss": 2.0816, "step": 1168000 }, { - "epoch": 1.27, - "eval_loss": 2.011343240737915, - "eval_runtime": 890.9332, - "eval_samples_per_second": 866.305, - "eval_steps_per_second": 54.144, + "epoch": 2.46, + "eval_loss": 1.9352905750274658, + "eval_runtime": 457.1609, + "eval_samples_per_second": 874.966, + "eval_steps_per_second": 54.685, "step": 1168000 }, { - "epoch": 1.28, - "eval_loss": 2.0137104988098145, - "eval_runtime": 883.4659, - "eval_samples_per_second": 873.627, - "eval_steps_per_second": 54.602, + "epoch": 2.48, + "eval_loss": 1.9318699836730957, + "eval_runtime": 457.231, + "eval_samples_per_second": 874.831, + "eval_steps_per_second": 54.677, "step": 1176000 }, { - "epoch": 1.29, + "epoch": 2.49, "learning_rate": 2.0773333333333334e-07, - "loss": 2.126, + "loss": 2.0758, "step": 1184000 }, { - "epoch": 1.29, - "eval_loss": 2.015174627304077, - "eval_runtime": 885.9678, - "eval_samples_per_second": 871.16, - "eval_steps_per_second": 54.448, + "epoch": 2.49, + "eval_loss": 1.9365841150283813, + "eval_runtime": 455.9751, + "eval_samples_per_second": 877.241, + "eval_steps_per_second": 54.828, "step": 1184000 }, { - "epoch": 1.3, - "eval_loss": 2.010411500930786, - "eval_runtime": 888.6748, - "eval_samples_per_second": 868.507, - "eval_steps_per_second": 54.282, + "epoch": 2.51, + "eval_loss": 1.9300509691238403, + "eval_runtime": 460.0832, + "eval_samples_per_second": 869.408, + "eval_steps_per_second": 54.338, "step": 1192000 }, { - "epoch": 1.31, + "epoch": 2.53, "learning_rate": 2.05e-07, - "loss": 2.1235, + "loss": 2.0725, "step": 1200000 }, { - "epoch": 1.31, - "eval_loss": 2.013165235519409, - "eval_runtime": 888.6503, - "eval_samples_per_second": 868.531, - "eval_steps_per_second": 54.283, + "epoch": 2.53, + "eval_loss": 1.9328936338424683, + "eval_runtime": 456.956, + "eval_samples_per_second": 875.358, + "eval_steps_per_second": 54.71, "step": 1200000 }, { - "epoch": 1.32, - "eval_loss": 2.0113847255706787, - "eval_runtime": 884.261, - "eval_samples_per_second": 872.842, - "eval_steps_per_second": 54.553, + "epoch": 2.54, + "eval_loss": 1.937027931213379, + "eval_runtime": 455.5679, + "eval_samples_per_second": 878.025, + "eval_steps_per_second": 54.877, "step": 1208000 }, { - "epoch": 1.33, + "epoch": 2.56, "learning_rate": 2.0226666666666668e-07, - "loss": 2.1229, + "loss": 2.085, "step": 1216000 }, { - "epoch": 1.33, - "eval_loss": 2.010532855987549, - "eval_runtime": 887.5065, - "eval_samples_per_second": 869.65, - "eval_steps_per_second": 54.353, + "epoch": 2.56, + "eval_loss": 1.9251400232315063, + "eval_runtime": 457.3381, + "eval_samples_per_second": 874.626, + "eval_steps_per_second": 54.664, "step": 1216000 }, { - "epoch": 1.34, - "eval_loss": 2.0130858421325684, - "eval_runtime": 881.1399, - "eval_samples_per_second": 875.934, - "eval_steps_per_second": 54.746, + "epoch": 2.58, + "eval_loss": 1.9369462728500366, + "eval_runtime": 459.1359, + "eval_samples_per_second": 871.202, + "eval_steps_per_second": 54.45, "step": 1224000 }, { - "epoch": 1.34, + "epoch": 2.59, "learning_rate": 1.9953333333333333e-07, - "loss": 2.1213, + "loss": 2.0809, "step": 1232000 }, { - "epoch": 1.34, - "eval_loss": 2.0141072273254395, - "eval_runtime": 882.2467, - "eval_samples_per_second": 874.835, - "eval_steps_per_second": 54.677, + "epoch": 2.59, + "eval_loss": 1.9377222061157227, + "eval_runtime": 457.7839, + "eval_samples_per_second": 873.775, + "eval_steps_per_second": 54.611, "step": 1232000 }, { - "epoch": 1.35, - "eval_loss": 2.010868549346924, - "eval_runtime": 881.7078, - "eval_samples_per_second": 875.369, - "eval_steps_per_second": 54.711, + "epoch": 2.61, + "eval_loss": 1.9397977590560913, + "eval_runtime": 458.2173, + "eval_samples_per_second": 872.948, + "eval_steps_per_second": 54.559, "step": 1240000 }, { - "epoch": 1.36, + "epoch": 2.63, "learning_rate": 1.968e-07, - "loss": 2.1185, + "loss": 2.0742, "step": 1248000 }, { - "epoch": 1.36, - "eval_loss": 2.0129363536834717, - "eval_runtime": 886.2455, - "eval_samples_per_second": 870.887, - "eval_steps_per_second": 54.431, + "epoch": 2.63, + "eval_loss": 1.9367727041244507, + "eval_runtime": 456.9366, + "eval_samples_per_second": 875.395, + "eval_steps_per_second": 54.712, "step": 1248000 }, { - "epoch": 1.37, - "eval_loss": 2.011003017425537, - "eval_runtime": 888.1974, - "eval_samples_per_second": 868.974, - "eval_steps_per_second": 54.311, + "epoch": 2.64, + "eval_loss": 1.9389311075210571, + "eval_runtime": 457.4588, + "eval_samples_per_second": 874.396, + "eval_steps_per_second": 54.65, "step": 1256000 }, { - "epoch": 1.38, + "epoch": 2.66, "learning_rate": 1.9406666666666667e-07, - "loss": 2.131, + "loss": 2.0743, "step": 1264000 }, { - "epoch": 1.38, - "eval_loss": 2.01228928565979, - "eval_runtime": 884.9282, - "eval_samples_per_second": 872.184, - "eval_steps_per_second": 54.512, + "epoch": 2.66, + "eval_loss": 1.9287104606628418, + "eval_runtime": 458.2038, + "eval_samples_per_second": 872.974, + "eval_steps_per_second": 54.561, "step": 1264000 }, { - "epoch": 1.39, - "eval_loss": 2.0104737281799316, - "eval_runtime": 881.1611, - "eval_samples_per_second": 875.912, - "eval_steps_per_second": 54.745, + "epoch": 2.68, + "eval_loss": 1.9337459802627563, + "eval_runtime": 456.8104, + "eval_samples_per_second": 875.637, + "eval_steps_per_second": 54.727, "step": 1272000 }, { - "epoch": 1.4, + "epoch": 2.69, "learning_rate": 1.9133333333333333e-07, - "loss": 2.1141, + "loss": 2.0822, "step": 1280000 }, { - "epoch": 1.4, - "eval_loss": 2.010425090789795, - "eval_runtime": 882.3806, - "eval_samples_per_second": 874.702, - "eval_steps_per_second": 54.669, + "epoch": 2.69, + "eval_loss": 1.9323461055755615, + "eval_runtime": 459.5048, + "eval_samples_per_second": 870.502, + "eval_steps_per_second": 54.406, "step": 1280000 }, { - "epoch": 1.41, - "eval_loss": 2.015007734298706, - "eval_runtime": 879.3909, - "eval_samples_per_second": 877.676, - "eval_steps_per_second": 54.855, + "epoch": 2.71, + "eval_loss": 1.9348268508911133, + "eval_runtime": 459.882, + "eval_samples_per_second": 869.788, + "eval_steps_per_second": 54.362, "step": 1288000 }, { - "epoch": 1.41, + "epoch": 2.73, "learning_rate": 1.886e-07, - "loss": 2.1219, + "loss": 2.0845, "step": 1296000 }, { - "epoch": 1.41, - "eval_loss": 2.0161073207855225, - "eval_runtime": 879.4904, - "eval_samples_per_second": 877.576, - "eval_steps_per_second": 54.849, + "epoch": 2.73, + "eval_loss": 1.932786464691162, + "eval_runtime": 456.5549, + "eval_samples_per_second": 876.127, + "eval_steps_per_second": 54.758, "step": 1296000 }, { - "epoch": 1.42, - "eval_loss": 2.00930118560791, - "eval_runtime": 882.5935, - "eval_samples_per_second": 874.491, - "eval_steps_per_second": 54.656, + "epoch": 2.75, + "eval_loss": 1.9324398040771484, + "eval_runtime": 458.1511, + "eval_samples_per_second": 873.074, + "eval_steps_per_second": 54.567, "step": 1304000 }, { - "epoch": 1.43, + "epoch": 2.76, "learning_rate": 1.8586666666666666e-07, - "loss": 2.1203, + "loss": 2.0706, "step": 1312000 }, { - "epoch": 1.43, - "eval_loss": 2.0104291439056396, - "eval_runtime": 882.9969, - "eval_samples_per_second": 874.091, - "eval_steps_per_second": 54.631, + "epoch": 2.76, + "eval_loss": 1.9304131269454956, + "eval_runtime": 458.1533, + "eval_samples_per_second": 873.07, + "eval_steps_per_second": 54.567, "step": 1312000 }, { - "epoch": 1.44, - "eval_loss": 2.0144429206848145, - "eval_runtime": 878.5955, - "eval_samples_per_second": 878.47, - "eval_steps_per_second": 54.905, + "epoch": 2.78, + "eval_loss": 1.9322303533554077, + "eval_runtime": 458.9935, + "eval_samples_per_second": 871.472, + "eval_steps_per_second": 54.467, "step": 1320000 }, { - "epoch": 1.45, + "epoch": 2.8, "learning_rate": 1.8313333333333332e-07, - "loss": 2.1264, + "loss": 2.0813, "step": 1328000 }, { - "epoch": 1.45, - "eval_loss": 2.0084986686706543, - "eval_runtime": 878.8817, - "eval_samples_per_second": 878.184, - "eval_steps_per_second": 54.887, + "epoch": 2.8, + "eval_loss": 1.9320390224456787, + "eval_runtime": 460.999, + "eval_samples_per_second": 867.681, + "eval_steps_per_second": 54.23, "step": 1328000 }, { - "epoch": 1.46, - "eval_loss": 2.0118672847747803, - "eval_runtime": 880.8514, - "eval_samples_per_second": 876.22, - "eval_steps_per_second": 54.764, + "epoch": 2.81, + "eval_loss": 1.9378987550735474, + "eval_runtime": 457.836, + "eval_samples_per_second": 873.675, + "eval_steps_per_second": 54.605, "step": 1336000 }, { - "epoch": 1.47, + "epoch": 2.83, "learning_rate": 1.804e-07, - "loss": 2.1194, + "loss": 2.0768, "step": 1344000 }, { - "epoch": 1.47, - "eval_loss": 2.011784076690674, - "eval_runtime": 878.874, - "eval_samples_per_second": 878.192, - "eval_steps_per_second": 54.887, + "epoch": 2.83, + "eval_loss": 1.9283299446105957, + "eval_runtime": 457.711, + "eval_samples_per_second": 873.914, + "eval_steps_per_second": 54.62, "step": 1344000 }, { - "epoch": 1.48, - "eval_loss": 2.0109827518463135, - "eval_runtime": 893.715, - "eval_samples_per_second": 863.609, - "eval_steps_per_second": 53.976, + "epoch": 2.85, + "eval_loss": 1.9352092742919922, + "eval_runtime": 459.6254, + "eval_samples_per_second": 870.274, + "eval_steps_per_second": 54.392, "step": 1352000 }, { - "epoch": 1.48, + "epoch": 2.86, "learning_rate": 1.7766666666666666e-07, - "loss": 2.117, + "loss": 2.0776, "step": 1360000 }, { - "epoch": 1.48, - "eval_loss": 2.014660596847534, - "eval_runtime": 915.8924, - "eval_samples_per_second": 842.697, - "eval_steps_per_second": 52.669, + "epoch": 2.86, + "eval_loss": 1.9265965223312378, + "eval_runtime": 456.3209, + "eval_samples_per_second": 876.576, + "eval_steps_per_second": 54.786, "step": 1360000 }, { - "epoch": 1.49, - "eval_loss": 2.013535261154175, - "eval_runtime": 909.1816, - "eval_samples_per_second": 848.917, - "eval_steps_per_second": 53.058, + "epoch": 2.88, + "eval_loss": 1.9339468479156494, + "eval_runtime": 460.6096, + "eval_samples_per_second": 868.414, + "eval_steps_per_second": 54.276, "step": 1368000 }, { - "epoch": 1.5, + "epoch": 2.9, "learning_rate": 1.7493333333333334e-07, - "loss": 2.1311, + "loss": 2.0776, "step": 1376000 }, { - "epoch": 1.5, - "eval_loss": 2.0076611042022705, - "eval_runtime": 909.3083, - "eval_samples_per_second": 848.799, - "eval_steps_per_second": 53.05, + "epoch": 2.9, + "eval_loss": 1.9370609521865845, + "eval_runtime": 459.3688, + "eval_samples_per_second": 870.76, + "eval_steps_per_second": 54.423, "step": 1376000 }, { - "epoch": 1.51, - "eval_loss": 2.006574869155884, - "eval_runtime": 904.8344, - "eval_samples_per_second": 852.996, - "eval_steps_per_second": 53.313, + "epoch": 2.91, + "eval_loss": 1.9353028535842896, + "eval_runtime": 459.3054, + "eval_samples_per_second": 870.88, + "eval_steps_per_second": 54.43, "step": 1384000 }, { - "epoch": 1.52, + "epoch": 2.93, "learning_rate": 1.722e-07, - "loss": 2.1215, + "loss": 2.072, "step": 1392000 }, { - "epoch": 1.52, - "eval_loss": 2.008929967880249, - "eval_runtime": 903.4488, - "eval_samples_per_second": 854.304, - "eval_steps_per_second": 53.394, + "epoch": 2.93, + "eval_loss": 1.928996205329895, + "eval_runtime": 459.5911, + "eval_samples_per_second": 870.339, + "eval_steps_per_second": 54.396, "step": 1392000 }, { - "epoch": 1.53, - "eval_loss": 2.0118260383605957, - "eval_runtime": 913.6278, - "eval_samples_per_second": 844.786, - "eval_steps_per_second": 52.799, + "epoch": 2.95, + "eval_loss": 1.9337087869644165, + "eval_runtime": 457.9253, + "eval_samples_per_second": 873.505, + "eval_steps_per_second": 54.594, "step": 1400000 }, { - "epoch": 1.54, + "epoch": 2.96, "learning_rate": 1.6946666666666668e-07, - "loss": 2.1185, + "loss": 2.077, "step": 1408000 }, { - "epoch": 1.54, - "eval_loss": 2.0105414390563965, - "eval_runtime": 907.6551, - "eval_samples_per_second": 850.345, - "eval_steps_per_second": 53.147, + "epoch": 2.96, + "eval_loss": 1.931803584098816, + "eval_runtime": 459.0247, + "eval_samples_per_second": 871.413, + "eval_steps_per_second": 54.463, "step": 1408000 }, { - "epoch": 1.54, - "eval_loss": 2.012268304824829, - "eval_runtime": 903.9952, - "eval_samples_per_second": 853.788, - "eval_steps_per_second": 53.362, + "epoch": 2.98, + "eval_loss": 1.9326242208480835, + "eval_runtime": 459.782, + "eval_samples_per_second": 869.977, + "eval_steps_per_second": 54.374, "step": 1416000 }, { - "epoch": 1.55, + "epoch": 3.0, "learning_rate": 1.6673333333333333e-07, - "loss": 2.1284, + "loss": 2.0777, "step": 1424000 }, { - "epoch": 1.55, - "eval_loss": 2.0133912563323975, - "eval_runtime": 910.6028, - "eval_samples_per_second": 847.592, - "eval_steps_per_second": 52.975, + "epoch": 3.0, + "eval_loss": 1.9337918758392334, + "eval_runtime": 460.1196, + "eval_samples_per_second": 869.339, + "eval_steps_per_second": 54.334, "step": 1424000 }, { - "epoch": 1.56, - "eval_loss": 2.009307861328125, - "eval_runtime": 904.2587, - "eval_samples_per_second": 853.539, - "eval_steps_per_second": 53.346, + "epoch": 3.01, + "eval_loss": 1.9306981563568115, + "eval_runtime": 460.6247, + "eval_samples_per_second": 868.386, + "eval_steps_per_second": 54.274, "step": 1432000 }, { - "epoch": 1.57, + "epoch": 3.03, "learning_rate": 1.64e-07, - "loss": 2.1174, + "loss": 2.0846, "step": 1440000 }, { - "epoch": 1.57, - "eval_loss": 2.0101728439331055, - "eval_runtime": 912.2693, - "eval_samples_per_second": 846.044, - "eval_steps_per_second": 52.878, + "epoch": 3.03, + "eval_loss": 1.9305415153503418, + "eval_runtime": 459.7751, + "eval_samples_per_second": 869.991, + "eval_steps_per_second": 54.374, "step": 1440000 }, { - "epoch": 1.58, - "eval_loss": 2.00759220123291, - "eval_runtime": 910.2393, - "eval_samples_per_second": 847.931, - "eval_steps_per_second": 52.996, + "epoch": 3.05, + "eval_loss": 1.931223750114441, + "eval_runtime": 457.3015, + "eval_samples_per_second": 874.696, + "eval_steps_per_second": 54.669, "step": 1448000 }, { - "epoch": 1.59, + "epoch": 3.07, "learning_rate": 1.6126666666666667e-07, - "loss": 2.1108, + "loss": 2.0744, "step": 1456000 }, { - "epoch": 1.59, - "eval_loss": 2.00740909576416, - "eval_runtime": 914.6796, - "eval_samples_per_second": 843.815, - "eval_steps_per_second": 52.739, + "epoch": 3.07, + "eval_loss": 1.9331880807876587, + "eval_runtime": 459.0917, + "eval_samples_per_second": 871.286, + "eval_steps_per_second": 54.455, "step": 1456000 }, { - "epoch": 1.6, - "eval_loss": 2.007056474685669, - "eval_runtime": 908.0025, - "eval_samples_per_second": 850.02, - "eval_steps_per_second": 53.127, + "epoch": 3.08, + "eval_loss": 1.9313483238220215, + "eval_runtime": 461.1089, + "eval_samples_per_second": 867.474, + "eval_steps_per_second": 54.217, "step": 1464000 }, { - "epoch": 1.61, + "epoch": 3.1, "learning_rate": 1.5853333333333332e-07, - "loss": 2.1252, + "loss": 2.0767, "step": 1472000 }, { - "epoch": 1.61, - "eval_loss": 2.0092082023620605, - "eval_runtime": 905.6872, - "eval_samples_per_second": 852.193, - "eval_steps_per_second": 53.262, + "epoch": 3.1, + "eval_loss": 1.931079387664795, + "eval_runtime": 461.0181, + "eval_samples_per_second": 867.645, + "eval_steps_per_second": 54.228, "step": 1472000 }, { - "epoch": 1.61, - "eval_loss": 2.007967233657837, - "eval_runtime": 910.9272, - "eval_samples_per_second": 847.291, - "eval_steps_per_second": 52.956, + "epoch": 3.12, + "eval_loss": 1.9322373867034912, + "eval_runtime": 461.2145, + "eval_samples_per_second": 867.275, + "eval_steps_per_second": 54.205, "step": 1480000 }, { - "epoch": 1.62, + "epoch": 3.13, "learning_rate": 1.558e-07, - "loss": 2.121, + "loss": 2.082, "step": 1488000 }, { - "epoch": 1.62, - "eval_loss": 2.0052874088287354, - "eval_runtime": 908.8472, - "eval_samples_per_second": 849.23, - "eval_steps_per_second": 53.077, + "epoch": 3.13, + "eval_loss": 1.9361698627471924, + "eval_runtime": 460.1009, + "eval_samples_per_second": 869.375, + "eval_steps_per_second": 54.336, "step": 1488000 }, { - "epoch": 1.63, - "eval_loss": 2.0071661472320557, - "eval_runtime": 907.693, - "eval_samples_per_second": 850.31, - "eval_steps_per_second": 53.145, + "epoch": 3.15, + "eval_loss": 1.932852864265442, + "eval_runtime": 461.8912, + "eval_samples_per_second": 866.005, + "eval_steps_per_second": 54.125, "step": 1496000 }, { - "epoch": 1.64, + "epoch": 3.17, "learning_rate": 1.5306666666666666e-07, - "loss": 2.1178, + "loss": 2.0774, "step": 1504000 }, { - "epoch": 1.64, - "eval_loss": 2.0059070587158203, - "eval_runtime": 908.356, - "eval_samples_per_second": 849.689, - "eval_steps_per_second": 53.106, + "epoch": 3.17, + "eval_loss": 1.933501124382019, + "eval_runtime": 461.263, + "eval_samples_per_second": 867.184, + "eval_steps_per_second": 54.199, "step": 1504000 }, { - "epoch": 1.65, - "eval_loss": 2.00836443901062, - "eval_runtime": 908.0246, - "eval_samples_per_second": 849.999, - "eval_steps_per_second": 53.125, + "epoch": 3.18, + "eval_loss": 1.9341800212860107, + "eval_runtime": 462.3299, + "eval_samples_per_second": 865.183, + "eval_steps_per_second": 54.074, "step": 1512000 }, { - "epoch": 1.66, + "epoch": 3.2, "learning_rate": 1.5033333333333332e-07, - "loss": 2.1154, + "loss": 2.0793, "step": 1520000 }, { - "epoch": 1.66, - "eval_loss": 2.0105550289154053, - "eval_runtime": 903.6608, - "eval_samples_per_second": 854.104, - "eval_steps_per_second": 53.382, + "epoch": 3.2, + "eval_loss": 1.9325579404830933, + "eval_runtime": 462.9892, + "eval_samples_per_second": 863.951, + "eval_steps_per_second": 53.997, "step": 1520000 }, { - "epoch": 1.67, - "eval_loss": 2.0116729736328125, - "eval_runtime": 909.1515, - "eval_samples_per_second": 848.945, - "eval_steps_per_second": 53.059, + "epoch": 3.22, + "eval_loss": 1.9313201904296875, + "eval_runtime": 462.2771, + "eval_samples_per_second": 865.282, + "eval_steps_per_second": 54.08, "step": 1528000 }, { - "epoch": 1.68, + "epoch": 3.23, "learning_rate": 1.476e-07, - "loss": 2.1214, + "loss": 2.0834, "step": 1536000 }, { - "epoch": 1.68, - "eval_loss": 2.006955146789551, - "eval_runtime": 907.2355, - "eval_samples_per_second": 850.738, - "eval_steps_per_second": 53.171, + "epoch": 3.23, + "eval_loss": 1.9301501512527466, + "eval_runtime": 461.1678, + "eval_samples_per_second": 867.363, + "eval_steps_per_second": 54.21, "step": 1536000 }, { - "epoch": 1.68, - "eval_loss": 2.0078775882720947, - "eval_runtime": 908.5609, - "eval_samples_per_second": 849.497, - "eval_steps_per_second": 53.094, + "epoch": 3.25, + "eval_loss": 1.9299404621124268, + "eval_runtime": 463.2734, + "eval_samples_per_second": 863.421, + "eval_steps_per_second": 53.964, "step": 1544000 }, { - "epoch": 1.69, + "epoch": 3.27, "learning_rate": 1.4486666666666665e-07, - "loss": 2.1175, + "loss": 2.0698, "step": 1552000 }, { - "epoch": 1.69, - "eval_loss": 2.0101876258850098, - "eval_runtime": 901.3076, - "eval_samples_per_second": 856.334, - "eval_steps_per_second": 53.521, + "epoch": 3.27, + "eval_loss": 1.9287976026535034, + "eval_runtime": 462.9712, + "eval_samples_per_second": 863.985, + "eval_steps_per_second": 53.999, "step": 1552000 }, { - "epoch": 1.7, - "eval_loss": 2.009697675704956, - "eval_runtime": 906.1011, - "eval_samples_per_second": 851.803, - "eval_steps_per_second": 53.238, + "epoch": 3.28, + "eval_loss": 1.931135892868042, + "eval_runtime": 463.0622, + "eval_samples_per_second": 863.815, + "eval_steps_per_second": 53.988, "step": 1560000 }, { - "epoch": 1.71, + "epoch": 3.3, "learning_rate": 1.4213333333333334e-07, - "loss": 2.1206, + "loss": 2.0721, "step": 1568000 }, { - "epoch": 1.71, - "eval_loss": 2.0092358589172363, - "eval_runtime": 901.2376, - "eval_samples_per_second": 856.4, - "eval_steps_per_second": 53.525, + "epoch": 3.3, + "eval_loss": 1.926220178604126, + "eval_runtime": 461.1578, + "eval_samples_per_second": 867.382, + "eval_steps_per_second": 54.211, "step": 1568000 }, { - "epoch": 1.72, - "eval_loss": 2.005527973175049, - "eval_runtime": 896.3075, - "eval_samples_per_second": 861.111, - "eval_steps_per_second": 53.82, + "epoch": 3.32, + "eval_loss": 1.9320148229599, + "eval_runtime": 460.1501, + "eval_samples_per_second": 869.282, + "eval_steps_per_second": 54.33, "step": 1576000 }, { - "epoch": 1.73, + "epoch": 3.33, "learning_rate": 1.3940000000000002e-07, - "loss": 2.1302, + "loss": 2.0742, "step": 1584000 }, { - "epoch": 1.73, - "eval_loss": 2.008502244949341, - "eval_runtime": 899.3251, - "eval_samples_per_second": 858.221, - "eval_steps_per_second": 53.639, + "epoch": 3.33, + "eval_loss": 1.927839994430542, + "eval_runtime": 463.0138, + "eval_samples_per_second": 863.905, + "eval_steps_per_second": 53.994, "step": 1584000 }, { - "epoch": 1.74, - "eval_loss": 2.0109806060791016, - "eval_runtime": 906.7205, - "eval_samples_per_second": 851.222, - "eval_steps_per_second": 53.202, + "epoch": 3.35, + "eval_loss": 1.9333292245864868, + "eval_runtime": 461.8845, + "eval_samples_per_second": 866.017, + "eval_steps_per_second": 54.126, "step": 1592000 }, { - "epoch": 1.75, + "epoch": 3.37, "learning_rate": 1.3666666666666665e-07, - "loss": 2.1177, + "loss": 2.0774, "step": 1600000 }, { - "epoch": 1.75, - "eval_loss": 2.006521701812744, - "eval_runtime": 898.4764, - "eval_samples_per_second": 859.032, - "eval_steps_per_second": 53.69, + "epoch": 3.37, + "eval_loss": 1.9251993894577026, + "eval_runtime": 464.4513, + "eval_samples_per_second": 861.231, + "eval_steps_per_second": 53.827, "step": 1600000 }, { - "epoch": 1.75, - "eval_loss": 2.0131704807281494, - "eval_runtime": 906.0839, - "eval_samples_per_second": 851.82, - "eval_steps_per_second": 53.239, + "epoch": 3.39, + "eval_loss": 1.930081844329834, + "eval_runtime": 465.4239, + "eval_samples_per_second": 859.432, + "eval_steps_per_second": 53.714, "step": 1608000 }, { - "epoch": 1.76, + "epoch": 3.4, "learning_rate": 1.3393333333333333e-07, - "loss": 2.1101, + "loss": 2.0766, "step": 1616000 }, { - "epoch": 1.76, - "eval_loss": 2.0085511207580566, - "eval_runtime": 896.2709, - "eval_samples_per_second": 861.146, - "eval_steps_per_second": 53.822, + "epoch": 3.4, + "eval_loss": 1.934423804283142, + "eval_runtime": 468.7051, + "eval_samples_per_second": 853.415, + "eval_steps_per_second": 53.338, "step": 1616000 }, { - "epoch": 1.77, - "eval_loss": 2.0077245235443115, - "eval_runtime": 897.3988, - "eval_samples_per_second": 860.064, - "eval_steps_per_second": 53.754, + "epoch": 3.42, + "eval_loss": 1.9320313930511475, + "eval_runtime": 464.1689, + "eval_samples_per_second": 861.755, + "eval_steps_per_second": 53.86, "step": 1624000 }, { - "epoch": 1.78, + "epoch": 3.44, "learning_rate": 1.312e-07, - "loss": 2.1194, + "loss": 2.0702, "step": 1632000 }, { - "epoch": 1.78, - "eval_loss": 2.008148431777954, - "eval_runtime": 896.5575, - "eval_samples_per_second": 860.871, - "eval_steps_per_second": 53.805, + "epoch": 3.44, + "eval_loss": 1.9307453632354736, + "eval_runtime": 464.689, + "eval_samples_per_second": 860.791, + "eval_steps_per_second": 53.799, "step": 1632000 }, { - "epoch": 1.79, - "eval_loss": 2.008798122406006, - "eval_runtime": 897.2787, - "eval_samples_per_second": 860.179, - "eval_steps_per_second": 53.761, + "epoch": 3.45, + "eval_loss": 1.9304145574569702, + "eval_runtime": 463.7779, + "eval_samples_per_second": 862.482, + "eval_steps_per_second": 53.905, "step": 1640000 }, { - "epoch": 1.8, + "epoch": 3.47, "learning_rate": 1.2846666666666667e-07, - "loss": 2.1167, + "loss": 2.0772, "step": 1648000 }, { - "epoch": 1.8, - "eval_loss": 2.002239942550659, - "eval_runtime": 893.5655, - "eval_samples_per_second": 863.753, - "eval_steps_per_second": 53.985, + "epoch": 3.47, + "eval_loss": 1.9280321598052979, + "eval_runtime": 465.1445, + "eval_samples_per_second": 859.948, + "eval_steps_per_second": 53.747, "step": 1648000 }, { - "epoch": 1.81, - "eval_loss": 2.007662296295166, - "eval_runtime": 895.7141, - "eval_samples_per_second": 861.681, - "eval_steps_per_second": 53.855, + "epoch": 3.49, + "eval_loss": 1.9324473142623901, + "eval_runtime": 466.7608, + "eval_samples_per_second": 856.97, + "eval_steps_per_second": 53.561, "step": 1656000 }, { - "epoch": 1.82, + "epoch": 3.5, "learning_rate": 1.2573333333333332e-07, - "loss": 2.1083, + "loss": 2.0757, "step": 1664000 }, { - "epoch": 1.82, - "eval_loss": 2.0065953731536865, - "eval_runtime": 890.9713, - "eval_samples_per_second": 866.268, - "eval_steps_per_second": 54.142, + "epoch": 3.5, + "eval_loss": 1.9342966079711914, + "eval_runtime": 466.6862, + "eval_samples_per_second": 857.107, + "eval_steps_per_second": 53.569, "step": 1664000 }, { - "epoch": 1.82, - "eval_loss": 2.0137040615081787, - "eval_runtime": 885.7627, - "eval_samples_per_second": 871.362, - "eval_steps_per_second": 54.46, + "epoch": 3.52, + "eval_loss": 1.9311579465866089, + "eval_runtime": 464.2418, + "eval_samples_per_second": 861.62, + "eval_steps_per_second": 53.851, "step": 1672000 }, { - "epoch": 1.83, + "epoch": 3.54, "learning_rate": 1.23e-07, - "loss": 2.1232, + "loss": 2.0747, "step": 1680000 }, { - "epoch": 1.83, - "eval_loss": 2.0067014694213867, - "eval_runtime": 890.51, - "eval_samples_per_second": 866.717, - "eval_steps_per_second": 54.17, + "epoch": 3.54, + "eval_loss": 1.9303609132766724, + "eval_runtime": 463.1295, + "eval_samples_per_second": 863.689, + "eval_steps_per_second": 53.981, "step": 1680000 }, { - "epoch": 1.84, - "eval_loss": 2.0039150714874268, - "eval_runtime": 889.3586, - "eval_samples_per_second": 867.839, - "eval_steps_per_second": 54.24, + "epoch": 3.55, + "eval_loss": 1.9359545707702637, + "eval_runtime": 463.1345, + "eval_samples_per_second": 863.68, + "eval_steps_per_second": 53.98, "step": 1688000 }, { - "epoch": 1.85, + "epoch": 3.57, "learning_rate": 1.2026666666666666e-07, - "loss": 2.1212, + "loss": 2.068, "step": 1696000 }, { - "epoch": 1.85, - "eval_loss": 2.008970022201538, - "eval_runtime": 893.785, - "eval_samples_per_second": 863.541, - "eval_steps_per_second": 53.972, + "epoch": 3.57, + "eval_loss": 1.9296965599060059, + "eval_runtime": 463.742, + "eval_samples_per_second": 862.548, + "eval_steps_per_second": 53.909, "step": 1696000 }, { - "epoch": 1.86, - "eval_loss": 2.0079498291015625, - "eval_runtime": 882.5613, - "eval_samples_per_second": 874.523, - "eval_steps_per_second": 54.658, + "epoch": 3.59, + "eval_loss": 1.9337373971939087, + "eval_runtime": 466.0229, + "eval_samples_per_second": 858.327, + "eval_steps_per_second": 53.645, "step": 1704000 }, { - "epoch": 1.87, + "epoch": 3.6, "learning_rate": 1.1753333333333334e-07, - "loss": 2.1246, + "loss": 2.0825, "step": 1712000 }, { - "epoch": 1.87, - "eval_loss": 2.0082814693450928, - "eval_runtime": 886.133, - "eval_samples_per_second": 870.998, - "eval_steps_per_second": 54.438, + "epoch": 3.6, + "eval_loss": 1.9293195009231567, + "eval_runtime": 468.6282, + "eval_samples_per_second": 853.555, + "eval_steps_per_second": 53.347, "step": 1712000 }, { - "epoch": 1.88, - "eval_loss": 2.003898859024048, - "eval_runtime": 887.1853, - "eval_samples_per_second": 869.965, - "eval_steps_per_second": 54.373, + "epoch": 3.62, + "eval_loss": 1.929545283317566, + "eval_runtime": 467.7047, + "eval_samples_per_second": 855.24, + "eval_steps_per_second": 53.453, "step": 1720000 }, { - "epoch": 1.89, + "epoch": 3.64, "learning_rate": 1.1480000000000001e-07, - "loss": 2.1129, + "loss": 2.0811, "step": 1728000 }, { - "epoch": 1.89, - "eval_loss": 2.0069074630737305, - "eval_runtime": 891.3907, - "eval_samples_per_second": 865.86, - "eval_steps_per_second": 54.117, + "epoch": 3.64, + "eval_loss": 1.9315083026885986, + "eval_runtime": 464.4272, + "eval_samples_per_second": 861.276, + "eval_steps_per_second": 53.83, "step": 1728000 }, { - "epoch": 1.89, - "eval_loss": 2.007922410964966, - "eval_runtime": 884.1175, - "eval_samples_per_second": 872.984, - "eval_steps_per_second": 54.562, + "epoch": 3.65, + "eval_loss": 1.9279147386550903, + "eval_runtime": 464.6795, + "eval_samples_per_second": 860.808, + "eval_steps_per_second": 53.801, "step": 1736000 }, { - "epoch": 1.9, + "epoch": 3.67, "learning_rate": 1.1206666666666666e-07, - "loss": 2.1209, + "loss": 2.0844, "step": 1744000 }, { - "epoch": 1.9, - "eval_loss": 2.00584077835083, - "eval_runtime": 888.6359, - "eval_samples_per_second": 868.545, - "eval_steps_per_second": 54.284, + "epoch": 3.67, + "eval_loss": 1.9289209842681885, + "eval_runtime": 466.6058, + "eval_samples_per_second": 857.255, + "eval_steps_per_second": 53.578, "step": 1744000 }, { - "epoch": 1.91, - "eval_loss": 2.0071957111358643, - "eval_runtime": 891.8674, - "eval_samples_per_second": 865.398, - "eval_steps_per_second": 54.088, + "epoch": 3.69, + "eval_loss": 1.9279465675354004, + "eval_runtime": 464.5589, + "eval_samples_per_second": 861.032, + "eval_steps_per_second": 53.814, "step": 1752000 }, { - "epoch": 1.92, + "epoch": 3.71, "learning_rate": 1.0933333333333333e-07, - "loss": 2.1209, + "loss": 2.0827, "step": 1760000 }, { - "epoch": 1.92, - "eval_loss": 2.0067615509033203, - "eval_runtime": 884.8141, - "eval_samples_per_second": 872.296, - "eval_steps_per_second": 54.519, + "epoch": 3.71, + "eval_loss": 1.9282883405685425, + "eval_runtime": 467.4378, + "eval_samples_per_second": 855.729, + "eval_steps_per_second": 53.483, "step": 1760000 }, { - "epoch": 1.93, - "eval_loss": 2.0078628063201904, - "eval_runtime": 888.3025, - "eval_samples_per_second": 868.871, - "eval_steps_per_second": 54.305, + "epoch": 3.72, + "eval_loss": 1.9295260906219482, + "eval_runtime": 466.5213, + "eval_samples_per_second": 857.41, + "eval_steps_per_second": 53.588, "step": 1768000 }, { - "epoch": 1.94, + "epoch": 3.74, "learning_rate": 1.066e-07, - "loss": 2.1184, + "loss": 2.0684, "step": 1776000 }, { - "epoch": 1.94, - "eval_loss": 2.0036442279815674, - "eval_runtime": 887.5766, - "eval_samples_per_second": 869.581, - "eval_steps_per_second": 54.349, + "epoch": 3.74, + "eval_loss": 1.9280706644058228, + "eval_runtime": 468.3975, + "eval_samples_per_second": 853.975, + "eval_steps_per_second": 53.373, "step": 1776000 }, { - "epoch": 1.95, - "eval_loss": 2.0064985752105713, - "eval_runtime": 890.3705, - "eval_samples_per_second": 866.853, - "eval_steps_per_second": 54.179, + "epoch": 3.76, + "eval_loss": 1.9329652786254883, + "eval_runtime": 467.0852, + "eval_samples_per_second": 856.375, + "eval_steps_per_second": 53.523, "step": 1784000 }, { - "epoch": 1.96, + "epoch": 3.77, "learning_rate": 1.0386666666666667e-07, - "loss": 2.1065, + "loss": 2.0724, "step": 1792000 }, { - "epoch": 1.96, - "eval_loss": 2.007737159729004, - "eval_runtime": 889.1985, - "eval_samples_per_second": 867.995, - "eval_steps_per_second": 54.25, + "epoch": 3.77, + "eval_loss": 1.9293663501739502, + "eval_runtime": 466.1999, + "eval_samples_per_second": 858.001, + "eval_steps_per_second": 53.625, "step": 1792000 }, { - "epoch": 1.96, - "eval_loss": 2.006197452545166, - "eval_runtime": 889.8901, - "eval_samples_per_second": 867.321, - "eval_steps_per_second": 54.208, + "epoch": 3.79, + "eval_loss": 1.9276474714279175, + "eval_runtime": 466.9185, + "eval_samples_per_second": 856.681, + "eval_steps_per_second": 53.543, "step": 1800000 }, { - "epoch": 1.97, + "epoch": 3.81, "learning_rate": 1.0113333333333334e-07, - "loss": 2.109, + "loss": 2.074, "step": 1808000 }, { - "epoch": 1.97, - "eval_loss": 2.0090434551239014, - "eval_runtime": 888.3297, - "eval_samples_per_second": 868.844, - "eval_steps_per_second": 54.303, + "epoch": 3.81, + "eval_loss": 1.9226585626602173, + "eval_runtime": 466.1573, + "eval_samples_per_second": 858.079, + "eval_steps_per_second": 53.63, "step": 1808000 }, { - "epoch": 1.98, - "eval_loss": 2.012356758117676, - "eval_runtime": 893.3256, - "eval_samples_per_second": 863.985, - "eval_steps_per_second": 53.999, + "epoch": 3.82, + "eval_loss": 1.931994080543518, + "eval_runtime": 468.1658, + "eval_samples_per_second": 854.398, + "eval_steps_per_second": 53.4, "step": 1816000 }, { - "epoch": 1.99, + "epoch": 3.84, "learning_rate": 9.84e-08, - "loss": 2.1081, + "loss": 2.0801, "step": 1824000 }, { - "epoch": 1.99, - "eval_loss": 2.0065596103668213, - "eval_runtime": 893.6122, - "eval_samples_per_second": 863.708, - "eval_steps_per_second": 53.982, + "epoch": 3.84, + "eval_loss": 1.927461862564087, + "eval_runtime": 468.6252, + "eval_samples_per_second": 853.561, + "eval_steps_per_second": 53.348, "step": 1824000 }, { - "epoch": 2.0, - "eval_loss": 2.008080005645752, - "eval_runtime": 891.4247, - "eval_samples_per_second": 865.828, - "eval_steps_per_second": 54.115, + "epoch": 3.86, + "eval_loss": 1.9301567077636719, + "eval_runtime": 468.5364, + "eval_samples_per_second": 853.722, + "eval_steps_per_second": 53.358, "step": 1832000 }, { - "epoch": 2.01, + "epoch": 3.87, "learning_rate": 9.566666666666666e-08, - "loss": 2.1151, + "loss": 2.0783, "step": 1840000 }, { - "epoch": 2.01, - "eval_loss": 2.008512258529663, - "eval_runtime": 884.9554, - "eval_samples_per_second": 872.157, - "eval_steps_per_second": 54.51, + "epoch": 3.87, + "eval_loss": 1.9332624673843384, + "eval_runtime": 468.1279, + "eval_samples_per_second": 854.467, + "eval_steps_per_second": 53.404, "step": 1840000 }, { - "epoch": 2.02, - "eval_loss": 2.0054173469543457, - "eval_runtime": 886.9049, - "eval_samples_per_second": 870.24, - "eval_steps_per_second": 54.39, + "epoch": 3.89, + "eval_loss": 1.9296493530273438, + "eval_runtime": 468.9713, + "eval_samples_per_second": 852.931, + "eval_steps_per_second": 53.308, "step": 1848000 }, { - "epoch": 2.03, + "epoch": 3.91, "learning_rate": 9.293333333333333e-08, - "loss": 2.1178, + "loss": 2.0787, "step": 1856000 }, { - "epoch": 2.03, - "eval_loss": 2.005777359008789, - "eval_runtime": 886.5315, - "eval_samples_per_second": 870.606, - "eval_steps_per_second": 54.413, + "epoch": 3.91, + "eval_loss": 1.9301527738571167, + "eval_runtime": 465.9133, + "eval_samples_per_second": 858.529, + "eval_steps_per_second": 53.658, "step": 1856000 }, { - "epoch": 2.03, - "eval_loss": 2.0048415660858154, - "eval_runtime": 893.5519, - "eval_samples_per_second": 863.766, - "eval_steps_per_second": 53.986, + "epoch": 3.92, + "eval_loss": 1.9347186088562012, + "eval_runtime": 467.3282, + "eval_samples_per_second": 855.93, + "eval_steps_per_second": 53.496, "step": 1864000 }, { - "epoch": 2.04, + "epoch": 3.94, "learning_rate": 9.02e-08, - "loss": 2.1035, + "loss": 2.0733, "step": 1872000 }, { - "epoch": 2.04, - "eval_loss": 2.004007339477539, - "eval_runtime": 890.5358, - "eval_samples_per_second": 866.692, - "eval_steps_per_second": 54.169, + "epoch": 3.94, + "eval_loss": 1.9297592639923096, + "eval_runtime": 467.0743, + "eval_samples_per_second": 856.395, + "eval_steps_per_second": 53.525, "step": 1872000 }, { - "epoch": 2.05, - "eval_loss": 2.0059244632720947, - "eval_runtime": 887.0437, - "eval_samples_per_second": 870.104, - "eval_steps_per_second": 54.382, + "epoch": 3.96, + "eval_loss": 1.9302401542663574, + "eval_runtime": 469.1433, + "eval_samples_per_second": 852.618, + "eval_steps_per_second": 53.289, "step": 1880000 }, { - "epoch": 2.06, + "epoch": 3.97, "learning_rate": 8.746666666666667e-08, - "loss": 2.1197, + "loss": 2.0742, "step": 1888000 }, { - "epoch": 2.06, - "eval_loss": 2.0071017742156982, - "eval_runtime": 889.191, - "eval_samples_per_second": 868.003, - "eval_steps_per_second": 54.25, + "epoch": 3.97, + "eval_loss": 1.9278994798660278, + "eval_runtime": 468.7857, + "eval_samples_per_second": 853.268, + "eval_steps_per_second": 53.329, "step": 1888000 }, { - "epoch": 2.07, - "eval_loss": 2.005682945251465, - "eval_runtime": 888.8818, - "eval_samples_per_second": 868.304, - "eval_steps_per_second": 54.269, + "epoch": 3.99, + "eval_loss": 1.9257820844650269, + "eval_runtime": 467.4664, + "eval_samples_per_second": 855.676, + "eval_steps_per_second": 53.48, "step": 1896000 }, { - "epoch": 2.08, + "epoch": 4.01, "learning_rate": 8.473333333333334e-08, - "loss": 2.1143, + "loss": 2.0769, "step": 1904000 }, { - "epoch": 2.08, - "eval_loss": 2.005943536758423, - "eval_runtime": 884.5437, - "eval_samples_per_second": 872.563, - "eval_steps_per_second": 54.535, + "epoch": 4.01, + "eval_loss": 1.9254648685455322, + "eval_runtime": 469.7925, + "eval_samples_per_second": 851.44, + "eval_steps_per_second": 53.215, "step": 1904000 }, { - "epoch": 2.09, - "eval_loss": 2.0042991638183594, - "eval_runtime": 884.1715, - "eval_samples_per_second": 872.93, - "eval_steps_per_second": 54.558, + "epoch": 4.03, + "eval_loss": 1.9282057285308838, + "eval_runtime": 467.1165, + "eval_samples_per_second": 856.317, + "eval_steps_per_second": 53.52, "step": 1912000 }, { - "epoch": 2.09, + "epoch": 4.04, "learning_rate": 8.2e-08, - "loss": 2.1082, + "loss": 2.0736, "step": 1920000 }, { - "epoch": 2.09, - "eval_loss": 2.0067648887634277, - "eval_runtime": 885.4828, - "eval_samples_per_second": 871.637, - "eval_steps_per_second": 54.478, + "epoch": 4.04, + "eval_loss": 1.9297821521759033, + "eval_runtime": 467.8574, + "eval_samples_per_second": 854.961, + "eval_steps_per_second": 53.435, "step": 1920000 }, { - "epoch": 2.1, - "eval_loss": 2.0057313442230225, - "eval_runtime": 887.8665, - "eval_samples_per_second": 869.297, - "eval_steps_per_second": 54.331, + "epoch": 4.06, + "eval_loss": 1.9324908256530762, + "eval_runtime": 469.8593, + "eval_samples_per_second": 851.319, + "eval_steps_per_second": 53.207, "step": 1928000 }, { - "epoch": 2.11, + "epoch": 4.08, "learning_rate": 7.926666666666666e-08, - "loss": 2.1202, + "loss": 2.0713, "step": 1936000 }, { - "epoch": 2.11, - "eval_loss": 2.007241725921631, - "eval_runtime": 885.5971, - "eval_samples_per_second": 871.525, - "eval_steps_per_second": 54.471, + "epoch": 4.08, + "eval_loss": 1.929565668106079, + "eval_runtime": 473.7904, + "eval_samples_per_second": 844.255, + "eval_steps_per_second": 52.766, "step": 1936000 }, { - "epoch": 2.12, - "eval_loss": 2.0057430267333984, - "eval_runtime": 888.4045, - "eval_samples_per_second": 868.771, - "eval_steps_per_second": 54.298, + "epoch": 4.09, + "eval_loss": 1.9292526245117188, + "eval_runtime": 474.7388, + "eval_samples_per_second": 842.569, + "eval_steps_per_second": 52.661, "step": 1944000 }, { - "epoch": 2.13, + "epoch": 4.11, "learning_rate": 7.653333333333333e-08, - "loss": 2.1138, + "loss": 2.0825, "step": 1952000 }, { - "epoch": 2.13, - "eval_loss": 2.0051097869873047, - "eval_runtime": 889.7536, - "eval_samples_per_second": 867.454, - "eval_steps_per_second": 54.216, + "epoch": 4.11, + "eval_loss": 1.9344995021820068, + "eval_runtime": 474.9247, + "eval_samples_per_second": 842.239, + "eval_steps_per_second": 52.64, "step": 1952000 }, { - "epoch": 2.14, - "eval_loss": 2.008528709411621, - "eval_runtime": 887.8548, - "eval_samples_per_second": 869.309, - "eval_steps_per_second": 54.332, + "epoch": 4.13, + "eval_loss": 1.934618592262268, + "eval_runtime": 472.2426, + "eval_samples_per_second": 847.022, + "eval_steps_per_second": 52.939, "step": 1960000 }, { - "epoch": 2.15, + "epoch": 4.14, "learning_rate": 7.38e-08, - "loss": 2.1082, + "loss": 2.0828, "step": 1968000 }, { - "epoch": 2.15, - "eval_loss": 2.007629871368408, - "eval_runtime": 886.2101, - "eval_samples_per_second": 870.922, - "eval_steps_per_second": 54.433, + "epoch": 4.14, + "eval_loss": 1.9310798645019531, + "eval_runtime": 468.8102, + "eval_samples_per_second": 853.224, + "eval_steps_per_second": 53.326, "step": 1968000 }, { - "epoch": 2.16, - "eval_loss": 2.0076658725738525, - "eval_runtime": 886.4111, - "eval_samples_per_second": 870.725, - "eval_steps_per_second": 54.421, + "epoch": 4.16, + "eval_loss": 1.9307339191436768, + "eval_runtime": 470.4345, + "eval_samples_per_second": 850.278, + "eval_steps_per_second": 53.142, "step": 1976000 }, { - "epoch": 2.16, + "epoch": 4.18, "learning_rate": 7.106666666666667e-08, - "loss": 2.1084, + "loss": 2.0821, "step": 1984000 }, { - "epoch": 2.16, - "eval_loss": 2.001997470855713, - "eval_runtime": 885.1567, - "eval_samples_per_second": 871.959, - "eval_steps_per_second": 54.498, + "epoch": 4.18, + "eval_loss": 1.9335579872131348, + "eval_runtime": 469.9157, + "eval_samples_per_second": 851.216, + "eval_steps_per_second": 53.201, "step": 1984000 }, { - "epoch": 2.17, - "eval_loss": 2.005009651184082, - "eval_runtime": 889.5629, - "eval_samples_per_second": 867.64, - "eval_steps_per_second": 54.228, + "epoch": 4.19, + "eval_loss": 1.9265415668487549, + "eval_runtime": 469.8042, + "eval_samples_per_second": 851.418, + "eval_steps_per_second": 53.214, "step": 1992000 }, { - "epoch": 2.18, + "epoch": 4.21, "learning_rate": 6.833333333333332e-08, - "loss": 2.1151, + "loss": 2.0768, "step": 2000000 }, { - "epoch": 2.18, - "eval_loss": 2.0065817832946777, - "eval_runtime": 885.7641, - "eval_samples_per_second": 871.361, - "eval_steps_per_second": 54.46, + "epoch": 4.21, + "eval_loss": 1.9284056425094604, + "eval_runtime": 473.143, + "eval_samples_per_second": 845.41, + "eval_steps_per_second": 52.838, "step": 2000000 }, { - "epoch": 2.19, - "eval_loss": 2.003136396408081, - "eval_runtime": 886.578, - "eval_samples_per_second": 870.561, - "eval_steps_per_second": 54.41, + "epoch": 4.23, + "eval_loss": 1.9290404319763184, + "eval_runtime": 475.6044, + "eval_samples_per_second": 841.035, + "eval_steps_per_second": 52.565, "step": 2008000 }, { - "epoch": 2.2, + "epoch": 4.24, "learning_rate": 6.56e-08, - "loss": 2.1141, + "loss": 2.0695, "step": 2016000 }, { - "epoch": 2.2, - "eval_loss": 2.0128238201141357, - "eval_runtime": 891.0219, - "eval_samples_per_second": 866.219, - "eval_steps_per_second": 54.139, + "epoch": 4.24, + "eval_loss": 1.9305514097213745, + "eval_runtime": 470.7659, + "eval_samples_per_second": 849.679, + "eval_steps_per_second": 53.105, "step": 2016000 }, { - "epoch": 2.21, - "eval_loss": 2.0021839141845703, - "eval_runtime": 895.8435, - "eval_samples_per_second": 861.557, - "eval_steps_per_second": 53.848, + "epoch": 4.26, + "eval_loss": 1.9299392700195312, + "eval_runtime": 474.6989, + "eval_samples_per_second": 842.639, + "eval_steps_per_second": 52.665, "step": 2024000 }, { - "epoch": 2.22, + "epoch": 4.28, "learning_rate": 6.286666666666666e-08, - "loss": 2.1129, + "loss": 2.0698, "step": 2032000 }, { - "epoch": 2.22, - "eval_loss": 2.0065131187438965, - "eval_runtime": 890.2528, - "eval_samples_per_second": 866.967, - "eval_steps_per_second": 54.186, + "epoch": 4.28, + "eval_loss": 1.9230471849441528, + "eval_runtime": 473.1742, + "eval_samples_per_second": 845.355, + "eval_steps_per_second": 52.835, "step": 2032000 }, { - "epoch": 2.23, - "eval_loss": 2.005363941192627, - "eval_runtime": 890.9681, - "eval_samples_per_second": 866.271, - "eval_steps_per_second": 54.142, + "epoch": 4.29, + "eval_loss": 1.9271833896636963, + "eval_runtime": 471.6682, + "eval_samples_per_second": 848.054, + "eval_steps_per_second": 53.003, "step": 2040000 }, { - "epoch": 2.23, + "epoch": 4.31, "learning_rate": 6.013333333333333e-08, - "loss": 2.1164, + "loss": 2.0776, "step": 2048000 }, { - "epoch": 2.23, - "eval_loss": 2.0038933753967285, - "eval_runtime": 892.3995, - "eval_samples_per_second": 864.882, - "eval_steps_per_second": 54.055, + "epoch": 4.31, + "eval_loss": 1.9306389093399048, + "eval_runtime": 472.8901, + "eval_samples_per_second": 845.863, + "eval_steps_per_second": 52.866, "step": 2048000 }, { - "epoch": 2.24, - "eval_loss": 2.003117561340332, - "eval_runtime": 894.495, - "eval_samples_per_second": 862.856, - "eval_steps_per_second": 53.929, + "epoch": 4.33, + "eval_loss": 1.9242680072784424, + "eval_runtime": 472.2094, + "eval_samples_per_second": 847.082, + "eval_steps_per_second": 52.943, "step": 2056000 }, { - "epoch": 2.25, + "epoch": 4.35, "learning_rate": 5.7400000000000004e-08, - "loss": 2.1121, + "loss": 2.0797, "step": 2064000 }, { - "epoch": 2.25, - "eval_loss": 2.0101029872894287, - "eval_runtime": 886.6646, - "eval_samples_per_second": 870.476, - "eval_steps_per_second": 54.405, + "epoch": 4.35, + "eval_loss": 1.9265525341033936, + "eval_runtime": 472.6552, + "eval_samples_per_second": 846.283, + "eval_steps_per_second": 52.893, "step": 2064000 }, { - "epoch": 2.26, - "eval_loss": 2.0098650455474854, - "eval_runtime": 887.3882, - "eval_samples_per_second": 869.766, - "eval_steps_per_second": 54.361, + "epoch": 4.36, + "eval_loss": 1.9248952865600586, + "eval_runtime": 470.1754, + "eval_samples_per_second": 850.746, + "eval_steps_per_second": 53.172, "step": 2072000 }, { - "epoch": 2.27, + "epoch": 4.38, "learning_rate": 5.4666666666666666e-08, - "loss": 2.1071, + "loss": 2.0808, "step": 2080000 }, { - "epoch": 2.27, - "eval_loss": 2.0041701793670654, - "eval_runtime": 891.5578, - "eval_samples_per_second": 865.698, - "eval_steps_per_second": 54.106, + "epoch": 4.38, + "eval_loss": 1.927932858467102, + "eval_runtime": 471.7256, + "eval_samples_per_second": 847.951, + "eval_steps_per_second": 52.997, "step": 2080000 }, { - "epoch": 2.28, - "eval_loss": 2.0030367374420166, - "eval_runtime": 886.7055, - "eval_samples_per_second": 870.436, - "eval_steps_per_second": 54.403, + "epoch": 4.4, + "eval_loss": 1.926186203956604, + "eval_runtime": 472.0307, + "eval_samples_per_second": 847.402, + "eval_steps_per_second": 52.963, "step": 2088000 }, { - "epoch": 2.29, + "epoch": 4.41, "learning_rate": 5.1933333333333335e-08, - "loss": 2.1094, + "loss": 2.0776, "step": 2096000 }, { - "epoch": 2.29, - "eval_loss": 2.00482439994812, - "eval_runtime": 887.8886, - "eval_samples_per_second": 869.276, - "eval_steps_per_second": 54.33, + "epoch": 4.41, + "eval_loss": 1.9350157976150513, + "eval_runtime": 468.711, + "eval_samples_per_second": 853.404, + "eval_steps_per_second": 53.338, "step": 2096000 }, { - "epoch": 2.3, - "eval_loss": 2.004595994949341, - "eval_runtime": 887.4455, - "eval_samples_per_second": 869.71, - "eval_steps_per_second": 54.357, + "epoch": 4.43, + "eval_loss": 1.9297165870666504, + "eval_runtime": 470.9356, + "eval_samples_per_second": 849.373, + "eval_steps_per_second": 53.086, "step": 2104000 }, { - "epoch": 2.3, + "epoch": 4.45, "learning_rate": 4.92e-08, - "loss": 2.1017, + "loss": 2.0805, "step": 2112000 }, { - "epoch": 2.3, - "eval_loss": 2.0038633346557617, - "eval_runtime": 888.4121, - "eval_samples_per_second": 868.764, - "eval_steps_per_second": 54.298, + "epoch": 4.45, + "eval_loss": 1.933711051940918, + "eval_runtime": 472.0879, + "eval_samples_per_second": 847.3, + "eval_steps_per_second": 52.956, "step": 2112000 }, { - "epoch": 2.31, - "eval_loss": 2.0011472702026367, - "eval_runtime": 889.7748, - "eval_samples_per_second": 867.433, - "eval_steps_per_second": 54.215, + "epoch": 4.46, + "eval_loss": 1.9301713705062866, + "eval_runtime": 472.6372, + "eval_samples_per_second": 846.315, + "eval_steps_per_second": 52.895, "step": 2120000 }, { - "epoch": 2.32, + "epoch": 4.48, "learning_rate": 4.6466666666666666e-08, - "loss": 2.1124, + "loss": 2.0791, "step": 2128000 }, { - "epoch": 2.32, - "eval_loss": 2.007091522216797, - "eval_runtime": 892.2658, - "eval_samples_per_second": 865.011, - "eval_steps_per_second": 54.063, + "epoch": 4.48, + "eval_loss": 1.9336822032928467, + "eval_runtime": 472.4005, + "eval_samples_per_second": 846.739, + "eval_steps_per_second": 52.921, "step": 2128000 }, { - "epoch": 2.33, - "eval_loss": 2.0060718059539795, - "eval_runtime": 887.502, - "eval_samples_per_second": 869.654, - "eval_steps_per_second": 54.354, + "epoch": 4.5, + "eval_loss": 1.9298382997512817, + "eval_runtime": 470.1621, + "eval_samples_per_second": 850.77, + "eval_steps_per_second": 53.173, "step": 2136000 }, { - "epoch": 2.34, + "epoch": 4.51, "learning_rate": 4.3733333333333335e-08, - "loss": 2.1064, + "loss": 2.0771, "step": 2144000 }, { - "epoch": 2.34, - "eval_loss": 2.0040297508239746, - "eval_runtime": 888.8512, - "eval_samples_per_second": 868.334, - "eval_steps_per_second": 54.271, + "epoch": 4.51, + "eval_loss": 1.9268031120300293, + "eval_runtime": 469.03, + "eval_samples_per_second": 852.824, + "eval_steps_per_second": 53.301, "step": 2144000 }, { - "epoch": 2.35, - "eval_loss": 2.007528066635132, - "eval_runtime": 895.8909, - "eval_samples_per_second": 861.511, - "eval_steps_per_second": 53.845, + "epoch": 4.53, + "eval_loss": 1.9369832277297974, + "eval_runtime": 469.9036, + "eval_samples_per_second": 851.238, + "eval_steps_per_second": 53.202, "step": 2152000 }, { - "epoch": 2.36, + "epoch": 4.55, "learning_rate": 4.1e-08, - "loss": 2.115, + "loss": 2.0807, "step": 2160000 }, { - "epoch": 2.36, - "eval_loss": 2.0025811195373535, - "eval_runtime": 894.6822, - "eval_samples_per_second": 862.675, - "eval_steps_per_second": 53.917, + "epoch": 4.55, + "eval_loss": 1.93067467212677, + "eval_runtime": 471.0137, + "eval_samples_per_second": 849.232, + "eval_steps_per_second": 53.077, "step": 2160000 }, { - "epoch": 2.37, - "eval_loss": 2.006788492202759, - "eval_runtime": 885.9111, - "eval_samples_per_second": 871.216, - "eval_steps_per_second": 54.451, + "epoch": 4.56, + "eval_loss": 1.9291709661483765, + "eval_runtime": 470.7625, + "eval_samples_per_second": 849.685, + "eval_steps_per_second": 53.105, "step": 2168000 }, { - "epoch": 2.37, + "epoch": 4.58, "learning_rate": 3.8266666666666665e-08, - "loss": 2.114, + "loss": 2.0856, "step": 2176000 }, { - "epoch": 2.37, - "eval_loss": 2.006558418273926, - "eval_runtime": 889.8092, - "eval_samples_per_second": 867.399, - "eval_steps_per_second": 54.213, + "epoch": 4.58, + "eval_loss": 1.9300168752670288, + "eval_runtime": 473.6712, + "eval_samples_per_second": 844.468, + "eval_steps_per_second": 52.779, "step": 2176000 }, { - "epoch": 2.38, - "eval_loss": 2.0079538822174072, - "eval_runtime": 889.2248, - "eval_samples_per_second": 867.97, - "eval_steps_per_second": 54.248, + "epoch": 4.6, + "eval_loss": 1.932855486869812, + "eval_runtime": 472.2129, + "eval_samples_per_second": 847.076, + "eval_steps_per_second": 52.942, "step": 2184000 }, { - "epoch": 2.39, + "epoch": 4.61, "learning_rate": 3.5533333333333334e-08, - "loss": 2.1171, + "loss": 2.0744, "step": 2192000 }, { - "epoch": 2.39, - "eval_loss": 2.0031957626342773, - "eval_runtime": 891.062, - "eval_samples_per_second": 866.18, - "eval_steps_per_second": 54.137, + "epoch": 4.61, + "eval_loss": 1.9319262504577637, + "eval_runtime": 471.3448, + "eval_samples_per_second": 848.636, + "eval_steps_per_second": 53.04, "step": 2192000 }, { - "epoch": 2.4, - "eval_loss": 2.0036396980285645, - "eval_runtime": 889.4858, - "eval_samples_per_second": 867.715, - "eval_steps_per_second": 54.232, + "epoch": 4.63, + "eval_loss": 1.9351770877838135, + "eval_runtime": 471.0763, + "eval_samples_per_second": 849.119, + "eval_steps_per_second": 53.07, "step": 2200000 }, { - "epoch": 2.41, + "epoch": 4.65, "learning_rate": 3.28e-08, - "loss": 2.1119, + "loss": 2.0839, "step": 2208000 }, { - "epoch": 2.41, - "eval_loss": 2.004848003387451, - "eval_runtime": 890.2659, - "eval_samples_per_second": 866.954, - "eval_steps_per_second": 54.185, + "epoch": 4.65, + "eval_loss": 1.9368445873260498, + "eval_runtime": 472.7057, + "eval_samples_per_second": 846.193, + "eval_steps_per_second": 52.887, "step": 2208000 }, { - "epoch": 2.42, - "eval_loss": 2.0058629512786865, - "eval_runtime": 890.6135, - "eval_samples_per_second": 866.616, - "eval_steps_per_second": 54.164, + "epoch": 4.67, + "eval_loss": 1.9342936277389526, + "eval_runtime": 470.7852, + "eval_samples_per_second": 849.644, + "eval_steps_per_second": 53.103, "step": 2216000 }, { - "epoch": 2.43, + "epoch": 4.68, "learning_rate": 3.0066666666666665e-08, - "loss": 2.1097, + "loss": 2.0706, "step": 2224000 }, { - "epoch": 2.43, - "eval_loss": 2.005845546722412, - "eval_runtime": 889.9256, - "eval_samples_per_second": 867.286, - "eval_steps_per_second": 54.206, + "epoch": 4.68, + "eval_loss": 1.929012656211853, + "eval_runtime": 471.422, + "eval_samples_per_second": 848.497, + "eval_steps_per_second": 53.031, "step": 2224000 }, { - "epoch": 2.44, - "eval_loss": 2.004934310913086, - "eval_runtime": 893.1468, - "eval_samples_per_second": 864.158, - "eval_steps_per_second": 54.01, + "epoch": 4.7, + "eval_loss": 1.934726357460022, + "eval_runtime": 473.3032, + "eval_samples_per_second": 845.124, + "eval_steps_per_second": 52.82, "step": 2232000 }, { - "epoch": 2.44, + "epoch": 4.72, "learning_rate": 2.7333333333333333e-08, - "loss": 2.1091, + "loss": 2.0745, "step": 2240000 }, { - "epoch": 2.44, - "eval_loss": 2.005760669708252, - "eval_runtime": 893.6832, - "eval_samples_per_second": 863.639, - "eval_steps_per_second": 53.978, + "epoch": 4.72, + "eval_loss": 1.9293938875198364, + "eval_runtime": 473.9686, + "eval_samples_per_second": 843.938, + "eval_steps_per_second": 52.746, "step": 2240000 }, { - "epoch": 2.45, - "eval_loss": 2.0032405853271484, - "eval_runtime": 894.8171, - "eval_samples_per_second": 862.545, - "eval_steps_per_second": 53.909, + "epoch": 4.73, + "eval_loss": 1.9254814386367798, + "eval_runtime": 474.8799, + "eval_samples_per_second": 842.318, + "eval_steps_per_second": 52.645, "step": 2248000 }, { - "epoch": 2.46, + "epoch": 4.75, "learning_rate": 2.46e-08, - "loss": 2.1107, + "loss": 2.0767, "step": 2256000 }, { - "epoch": 2.46, - "eval_loss": 2.00769305229187, - "eval_runtime": 893.4774, - "eval_samples_per_second": 863.838, - "eval_steps_per_second": 53.99, + "epoch": 4.75, + "eval_loss": 1.927069902420044, + "eval_runtime": 471.8653, + "eval_samples_per_second": 847.7, + "eval_steps_per_second": 52.981, "step": 2256000 }, { - "epoch": 2.47, - "eval_loss": 2.0032243728637695, - "eval_runtime": 893.6019, - "eval_samples_per_second": 863.718, - "eval_steps_per_second": 53.983, + "epoch": 4.77, + "eval_loss": 1.9295985698699951, + "eval_runtime": 470.4126, + "eval_samples_per_second": 850.317, + "eval_steps_per_second": 53.145, "step": 2264000 }, { - "epoch": 2.48, + "epoch": 4.78, "learning_rate": 2.1866666666666667e-08, - "loss": 2.1126, + "loss": 2.0753, "step": 2272000 }, { - "epoch": 2.48, - "eval_loss": 2.0055274963378906, - "eval_runtime": 891.7304, - "eval_samples_per_second": 865.531, - "eval_steps_per_second": 54.096, + "epoch": 4.78, + "eval_loss": 1.9268025159835815, + "eval_runtime": 473.0248, + "eval_samples_per_second": 845.622, + "eval_steps_per_second": 52.851, "step": 2272000 }, { - "epoch": 2.49, - "eval_loss": 2.002612590789795, - "eval_runtime": 892.1014, - "eval_samples_per_second": 865.171, - "eval_steps_per_second": 54.073, + "epoch": 4.8, + "eval_loss": 1.9291967153549194, + "eval_runtime": 472.2138, + "eval_samples_per_second": 847.074, + "eval_steps_per_second": 52.942, "step": 2280000 }, { - "epoch": 2.5, + "epoch": 4.82, "learning_rate": 1.9133333333333333e-08, - "loss": 2.1173, + "loss": 2.0716, "step": 2288000 }, { - "epoch": 2.5, - "eval_loss": 2.0062429904937744, - "eval_runtime": 891.9249, - "eval_samples_per_second": 865.342, - "eval_steps_per_second": 54.084, + "epoch": 4.82, + "eval_loss": 1.9310023784637451, + "eval_runtime": 473.6905, + "eval_samples_per_second": 844.433, + "eval_steps_per_second": 52.777, "step": 2288000 }, { - "epoch": 2.51, - "eval_loss": 2.003859043121338, - "eval_runtime": 892.8008, - "eval_samples_per_second": 864.493, - "eval_steps_per_second": 54.031, + "epoch": 4.83, + "eval_loss": 1.9267252683639526, + "eval_runtime": 474.7542, + "eval_samples_per_second": 842.541, + "eval_steps_per_second": 52.659, "step": 2296000 }, { - "epoch": 2.51, + "epoch": 4.85, "learning_rate": 1.64e-08, - "loss": 2.114, + "loss": 2.0778, "step": 2304000 }, { - "epoch": 2.51, - "eval_loss": 2.006359100341797, - "eval_runtime": 891.1547, - "eval_samples_per_second": 866.09, - "eval_steps_per_second": 54.131, + "epoch": 4.85, + "eval_loss": 1.9301416873931885, + "eval_runtime": 476.5948, + "eval_samples_per_second": 839.287, + "eval_steps_per_second": 52.455, "step": 2304000 }, { - "epoch": 2.52, - "eval_loss": 2.0113308429718018, - "eval_runtime": 890.136, - "eval_samples_per_second": 867.081, - "eval_steps_per_second": 54.193, + "epoch": 4.87, + "eval_loss": 1.9280035495758057, + "eval_runtime": 475.4076, + "eval_samples_per_second": 841.383, + "eval_steps_per_second": 52.586, "step": 2312000 }, { - "epoch": 2.53, + "epoch": 4.88, "learning_rate": 1.3666666666666667e-08, - "loss": 2.1131, + "loss": 2.0724, "step": 2320000 }, { - "epoch": 2.53, - "eval_loss": 2.0065314769744873, - "eval_runtime": 890.6924, - "eval_samples_per_second": 866.539, - "eval_steps_per_second": 54.159, + "epoch": 4.88, + "eval_loss": 1.9283393621444702, + "eval_runtime": 475.7915, + "eval_samples_per_second": 840.704, + "eval_steps_per_second": 52.544, "step": 2320000 }, { - "epoch": 2.54, - "eval_loss": 2.0098392963409424, - "eval_runtime": 892.2668, - "eval_samples_per_second": 865.01, - "eval_steps_per_second": 54.063, + "epoch": 4.9, + "eval_loss": 1.9288876056671143, + "eval_runtime": 478.8452, + "eval_samples_per_second": 835.343, + "eval_steps_per_second": 52.209, "step": 2328000 }, { - "epoch": 2.55, + "epoch": 4.92, "learning_rate": 1.0933333333333334e-08, - "loss": 2.1045, + "loss": 2.0811, "step": 2336000 }, { - "epoch": 2.55, - "eval_loss": 2.0060501098632812, - "eval_runtime": 891.9301, - "eval_samples_per_second": 865.337, - "eval_steps_per_second": 54.084, + "epoch": 4.92, + "eval_loss": 1.931495189666748, + "eval_runtime": 475.8623, + "eval_samples_per_second": 840.579, + "eval_steps_per_second": 52.536, "step": 2336000 }, { - "epoch": 2.56, - "eval_loss": 2.006572961807251, - "eval_runtime": 894.7549, - "eval_samples_per_second": 862.605, - "eval_steps_per_second": 53.913, + "epoch": 4.93, + "eval_loss": 1.9268301725387573, + "eval_runtime": 473.6667, + "eval_samples_per_second": 844.476, + "eval_steps_per_second": 52.78, "step": 2344000 }, { - "epoch": 2.57, + "epoch": 4.95, "learning_rate": 8.2e-09, - "loss": 2.1144, + "loss": 2.0816, "step": 2352000 }, { - "epoch": 2.57, - "eval_loss": 2.006028175354004, - "eval_runtime": 899.347, - "eval_samples_per_second": 858.2, - "eval_steps_per_second": 53.638, + "epoch": 4.95, + "eval_loss": 1.9304386377334595, + "eval_runtime": 475.5489, + "eval_samples_per_second": 841.133, + "eval_steps_per_second": 52.571, "step": 2352000 }, { - "epoch": 2.57, - "eval_loss": 2.00589656829834, - "eval_runtime": 893.5452, - "eval_samples_per_second": 863.773, - "eval_steps_per_second": 53.986, + "epoch": 4.97, + "eval_loss": 1.9301530122756958, + "eval_runtime": 475.0873, + "eval_samples_per_second": 841.951, + "eval_steps_per_second": 52.622, "step": 2360000 }, { - "epoch": 2.58, + "epoch": 4.99, "learning_rate": 5.466666666666667e-09, - "loss": 2.1086, + "loss": 2.0775, "step": 2368000 }, { - "epoch": 2.58, - "eval_loss": 2.0038540363311768, - "eval_runtime": 893.2561, - "eval_samples_per_second": 864.052, - "eval_steps_per_second": 54.004, + "epoch": 4.99, + "eval_loss": 1.929231882095337, + "eval_runtime": 473.3176, + "eval_samples_per_second": 845.098, + "eval_steps_per_second": 52.819, "step": 2368000 }, { - "epoch": 2.59, - "eval_loss": 2.0076115131378174, - "eval_runtime": 895.0756, - "eval_samples_per_second": 862.296, - "eval_steps_per_second": 53.894, + "epoch": 5.0, + "eval_loss": 1.927372694015503, + "eval_runtime": 472.3148, + "eval_samples_per_second": 846.893, + "eval_steps_per_second": 52.931, "step": 2376000 }, { - "epoch": 2.6, + "epoch": 5.02, "learning_rate": 2.7333333333333334e-09, - "loss": 2.1058, + "loss": 2.0807, "step": 2384000 }, { - "epoch": 2.6, - "eval_loss": 2.0035552978515625, - "eval_runtime": 895.3228, - "eval_samples_per_second": 862.058, - "eval_steps_per_second": 53.879, + "epoch": 5.02, + "eval_loss": 1.9316951036453247, + "eval_runtime": 472.3854, + "eval_samples_per_second": 846.766, + "eval_steps_per_second": 52.923, "step": 2384000 }, { - "epoch": 2.61, - "eval_loss": 2.0077223777770996, - "eval_runtime": 896.1834, - "eval_samples_per_second": 861.23, - "eval_steps_per_second": 53.827, + "epoch": 5.04, + "eval_loss": 1.9297714233398438, + "eval_runtime": 474.0973, + "eval_samples_per_second": 843.709, + "eval_steps_per_second": 52.732, "step": 2392000 }, { - "epoch": 2.62, + "epoch": 5.05, "learning_rate": 0.0, - "loss": 2.1112, + "loss": 2.0668, "step": 2400000 }, { - "epoch": 2.62, - "eval_loss": 2.000014066696167, - "eval_runtime": 893.9091, - "eval_samples_per_second": 863.421, - "eval_steps_per_second": 53.964, + "epoch": 5.05, + "eval_loss": 1.9348891973495483, + "eval_runtime": 472.3546, + "eval_samples_per_second": 846.821, + "eval_steps_per_second": 52.926, "step": 2400000 }, { - "epoch": 2.62, + "epoch": 5.05, "step": 2400000, - "total_flos": 7.571300080769916e+17, - "train_loss": 2.133689431966146, - "train_runtime": 416842.919, - "train_samples_per_second": 92.121, - "train_steps_per_second": 5.758 + "total_flos": 7.57317099323892e+17, + "train_loss": 2.0860133251953124, + "train_runtime": 289964.3288, + "train_samples_per_second": 132.43, + "train_steps_per_second": 8.277 } ], "logging_steps": 16000, "max_steps": 2400000, - "num_train_epochs": 3, + "num_train_epochs": 6, "save_steps": 32000, - "total_flos": 7.571300080769916e+17, + "total_flos": 7.57317099323892e+17, "trial_name": null, "trial_params": null }