diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6 +1,6 @@ { - "best_metric": 2.5151257514953613, - "best_model_checkpoint": "./model_tweets_2020_Q1_50/checkpoint-128000", + "best_metric": 2.4417428970336914, + "best_model_checkpoint": "./model_tweets_2020_Q1_50/checkpoint-1888000", "epoch": 9.834292176820574, "eval_steps": 8000, "global_step": 2400000, @@ -10,3312 +10,3312 @@ "log_history": [ { "epoch": 0.03, - "eval_loss": 2.6128780841827393, - "eval_runtime": 220.0915, - "eval_samples_per_second": 933.748, - "eval_steps_per_second": 58.362, + "eval_loss": 2.89373779296875, + "eval_runtime": 220.7068, + "eval_samples_per_second": 931.145, + "eval_steps_per_second": 58.199, "step": 8000 }, { "epoch": 0.07, - "learning_rate": 9.939131159843243e-06, - "loss": 2.7516, + "learning_rate": 4.0726666666666665e-07, + "loss": 3.073, "step": 16000 }, { "epoch": 0.07, - "eval_loss": 2.560152769088745, - "eval_runtime": 222.3718, - "eval_samples_per_second": 924.173, - "eval_steps_per_second": 57.764, + "eval_loss": 2.76598858833313, + "eval_runtime": 221.0774, + "eval_samples_per_second": 929.584, + "eval_steps_per_second": 58.102, "step": 16000 }, { "epoch": 0.1, - "eval_loss": 2.561044216156006, - "eval_runtime": 219.7796, - "eval_samples_per_second": 935.073, - "eval_steps_per_second": 58.445, + "eval_loss": 2.7232513427734375, + "eval_runtime": 221.6288, + "eval_samples_per_second": 927.271, + "eval_steps_per_second": 57.957, "step": 24000 }, { "epoch": 0.13, - "learning_rate": 9.872425581589261e-06, - "loss": 2.6478, + "learning_rate": 4.0453333333333336e-07, + "loss": 2.8244, "step": 32000 }, { "epoch": 0.13, - "eval_loss": 2.558018684387207, - "eval_runtime": 219.7961, - "eval_samples_per_second": 935.003, - "eval_steps_per_second": 58.441, + "eval_loss": 2.687758207321167, + "eval_runtime": 220.1262, + "eval_samples_per_second": 933.601, + "eval_steps_per_second": 58.353, "step": 32000 }, { "epoch": 0.16, - "eval_loss": 2.5498745441436768, - "eval_runtime": 221.2066, - "eval_samples_per_second": 929.041, - "eval_steps_per_second": 58.068, + "eval_loss": 2.6519503593444824, + "eval_runtime": 220.495, + "eval_samples_per_second": 932.039, + "eval_steps_per_second": 58.255, "step": 40000 }, { "epoch": 0.2, - "learning_rate": 9.80572000333528e-06, - "loss": 2.6344, + "learning_rate": 4.018e-07, + "loss": 2.7542, "step": 48000 }, { "epoch": 0.2, - "eval_loss": 2.542255163192749, - "eval_runtime": 219.3858, - "eval_samples_per_second": 936.752, - "eval_steps_per_second": 58.55, + "eval_loss": 2.63004469871521, + "eval_runtime": 220.2803, + "eval_samples_per_second": 932.948, + "eval_steps_per_second": 58.312, "step": 48000 }, { "epoch": 0.23, - "eval_loss": 2.5431501865386963, - "eval_runtime": 220.781, - "eval_samples_per_second": 930.832, - "eval_steps_per_second": 58.18, + "eval_loss": 2.613522529602051, + "eval_runtime": 221.2317, + "eval_samples_per_second": 928.936, + "eval_steps_per_second": 58.061, "step": 56000 }, { "epoch": 0.26, - "learning_rate": 9.739014425081299e-06, - "loss": 2.6174, + "learning_rate": 3.9906666666666667e-07, + "loss": 2.7083, "step": 64000 }, { "epoch": 0.26, - "eval_loss": 2.550448417663574, - "eval_runtime": 220.5, - "eval_samples_per_second": 932.018, - "eval_steps_per_second": 58.254, + "eval_loss": 2.6067709922790527, + "eval_runtime": 220.3177, + "eval_samples_per_second": 932.789, + "eval_steps_per_second": 58.302, "step": 64000 }, { "epoch": 0.3, - "eval_loss": 2.5348613262176514, - "eval_runtime": 220.0149, - "eval_samples_per_second": 934.073, - "eval_steps_per_second": 58.382, + "eval_loss": 2.5854294300079346, + "eval_runtime": 220.8061, + "eval_samples_per_second": 930.726, + "eval_steps_per_second": 58.173, "step": 72000 }, { "epoch": 0.33, - "learning_rate": 9.672308846827316e-06, - "loss": 2.5993, + "learning_rate": 3.963333333333333e-07, + "loss": 2.6752, "step": 80000 }, { "epoch": 0.33, - "eval_loss": 2.533210039138794, - "eval_runtime": 224.2726, - "eval_samples_per_second": 916.34, - "eval_steps_per_second": 57.274, + "eval_loss": 2.575528860092163, + "eval_runtime": 221.8521, + "eval_samples_per_second": 926.338, + "eval_steps_per_second": 57.899, "step": 80000 }, { "epoch": 0.36, - "eval_loss": 2.5342018604278564, - "eval_runtime": 221.6101, - "eval_samples_per_second": 927.349, - "eval_steps_per_second": 57.962, + "eval_loss": 2.5720720291137695, + "eval_runtime": 221.4472, + "eval_samples_per_second": 928.032, + "eval_steps_per_second": 58.005, "step": 88000 }, { "epoch": 0.39, - "learning_rate": 9.605603268573334e-06, - "loss": 2.5948, + "learning_rate": 3.936e-07, + "loss": 2.6657, "step": 96000 }, { "epoch": 0.39, - "eval_loss": 2.5296876430511475, - "eval_runtime": 221.2575, - "eval_samples_per_second": 928.827, - "eval_steps_per_second": 58.055, + "eval_loss": 2.5709290504455566, + "eval_runtime": 220.9006, + "eval_samples_per_second": 930.328, + "eval_steps_per_second": 58.148, "step": 96000 }, { "epoch": 0.43, - "eval_loss": 2.5301403999328613, - "eval_runtime": 221.9648, - "eval_samples_per_second": 925.868, - "eval_steps_per_second": 57.87, + "eval_loss": 2.5656096935272217, + "eval_runtime": 220.5433, + "eval_samples_per_second": 931.835, + "eval_steps_per_second": 58.243, "step": 104000 }, { "epoch": 0.46, - "learning_rate": 9.538897690319354e-06, - "loss": 2.5846, + "learning_rate": 3.908666666666667e-07, + "loss": 2.6534, "step": 112000 }, { "epoch": 0.46, - "eval_loss": 2.5199761390686035, - "eval_runtime": 221.7756, - "eval_samples_per_second": 926.658, - "eval_steps_per_second": 57.919, + "eval_loss": 2.5558407306671143, + "eval_runtime": 221.6371, + "eval_samples_per_second": 927.236, + "eval_steps_per_second": 57.955, "step": 112000 }, { "epoch": 0.49, - "eval_loss": 2.5174739360809326, - "eval_runtime": 222.132, - "eval_samples_per_second": 925.171, - "eval_steps_per_second": 57.826, + "eval_loss": 2.5495829582214355, + "eval_runtime": 220.7733, + "eval_samples_per_second": 930.864, + "eval_steps_per_second": 58.182, "step": 120000 }, { "epoch": 0.52, - "learning_rate": 9.472192112065373e-06, - "loss": 2.5774, + "learning_rate": 3.8813333333333334e-07, + "loss": 2.646, "step": 128000 }, { "epoch": 0.52, - "eval_loss": 2.5151257514953613, - "eval_runtime": 222.5971, - "eval_samples_per_second": 923.238, - "eval_steps_per_second": 57.705, + "eval_loss": 2.547106981277466, + "eval_runtime": 221.6448, + "eval_samples_per_second": 927.204, + "eval_steps_per_second": 57.953, "step": 128000 }, { "epoch": 0.56, - "eval_loss": 2.512612819671631, - "eval_runtime": 225.9428, - "eval_samples_per_second": 909.567, - "eval_steps_per_second": 56.851, + "eval_loss": 2.5408244132995605, + "eval_runtime": 221.6302, + "eval_samples_per_second": 927.265, + "eval_steps_per_second": 57.957, "step": 136000 }, { "epoch": 0.59, - "learning_rate": 9.405486533811392e-06, - "loss": 2.5584, + "learning_rate": 3.854e-07, + "loss": 2.625, "step": 144000 }, { "epoch": 0.59, - "eval_loss": 2.5006816387176514, - "eval_runtime": 225.1567, - "eval_samples_per_second": 912.742, - "eval_steps_per_second": 57.049, + "eval_loss": 2.531517744064331, + "eval_runtime": 223.6683, + "eval_samples_per_second": 918.816, + "eval_steps_per_second": 57.429, "step": 144000 }, { "epoch": 0.62, - "eval_loss": 2.5149295330047607, - "eval_runtime": 222.6305, - "eval_samples_per_second": 923.099, - "eval_steps_per_second": 57.697, + "eval_loss": 2.5364675521850586, + "eval_runtime": 224.1465, + "eval_samples_per_second": 916.856, + "eval_steps_per_second": 57.306, "step": 152000 }, { "epoch": 0.66, - "learning_rate": 9.338780955557409e-06, - "loss": 2.5578, + "learning_rate": 3.8266666666666665e-07, + "loss": 2.6222, "step": 160000 }, { "epoch": 0.66, - "eval_loss": 2.519784688949585, - "eval_runtime": 224.6765, - "eval_samples_per_second": 914.693, - "eval_steps_per_second": 57.171, + "eval_loss": 2.5372273921966553, + "eval_runtime": 221.7325, + "eval_samples_per_second": 926.837, + "eval_steps_per_second": 57.93, "step": 160000 }, { "epoch": 0.69, - "eval_loss": 2.5136947631835938, - "eval_runtime": 223.084, - "eval_samples_per_second": 921.223, - "eval_steps_per_second": 57.579, + "eval_loss": 2.534186363220215, + "eval_runtime": 222.3525, + "eval_samples_per_second": 924.253, + "eval_steps_per_second": 57.769, "step": 168000 }, { "epoch": 0.72, - "learning_rate": 9.272075377303427e-06, - "loss": 2.5699, + "learning_rate": 3.799333333333333e-07, + "loss": 2.6256, "step": 176000 }, { "epoch": 0.72, - "eval_loss": 2.5189852714538574, - "eval_runtime": 221.4167, - "eval_samples_per_second": 928.159, - "eval_steps_per_second": 58.013, + "eval_loss": 2.5308265686035156, + "eval_runtime": 221.538, + "eval_samples_per_second": 927.651, + "eval_steps_per_second": 57.981, "step": 176000 }, { "epoch": 0.75, - "eval_loss": 2.5280351638793945, - "eval_runtime": 220.8999, - "eval_samples_per_second": 930.331, - "eval_steps_per_second": 58.149, + "eval_loss": 2.5311617851257324, + "eval_runtime": 224.2919, + "eval_samples_per_second": 916.261, + "eval_steps_per_second": 57.269, "step": 184000 }, { "epoch": 0.79, - "learning_rate": 9.205369799049446e-06, - "loss": 2.5648, + "learning_rate": 3.772e-07, + "loss": 2.6074, "step": 192000 }, { "epoch": 0.79, - "eval_loss": 2.5205190181732178, - "eval_runtime": 220.8432, - "eval_samples_per_second": 930.57, - "eval_steps_per_second": 58.163, + "eval_loss": 2.522848129272461, + "eval_runtime": 224.4507, + "eval_samples_per_second": 915.613, + "eval_steps_per_second": 57.229, "step": 192000 }, { "epoch": 0.82, - "eval_loss": 2.5266730785369873, - "eval_runtime": 223.4826, - "eval_samples_per_second": 919.579, - "eval_steps_per_second": 57.477, + "eval_loss": 2.529161214828491, + "eval_runtime": 222.7477, + "eval_samples_per_second": 922.613, + "eval_steps_per_second": 57.666, "step": 200000 }, { "epoch": 0.85, - "learning_rate": 9.138664220795464e-06, - "loss": 2.5701, + "learning_rate": 3.7446666666666667e-07, + "loss": 2.6071, "step": 208000 }, { "epoch": 0.85, - "eval_loss": 2.533360719680786, - "eval_runtime": 222.9517, - "eval_samples_per_second": 921.769, - "eval_steps_per_second": 57.613, + "eval_loss": 2.5295047760009766, + "eval_runtime": 223.0891, + "eval_samples_per_second": 921.201, + "eval_steps_per_second": 57.578, "step": 208000 }, { "epoch": 0.89, - "eval_loss": 2.5359556674957275, - "eval_runtime": 223.0368, - "eval_samples_per_second": 921.418, - "eval_steps_per_second": 57.591, + "eval_loss": 2.523491621017456, + "eval_runtime": 221.4007, + "eval_samples_per_second": 928.227, + "eval_steps_per_second": 58.017, "step": 216000 }, { "epoch": 0.92, - "learning_rate": 9.071958642541483e-06, - "loss": 2.5688, + "learning_rate": 3.7173333333333333e-07, + "loss": 2.5955, "step": 224000 }, { "epoch": 0.92, - "eval_loss": 2.5403811931610107, - "eval_runtime": 219.8534, - "eval_samples_per_second": 934.759, - "eval_steps_per_second": 58.425, + "eval_loss": 2.5219199657440186, + "eval_runtime": 221.3605, + "eval_samples_per_second": 928.395, + "eval_steps_per_second": 58.028, "step": 224000 }, { "epoch": 0.95, - "eval_loss": 2.536496162414551, - "eval_runtime": 222.4622, - "eval_samples_per_second": 923.797, - "eval_steps_per_second": 57.74, + "eval_loss": 2.5190882682800293, + "eval_runtime": 221.1449, + "eval_samples_per_second": 929.3, + "eval_steps_per_second": 58.084, "step": 232000 }, { "epoch": 0.98, - "learning_rate": 9.005253064287502e-06, - "loss": 2.5788, + "learning_rate": 3.69e-07, + "loss": 2.6036, "step": 240000 }, { "epoch": 0.98, - "eval_loss": 2.5484516620635986, - "eval_runtime": 221.8944, - "eval_samples_per_second": 926.161, - "eval_steps_per_second": 57.888, + "eval_loss": 2.517120361328125, + "eval_runtime": 220.9198, + "eval_samples_per_second": 930.247, + "eval_steps_per_second": 58.143, "step": 240000 }, { "epoch": 1.02, - "eval_loss": 2.5406737327575684, - "eval_runtime": 223.364, - "eval_samples_per_second": 920.068, - "eval_steps_per_second": 57.507, + "eval_loss": 2.5102434158325195, + "eval_runtime": 221.7647, + "eval_samples_per_second": 926.703, + "eval_steps_per_second": 57.922, "step": 248000 }, { "epoch": 1.05, - "learning_rate": 8.93854748603352e-06, - "loss": 2.5857, + "learning_rate": 3.6626666666666664e-07, + "loss": 2.6046, "step": 256000 }, { "epoch": 1.05, - "eval_loss": 2.5444178581237793, - "eval_runtime": 222.6684, - "eval_samples_per_second": 922.942, - "eval_steps_per_second": 57.687, + "eval_loss": 2.5070137977600098, + "eval_runtime": 221.6584, + "eval_samples_per_second": 927.147, + "eval_steps_per_second": 57.95, "step": 256000 }, { "epoch": 1.08, - "eval_loss": 2.55073881149292, - "eval_runtime": 221.5347, - "eval_samples_per_second": 927.665, - "eval_steps_per_second": 57.982, + "eval_loss": 2.5109376907348633, + "eval_runtime": 221.3382, + "eval_samples_per_second": 928.489, + "eval_steps_per_second": 58.033, "step": 264000 }, { "epoch": 1.11, - "learning_rate": 8.871841907779539e-06, - "loss": 2.576, + "learning_rate": 3.6353333333333335e-07, + "loss": 2.5892, "step": 272000 }, { "epoch": 1.11, - "eval_loss": 2.5567193031311035, - "eval_runtime": 221.584, - "eval_samples_per_second": 927.459, - "eval_steps_per_second": 57.969, + "eval_loss": 2.5104565620422363, + "eval_runtime": 222.1683, + "eval_samples_per_second": 925.019, + "eval_steps_per_second": 57.817, "step": 272000 }, { "epoch": 1.15, - "eval_loss": 2.5561089515686035, - "eval_runtime": 221.035, - "eval_samples_per_second": 929.762, - "eval_steps_per_second": 58.113, + "eval_loss": 2.508704423904419, + "eval_runtime": 222.9629, + "eval_samples_per_second": 921.723, + "eval_steps_per_second": 57.61, "step": 280000 }, { "epoch": 1.18, - "learning_rate": 8.805136329525557e-06, - "loss": 2.5919, + "learning_rate": 3.608e-07, + "loss": 2.5929, "step": 288000 }, { "epoch": 1.18, - "eval_loss": 2.564077138900757, - "eval_runtime": 221.6286, - "eval_samples_per_second": 927.272, - "eval_steps_per_second": 57.957, + "eval_loss": 2.509392738342285, + "eval_runtime": 223.3494, + "eval_samples_per_second": 920.128, + "eval_steps_per_second": 57.511, "step": 288000 }, { "epoch": 1.21, - "eval_loss": 2.5574235916137695, - "eval_runtime": 221.5082, - "eval_samples_per_second": 927.776, - "eval_steps_per_second": 57.989, + "eval_loss": 2.508585214614868, + "eval_runtime": 222.7314, + "eval_samples_per_second": 922.681, + "eval_steps_per_second": 57.67, "step": 296000 }, { "epoch": 1.25, - "learning_rate": 8.738430751271576e-06, - "loss": 2.5893, + "learning_rate": 3.5806666666666666e-07, + "loss": 2.5857, "step": 304000 }, { "epoch": 1.25, - "eval_loss": 2.5562856197357178, - "eval_runtime": 221.2963, - "eval_samples_per_second": 928.664, - "eval_steps_per_second": 58.044, + "eval_loss": 2.4991345405578613, + "eval_runtime": 223.3332, + "eval_samples_per_second": 920.195, + "eval_steps_per_second": 57.515, "step": 304000 }, { "epoch": 1.28, - "eval_loss": 2.5707902908325195, - "eval_runtime": 220.7861, - "eval_samples_per_second": 930.811, - "eval_steps_per_second": 58.178, + "eval_loss": 2.508927822113037, + "eval_runtime": 224.1404, + "eval_samples_per_second": 916.881, + "eval_steps_per_second": 57.308, "step": 312000 }, { "epoch": 1.31, - "learning_rate": 8.671725173017595e-06, - "loss": 2.5896, + "learning_rate": 3.553333333333333e-07, + "loss": 2.5828, "step": 320000 }, { "epoch": 1.31, - "eval_loss": 2.571282386779785, - "eval_runtime": 222.5864, - "eval_samples_per_second": 923.282, - "eval_steps_per_second": 57.708, + "eval_loss": 2.501734972000122, + "eval_runtime": 223.1146, + "eval_samples_per_second": 921.096, + "eval_steps_per_second": 57.571, "step": 320000 }, { "epoch": 1.34, - "eval_loss": 2.5756185054779053, - "eval_runtime": 220.8371, - "eval_samples_per_second": 930.596, - "eval_steps_per_second": 58.165, + "eval_loss": 2.503918409347534, + "eval_runtime": 223.3327, + "eval_samples_per_second": 920.196, + "eval_steps_per_second": 57.515, "step": 328000 }, { "epoch": 1.38, - "learning_rate": 8.605019594763613e-06, - "loss": 2.6066, + "learning_rate": 3.5259999999999997e-07, + "loss": 2.5812, "step": 336000 }, { "epoch": 1.38, - "eval_loss": 2.5867831707000732, - "eval_runtime": 221.6633, - "eval_samples_per_second": 927.127, - "eval_steps_per_second": 57.948, + "eval_loss": 2.5064587593078613, + "eval_runtime": 224.1587, + "eval_samples_per_second": 916.806, + "eval_steps_per_second": 57.303, "step": 336000 }, { "epoch": 1.41, - "eval_loss": 2.5921335220336914, - "eval_runtime": 220.8265, - "eval_samples_per_second": 930.64, - "eval_steps_per_second": 58.168, + "eval_loss": 2.508263111114502, + "eval_runtime": 222.503, + "eval_samples_per_second": 923.628, + "eval_steps_per_second": 57.73, "step": 344000 }, { "epoch": 1.44, - "learning_rate": 8.538314016509632e-06, - "loss": 2.6121, + "learning_rate": 3.498666666666667e-07, + "loss": 2.5775, "step": 352000 }, { "epoch": 1.44, - "eval_loss": 2.605332136154175, - "eval_runtime": 224.0349, - "eval_samples_per_second": 917.312, - "eval_steps_per_second": 57.335, + "eval_loss": 2.509936571121216, + "eval_runtime": 223.039, + "eval_samples_per_second": 921.408, + "eval_steps_per_second": 57.591, "step": 352000 }, { "epoch": 1.48, - "eval_loss": 2.60457706451416, - "eval_runtime": 220.8455, - "eval_samples_per_second": 930.56, - "eval_steps_per_second": 58.163, + "eval_loss": 2.5078811645507812, + "eval_runtime": 221.7646, + "eval_samples_per_second": 926.703, + "eval_steps_per_second": 57.922, "step": 360000 }, { "epoch": 1.51, - "learning_rate": 8.471608438255649e-06, - "loss": 2.6161, + "learning_rate": 3.4713333333333333e-07, + "loss": 2.5711, "step": 368000 }, { "epoch": 1.51, - "eval_loss": 2.5993847846984863, - "eval_runtime": 222.5248, - "eval_samples_per_second": 923.537, - "eval_steps_per_second": 57.724, + "eval_loss": 2.4922046661376953, + "eval_runtime": 223.0544, + "eval_samples_per_second": 921.345, + "eval_steps_per_second": 57.587, "step": 368000 }, { "epoch": 1.54, - "eval_loss": 2.6035287380218506, - "eval_runtime": 221.0365, - "eval_samples_per_second": 929.756, - "eval_steps_per_second": 58.113, + "eval_loss": 2.5012030601501465, + "eval_runtime": 222.0392, + "eval_samples_per_second": 925.557, + "eval_steps_per_second": 57.85, "step": 376000 }, { "epoch": 1.57, - "learning_rate": 8.404902860001667e-06, - "loss": 2.6313, + "learning_rate": 3.444e-07, + "loss": 2.5797, "step": 384000 }, { "epoch": 1.57, - "eval_loss": 2.6119377613067627, - "eval_runtime": 221.8478, - "eval_samples_per_second": 926.356, - "eval_steps_per_second": 57.9, + "eval_loss": 2.49989914894104, + "eval_runtime": 223.8829, + "eval_samples_per_second": 917.935, + "eval_steps_per_second": 57.374, "step": 384000 }, { "epoch": 1.61, - "eval_loss": 2.601569175720215, - "eval_runtime": 221.1561, - "eval_samples_per_second": 929.253, - "eval_steps_per_second": 58.081, + "eval_loss": 2.4881107807159424, + "eval_runtime": 222.4413, + "eval_samples_per_second": 923.884, + "eval_steps_per_second": 57.746, "step": 392000 }, { "epoch": 1.64, - "learning_rate": 8.338197281747686e-06, - "loss": 2.6342, + "learning_rate": 3.416666666666667e-07, + "loss": 2.5718, "step": 400000 }, { "epoch": 1.64, - "eval_loss": 2.62074875831604, - "eval_runtime": 222.2544, - "eval_samples_per_second": 924.661, - "eval_steps_per_second": 57.794, + "eval_loss": 2.4960451126098633, + "eval_runtime": 222.8741, + "eval_samples_per_second": 922.09, + "eval_steps_per_second": 57.633, "step": 400000 }, { "epoch": 1.67, - "eval_loss": 2.623107433319092, - "eval_runtime": 221.8176, - "eval_samples_per_second": 926.482, - "eval_steps_per_second": 57.908, + "eval_loss": 2.490837574005127, + "eval_runtime": 222.3679, + "eval_samples_per_second": 924.189, + "eval_steps_per_second": 57.765, "step": 408000 }, { "epoch": 1.7, - "learning_rate": 8.271491703493705e-06, - "loss": 2.6358, + "learning_rate": 3.3893333333333335e-07, + "loss": 2.5627, "step": 416000 }, { "epoch": 1.7, - "eval_loss": 2.6287713050842285, - "eval_runtime": 224.4588, - "eval_samples_per_second": 915.58, - "eval_steps_per_second": 57.227, + "eval_loss": 2.4970648288726807, + "eval_runtime": 223.472, + "eval_samples_per_second": 919.623, + "eval_steps_per_second": 57.479, "step": 416000 }, { "epoch": 1.74, - "eval_loss": 2.633507490158081, - "eval_runtime": 222.6705, - "eval_samples_per_second": 922.933, - "eval_steps_per_second": 57.686, + "eval_loss": 2.4916465282440186, + "eval_runtime": 222.5109, + "eval_samples_per_second": 923.595, + "eval_steps_per_second": 57.728, "step": 424000 }, { "epoch": 1.77, - "learning_rate": 8.204786125239725e-06, - "loss": 2.6463, + "learning_rate": 3.3619999999999995e-07, + "loss": 2.5641, "step": 432000 }, { "epoch": 1.77, - "eval_loss": 2.63563871383667, - "eval_runtime": 221.9208, - "eval_samples_per_second": 926.051, - "eval_steps_per_second": 57.881, + "eval_loss": 2.4971389770507812, + "eval_runtime": 222.1533, + "eval_samples_per_second": 925.082, + "eval_steps_per_second": 57.82, "step": 432000 }, { "epoch": 1.8, - "eval_loss": 2.6429896354675293, - "eval_runtime": 222.0647, - "eval_samples_per_second": 925.451, - "eval_steps_per_second": 57.844, + "eval_loss": 2.495426654815674, + "eval_runtime": 223.2728, + "eval_samples_per_second": 920.444, + "eval_steps_per_second": 57.531, "step": 440000 }, { "epoch": 1.84, - "learning_rate": 8.138080546985743e-06, - "loss": 2.6561, + "learning_rate": 3.3346666666666666e-07, + "loss": 2.5633, "step": 448000 }, { "epoch": 1.84, - "eval_loss": 2.643951416015625, - "eval_runtime": 222.3078, - "eval_samples_per_second": 924.439, - "eval_steps_per_second": 57.78, + "eval_loss": 2.485994815826416, + "eval_runtime": 222.7264, + "eval_samples_per_second": 922.702, + "eval_steps_per_second": 57.672, "step": 448000 }, { "epoch": 1.87, - "eval_loss": 2.6444947719573975, - "eval_runtime": 222.0032, - "eval_samples_per_second": 925.707, - "eval_steps_per_second": 57.86, + "eval_loss": 2.4893651008605957, + "eval_runtime": 223.4251, + "eval_samples_per_second": 919.816, + "eval_steps_per_second": 57.491, "step": 456000 }, { "epoch": 1.9, - "learning_rate": 8.07137496873176e-06, - "loss": 2.6748, + "learning_rate": 3.307333333333333e-07, + "loss": 2.5676, "step": 464000 }, { "epoch": 1.9, - "eval_loss": 2.6506636142730713, - "eval_runtime": 223.5942, - "eval_samples_per_second": 919.121, - "eval_steps_per_second": 57.448, + "eval_loss": 2.489337205886841, + "eval_runtime": 222.9423, + "eval_samples_per_second": 921.808, + "eval_steps_per_second": 57.616, "step": 464000 }, { "epoch": 1.93, - "eval_loss": 2.6547434329986572, - "eval_runtime": 221.7433, - "eval_samples_per_second": 926.792, - "eval_steps_per_second": 57.927, + "eval_loss": 2.4883553981781006, + "eval_runtime": 223.2404, + "eval_samples_per_second": 920.577, + "eval_steps_per_second": 57.539, "step": 472000 }, { "epoch": 1.97, - "learning_rate": 8.004669390477779e-06, - "loss": 2.6847, + "learning_rate": 3.28e-07, + "loss": 2.5687, "step": 480000 }, { "epoch": 1.97, - "eval_loss": 2.664285898208618, - "eval_runtime": 221.7152, - "eval_samples_per_second": 926.91, - "eval_steps_per_second": 57.935, + "eval_loss": 2.4921038150787354, + "eval_runtime": 223.8809, + "eval_samples_per_second": 917.943, + "eval_steps_per_second": 57.374, "step": 480000 }, { "epoch": 2.0, - "eval_loss": 2.670330762863159, - "eval_runtime": 222.5951, - "eval_samples_per_second": 923.246, - "eval_steps_per_second": 57.706, + "eval_loss": 2.4873294830322266, + "eval_runtime": 222.8771, + "eval_samples_per_second": 922.078, + "eval_steps_per_second": 57.633, "step": 488000 }, { "epoch": 2.03, - "learning_rate": 7.937963812223798e-06, - "loss": 2.6772, + "learning_rate": 3.252666666666667e-07, + "loss": 2.5633, "step": 496000 }, { "epoch": 2.03, - "eval_loss": 2.674689292907715, - "eval_runtime": 222.9188, - "eval_samples_per_second": 921.905, - "eval_steps_per_second": 57.622, + "eval_loss": 2.4919497966766357, + "eval_runtime": 222.6439, + "eval_samples_per_second": 923.043, + "eval_steps_per_second": 57.693, "step": 496000 }, { "epoch": 2.07, - "eval_loss": 2.6655595302581787, - "eval_runtime": 221.9863, - "eval_samples_per_second": 925.778, - "eval_steps_per_second": 57.864, + "eval_loss": 2.482137441635132, + "eval_runtime": 222.747, + "eval_samples_per_second": 922.616, + "eval_steps_per_second": 57.666, "step": 504000 }, { "epoch": 2.1, - "learning_rate": 7.871258233969816e-06, - "loss": 2.6668, + "learning_rate": 3.2253333333333334e-07, + "loss": 2.5547, "step": 512000 }, { "epoch": 2.1, - "eval_loss": 2.676820993423462, - "eval_runtime": 222.0249, - "eval_samples_per_second": 925.617, - "eval_steps_per_second": 57.854, + "eval_loss": 2.490872621536255, + "eval_runtime": 222.6765, + "eval_samples_per_second": 922.908, + "eval_steps_per_second": 57.685, "step": 512000 }, { "epoch": 2.13, - "eval_loss": 2.669207811355591, - "eval_runtime": 221.7932, - "eval_samples_per_second": 926.584, - "eval_steps_per_second": 57.914, + "eval_loss": 2.4818356037139893, + "eval_runtime": 223.7166, + "eval_samples_per_second": 918.617, + "eval_steps_per_second": 57.416, "step": 520000 }, { "epoch": 2.16, - "learning_rate": 7.804552655715835e-06, - "loss": 2.6802, + "learning_rate": 3.198e-07, + "loss": 2.5617, "step": 528000 }, { "epoch": 2.16, - "eval_loss": 2.6729602813720703, - "eval_runtime": 221.933, - "eval_samples_per_second": 926.0, - "eval_steps_per_second": 57.878, + "eval_loss": 2.4854869842529297, + "eval_runtime": 223.7715, + "eval_samples_per_second": 918.392, + "eval_steps_per_second": 57.402, "step": 528000 }, { "epoch": 2.2, - "eval_loss": 2.67464017868042, - "eval_runtime": 222.2198, - "eval_samples_per_second": 924.805, - "eval_steps_per_second": 57.803, + "eval_loss": 2.48504638671875, + "eval_runtime": 223.6654, + "eval_samples_per_second": 918.828, + "eval_steps_per_second": 57.43, "step": 536000 }, { "epoch": 2.23, - "learning_rate": 7.737847077461853e-06, - "loss": 2.6856, + "learning_rate": 3.1706666666666665e-07, + "loss": 2.5569, "step": 544000 }, { "epoch": 2.23, - "eval_loss": 2.678727865219116, - "eval_runtime": 222.413, - "eval_samples_per_second": 924.002, - "eval_steps_per_second": 57.753, + "eval_loss": 2.480282783508301, + "eval_runtime": 222.7744, + "eval_samples_per_second": 922.503, + "eval_steps_per_second": 57.659, "step": 544000 }, { "epoch": 2.26, - "eval_loss": 2.6777920722961426, - "eval_runtime": 222.6488, - "eval_samples_per_second": 923.023, - "eval_steps_per_second": 57.692, + "eval_loss": 2.4775896072387695, + "eval_runtime": 223.0018, + "eval_samples_per_second": 921.562, + "eval_steps_per_second": 57.6, "step": 552000 }, { "epoch": 2.29, - "learning_rate": 7.671141499207872e-06, - "loss": 2.6874, + "learning_rate": 3.1433333333333336e-07, + "loss": 2.5535, "step": 560000 }, { "epoch": 2.29, - "eval_loss": 2.6909406185150146, - "eval_runtime": 221.9152, - "eval_samples_per_second": 926.074, - "eval_steps_per_second": 57.882, + "eval_loss": 2.4824471473693848, + "eval_runtime": 223.1733, + "eval_samples_per_second": 920.854, + "eval_steps_per_second": 57.556, "step": 560000 }, { "epoch": 2.33, - "eval_loss": 2.691913604736328, - "eval_runtime": 222.3607, - "eval_samples_per_second": 924.219, - "eval_steps_per_second": 57.766, + "eval_loss": 2.4821510314941406, + "eval_runtime": 224.1586, + "eval_samples_per_second": 916.806, + "eval_steps_per_second": 57.303, "step": 568000 }, { "epoch": 2.36, - "learning_rate": 7.604435920953891e-06, - "loss": 2.6956, + "learning_rate": 3.116e-07, + "loss": 2.5534, "step": 576000 }, { "epoch": 2.36, - "eval_loss": 2.69474720954895, - "eval_runtime": 222.1948, - "eval_samples_per_second": 924.909, - "eval_steps_per_second": 57.81, + "eval_loss": 2.476337432861328, + "eval_runtime": 223.4733, + "eval_samples_per_second": 919.618, + "eval_steps_per_second": 57.479, "step": 576000 }, { "epoch": 2.39, - "eval_loss": 2.7031822204589844, - "eval_runtime": 223.2718, - "eval_samples_per_second": 920.447, - "eval_steps_per_second": 57.531, + "eval_loss": 2.47969388961792, + "eval_runtime": 224.2217, + "eval_samples_per_second": 916.548, + "eval_steps_per_second": 57.287, "step": 584000 }, { "epoch": 2.43, - "learning_rate": 7.537730342699909e-06, - "loss": 2.7081, + "learning_rate": 3.0886666666666667e-07, + "loss": 2.5583, "step": 592000 }, { "epoch": 2.43, - "eval_loss": 2.707876205444336, - "eval_runtime": 222.7056, - "eval_samples_per_second": 922.788, - "eval_steps_per_second": 57.677, + "eval_loss": 2.4872305393218994, + "eval_runtime": 224.237, + "eval_samples_per_second": 916.486, + "eval_steps_per_second": 57.283, "step": 592000 }, { "epoch": 2.46, - "eval_loss": 2.710313320159912, - "eval_runtime": 223.6841, - "eval_samples_per_second": 918.751, - "eval_steps_per_second": 57.425, + "eval_loss": 2.4812192916870117, + "eval_runtime": 222.6272, + "eval_samples_per_second": 923.113, + "eval_steps_per_second": 57.697, "step": 600000 }, { "epoch": 2.49, - "learning_rate": 7.471024764445928e-06, - "loss": 2.7124, + "learning_rate": 3.061333333333333e-07, + "loss": 2.5545, "step": 608000 }, { "epoch": 2.49, - "eval_loss": 2.7138588428497314, - "eval_runtime": 222.1167, - "eval_samples_per_second": 925.234, - "eval_steps_per_second": 57.83, + "eval_loss": 2.474827527999878, + "eval_runtime": 223.4042, + "eval_samples_per_second": 919.902, + "eval_steps_per_second": 57.497, "step": 608000 }, { "epoch": 2.52, - "eval_loss": 2.710862636566162, - "eval_runtime": 222.0676, - "eval_samples_per_second": 925.439, - "eval_steps_per_second": 57.843, + "eval_loss": 2.4735865592956543, + "eval_runtime": 224.1504, + "eval_samples_per_second": 916.84, + "eval_steps_per_second": 57.305, "step": 616000 }, { "epoch": 2.56, - "learning_rate": 7.4043191861919465e-06, - "loss": 2.7221, + "learning_rate": 3.034e-07, + "loss": 2.5561, "step": 624000 }, { "epoch": 2.56, - "eval_loss": 2.7152557373046875, - "eval_runtime": 223.9434, - "eval_samples_per_second": 917.687, - "eval_steps_per_second": 57.358, + "eval_loss": 2.4714128971099854, + "eval_runtime": 223.2085, + "eval_samples_per_second": 920.709, + "eval_steps_per_second": 57.547, "step": 624000 }, { "epoch": 2.59, - "eval_loss": 2.735896348953247, - "eval_runtime": 222.2379, - "eval_samples_per_second": 924.73, - "eval_steps_per_second": 57.798, + "eval_loss": 2.485759973526001, + "eval_runtime": 222.8361, + "eval_samples_per_second": 922.247, + "eval_steps_per_second": 57.643, "step": 632000 }, { "epoch": 2.62, - "learning_rate": 7.337613607937964e-06, - "loss": 2.7131, + "learning_rate": 3.0066666666666663e-07, + "loss": 2.5384, "step": 640000 }, { "epoch": 2.62, - "eval_loss": 2.7278735637664795, - "eval_runtime": 222.8278, - "eval_samples_per_second": 922.282, - "eval_steps_per_second": 57.645, + "eval_loss": 2.482938289642334, + "eval_runtime": 223.4494, + "eval_samples_per_second": 919.716, + "eval_steps_per_second": 57.485, "step": 640000 }, { "epoch": 2.66, - "eval_loss": 2.737804651260376, - "eval_runtime": 223.5516, - "eval_samples_per_second": 919.296, - "eval_steps_per_second": 57.459, + "eval_loss": 2.47662091255188, + "eval_runtime": 222.9171, + "eval_samples_per_second": 921.912, + "eval_steps_per_second": 57.622, "step": 648000 }, { "epoch": 2.69, - "learning_rate": 7.270908029683983e-06, - "loss": 2.7268, + "learning_rate": 2.9793333333333334e-07, + "loss": 2.541, "step": 656000 }, { "epoch": 2.69, - "eval_loss": 2.73801851272583, - "eval_runtime": 222.415, - "eval_samples_per_second": 923.993, - "eval_steps_per_second": 57.752, + "eval_loss": 2.4835963249206543, + "eval_runtime": 223.4062, + "eval_samples_per_second": 919.894, + "eval_steps_per_second": 57.496, "step": 656000 }, { "epoch": 2.72, - "eval_loss": 2.727534294128418, - "eval_runtime": 222.4068, - "eval_samples_per_second": 924.027, - "eval_steps_per_second": 57.755, + "eval_loss": 2.465118408203125, + "eval_runtime": 226.1239, + "eval_samples_per_second": 908.838, + "eval_steps_per_second": 56.805, "step": 664000 }, { "epoch": 2.75, - "learning_rate": 7.2042024514300015e-06, - "loss": 2.7373, + "learning_rate": 2.952e-07, + "loss": 2.5439, "step": 672000 }, { "epoch": 2.75, - "eval_loss": 2.7439777851104736, - "eval_runtime": 224.2928, - "eval_samples_per_second": 916.258, - "eval_steps_per_second": 57.269, + "eval_loss": 2.4797005653381348, + "eval_runtime": 224.1173, + "eval_samples_per_second": 916.975, + "eval_steps_per_second": 57.314, "step": 672000 }, { "epoch": 2.79, - "eval_loss": 2.7381510734558105, - "eval_runtime": 223.3597, - "eval_samples_per_second": 920.086, - "eval_steps_per_second": 57.508, + "eval_loss": 2.4702000617980957, + "eval_runtime": 223.8532, + "eval_samples_per_second": 918.057, + "eval_steps_per_second": 57.381, "step": 680000 }, { "epoch": 2.82, - "learning_rate": 7.13749687317602e-06, - "loss": 2.7576, + "learning_rate": 2.9246666666666665e-07, + "loss": 2.5597, "step": 688000 }, { "epoch": 2.82, - "eval_loss": 2.743011236190796, - "eval_runtime": 223.7189, - "eval_samples_per_second": 918.608, - "eval_steps_per_second": 57.416, + "eval_loss": 2.475144386291504, + "eval_runtime": 223.4589, + "eval_samples_per_second": 919.677, + "eval_steps_per_second": 57.483, "step": 688000 }, { "epoch": 2.85, - "eval_loss": 2.742142915725708, - "eval_runtime": 223.6634, - "eval_samples_per_second": 918.836, - "eval_steps_per_second": 57.43, + "eval_loss": 2.474367618560791, + "eval_runtime": 222.9092, + "eval_samples_per_second": 921.945, + "eval_steps_per_second": 57.624, "step": 696000 }, { "epoch": 2.88, - "learning_rate": 7.070791294922038e-06, - "loss": 2.7495, + "learning_rate": 2.897333333333333e-07, + "loss": 2.5491, "step": 704000 }, { "epoch": 2.88, - "eval_loss": 2.751878261566162, - "eval_runtime": 223.6777, - "eval_samples_per_second": 918.777, - "eval_steps_per_second": 57.426, + "eval_loss": 2.4756221771240234, + "eval_runtime": 223.5443, + "eval_samples_per_second": 919.325, + "eval_steps_per_second": 57.461, "step": 704000 }, { "epoch": 2.92, - "eval_loss": 2.749375104904175, - "eval_runtime": 223.9511, - "eval_samples_per_second": 917.656, - "eval_steps_per_second": 57.356, + "eval_loss": 2.4731247425079346, + "eval_runtime": 223.5397, + "eval_samples_per_second": 919.345, + "eval_steps_per_second": 57.462, "step": 712000 }, { "epoch": 2.95, - "learning_rate": 7.0040857166680564e-06, - "loss": 2.7626, + "learning_rate": 2.8699999999999996e-07, + "loss": 2.5505, "step": 720000 }, { "epoch": 2.95, - "eval_loss": 2.754082202911377, - "eval_runtime": 222.8179, - "eval_samples_per_second": 922.323, - "eval_steps_per_second": 57.648, + "eval_loss": 2.475615978240967, + "eval_runtime": 223.941, + "eval_samples_per_second": 917.697, + "eval_steps_per_second": 57.359, "step": 720000 }, { "epoch": 2.98, - "eval_loss": 2.7496798038482666, - "eval_runtime": 223.5475, - "eval_samples_per_second": 919.312, - "eval_steps_per_second": 57.46, + "eval_loss": 2.4703986644744873, + "eval_runtime": 224.1288, + "eval_samples_per_second": 916.928, + "eval_steps_per_second": 57.311, "step": 728000 }, { "epoch": 3.02, - "learning_rate": 6.937380138414076e-06, - "loss": 2.7551, + "learning_rate": 2.8426666666666667e-07, + "loss": 2.5432, "step": 736000 }, { "epoch": 3.02, - "eval_loss": 2.762470245361328, - "eval_runtime": 223.2771, - "eval_samples_per_second": 920.426, - "eval_steps_per_second": 57.529, + "eval_loss": 2.4762611389160156, + "eval_runtime": 223.7009, + "eval_samples_per_second": 918.682, + "eval_steps_per_second": 57.42, "step": 736000 }, { "epoch": 3.05, - "eval_loss": 2.764946937561035, - "eval_runtime": 222.8366, - "eval_samples_per_second": 922.245, - "eval_steps_per_second": 57.643, + "eval_loss": 2.4743261337280273, + "eval_runtime": 224.407, + "eval_samples_per_second": 915.791, + "eval_steps_per_second": 57.24, "step": 744000 }, { "epoch": 3.08, - "learning_rate": 6.8706745601600945e-06, - "loss": 2.7606, + "learning_rate": 2.815333333333333e-07, + "loss": 2.5485, "step": 752000 }, { "epoch": 3.08, - "eval_loss": 2.759011745452881, - "eval_runtime": 223.0437, - "eval_samples_per_second": 921.389, - "eval_steps_per_second": 57.59, + "eval_loss": 2.4626660346984863, + "eval_runtime": 224.0612, + "eval_samples_per_second": 917.205, + "eval_steps_per_second": 57.328, "step": 752000 }, { "epoch": 3.11, - "eval_loss": 2.7598087787628174, - "eval_runtime": 223.0646, - "eval_samples_per_second": 921.303, - "eval_steps_per_second": 57.584, + "eval_loss": 2.471444606781006, + "eval_runtime": 223.4318, + "eval_samples_per_second": 919.788, + "eval_steps_per_second": 57.49, "step": 760000 }, { "epoch": 3.15, - "learning_rate": 6.803968981906113e-06, - "loss": 2.7709, + "learning_rate": 2.7880000000000003e-07, + "loss": 2.5482, "step": 768000 }, { "epoch": 3.15, - "eval_loss": 2.7685601711273193, - "eval_runtime": 223.0947, - "eval_samples_per_second": 921.178, - "eval_steps_per_second": 57.576, + "eval_loss": 2.4684672355651855, + "eval_runtime": 224.1026, + "eval_samples_per_second": 917.035, + "eval_steps_per_second": 57.317, "step": 768000 }, { "epoch": 3.18, - "eval_loss": 2.769714117050171, - "eval_runtime": 224.3593, - "eval_samples_per_second": 915.986, - "eval_steps_per_second": 57.252, + "eval_loss": 2.4672694206237793, + "eval_runtime": 224.9545, + "eval_samples_per_second": 913.562, + "eval_steps_per_second": 57.1, "step": 776000 }, { "epoch": 3.21, - "learning_rate": 6.737263403652131e-06, - "loss": 2.7687, + "learning_rate": 2.7606666666666664e-07, + "loss": 2.5411, "step": 784000 }, { "epoch": 3.21, - "eval_loss": 2.7807157039642334, - "eval_runtime": 224.1624, - "eval_samples_per_second": 916.791, - "eval_steps_per_second": 57.302, + "eval_loss": 2.4726006984710693, + "eval_runtime": 224.2901, + "eval_samples_per_second": 916.269, + "eval_steps_per_second": 57.27, "step": 784000 }, { "epoch": 3.25, - "eval_loss": 2.783041000366211, - "eval_runtime": 224.6759, - "eval_samples_per_second": 914.695, - "eval_steps_per_second": 57.171, + "eval_loss": 2.476133108139038, + "eval_runtime": 224.2831, + "eval_samples_per_second": 916.297, + "eval_steps_per_second": 57.271, "step": 792000 }, { "epoch": 3.28, - "learning_rate": 6.6705578253981495e-06, - "loss": 2.7745, + "learning_rate": 2.733333333333333e-07, + "loss": 2.5407, "step": 800000 }, { "epoch": 3.28, - "eval_loss": 2.7677245140075684, - "eval_runtime": 223.5451, - "eval_samples_per_second": 919.322, - "eval_steps_per_second": 57.46, + "eval_loss": 2.4611737728118896, + "eval_runtime": 223.8119, + "eval_samples_per_second": 918.226, + "eval_steps_per_second": 57.392, "step": 800000 }, { "epoch": 3.31, - "eval_loss": 2.789714813232422, - "eval_runtime": 224.1358, - "eval_samples_per_second": 916.9, - "eval_steps_per_second": 57.309, + "eval_loss": 2.4742894172668457, + "eval_runtime": 224.8606, + "eval_samples_per_second": 913.944, + "eval_steps_per_second": 57.124, "step": 808000 }, { "epoch": 3.34, - "learning_rate": 6.603852247144168e-06, - "loss": 2.7596, + "learning_rate": 2.706e-07, + "loss": 2.5307, "step": 816000 }, { "epoch": 3.34, - "eval_loss": 2.782496690750122, - "eval_runtime": 224.2552, - "eval_samples_per_second": 916.411, - "eval_steps_per_second": 57.278, + "eval_loss": 2.469853401184082, + "eval_runtime": 224.177, + "eval_samples_per_second": 916.731, + "eval_steps_per_second": 57.298, "step": 816000 }, { "epoch": 3.38, - "eval_loss": 2.782914400100708, - "eval_runtime": 225.4775, - "eval_samples_per_second": 911.444, - "eval_steps_per_second": 56.968, + "eval_loss": 2.4721498489379883, + "eval_runtime": 223.5611, + "eval_samples_per_second": 919.256, + "eval_steps_per_second": 57.456, "step": 824000 }, { "epoch": 3.41, - "learning_rate": 6.537146668890187e-06, - "loss": 2.7749, + "learning_rate": 2.6786666666666666e-07, + "loss": 2.5391, "step": 832000 }, { "epoch": 3.41, - "eval_loss": 2.7868270874023438, - "eval_runtime": 224.6633, - "eval_samples_per_second": 914.747, - "eval_steps_per_second": 57.174, + "eval_loss": 2.461381435394287, + "eval_runtime": 224.3794, + "eval_samples_per_second": 915.904, + "eval_steps_per_second": 57.247, "step": 832000 }, { "epoch": 3.44, - "eval_loss": 2.7875137329101562, - "eval_runtime": 223.5713, - "eval_samples_per_second": 919.214, - "eval_steps_per_second": 57.454, + "eval_loss": 2.4641225337982178, + "eval_runtime": 224.9887, + "eval_samples_per_second": 913.423, + "eval_steps_per_second": 57.092, "step": 840000 }, { "epoch": 3.47, - "learning_rate": 6.4704410906362044e-06, - "loss": 2.7788, + "learning_rate": 2.651333333333333e-07, + "loss": 2.5378, "step": 848000 }, { "epoch": 3.47, - "eval_loss": 2.7906622886657715, - "eval_runtime": 223.506, - "eval_samples_per_second": 919.483, - "eval_steps_per_second": 57.471, + "eval_loss": 2.4652435779571533, + "eval_runtime": 225.299, + "eval_samples_per_second": 912.166, + "eval_steps_per_second": 57.013, "step": 848000 }, { "epoch": 3.51, - "eval_loss": 2.7915995121002197, - "eval_runtime": 225.1506, - "eval_samples_per_second": 912.767, - "eval_steps_per_second": 57.051, + "eval_loss": 2.4640512466430664, + "eval_runtime": 224.7841, + "eval_samples_per_second": 914.255, + "eval_steps_per_second": 57.144, "step": 856000 }, { "epoch": 3.54, - "learning_rate": 6.403735512382223e-06, - "loss": 2.7792, + "learning_rate": 2.624e-07, + "loss": 2.5399, "step": 864000 }, { "epoch": 3.54, - "eval_loss": 2.793468713760376, - "eval_runtime": 223.5626, - "eval_samples_per_second": 919.25, - "eval_steps_per_second": 57.456, + "eval_loss": 2.469067096710205, + "eval_runtime": 225.0567, + "eval_samples_per_second": 913.148, + "eval_steps_per_second": 57.075, "step": 864000 }, { "epoch": 3.57, - "eval_loss": 2.789273977279663, - "eval_runtime": 224.3088, - "eval_samples_per_second": 916.192, - "eval_steps_per_second": 57.265, + "eval_loss": 2.4611856937408447, + "eval_runtime": 224.6227, + "eval_samples_per_second": 914.912, + "eval_steps_per_second": 57.185, "step": 872000 }, { "epoch": 3.61, - "learning_rate": 6.337029934128242e-06, - "loss": 2.7871, + "learning_rate": 2.596666666666667e-07, + "loss": 2.5412, "step": 880000 }, { "epoch": 3.61, - "eval_loss": 2.8002703189849854, - "eval_runtime": 223.9319, - "eval_samples_per_second": 917.734, - "eval_steps_per_second": 57.361, + "eval_loss": 2.469621419906616, + "eval_runtime": 225.3239, + "eval_samples_per_second": 912.065, + "eval_steps_per_second": 57.007, "step": 880000 }, { "epoch": 3.64, - "eval_loss": 2.79728364944458, - "eval_runtime": 224.811, - "eval_samples_per_second": 914.145, - "eval_steps_per_second": 57.137, + "eval_loss": 2.4638073444366455, + "eval_runtime": 224.7878, + "eval_samples_per_second": 914.24, + "eval_steps_per_second": 57.143, "step": 888000 }, { "epoch": 3.67, - "learning_rate": 6.270324355874261e-06, - "loss": 2.7967, + "learning_rate": 2.5693333333333333e-07, + "loss": 2.5389, "step": 896000 }, { "epoch": 3.67, - "eval_loss": 2.810227394104004, - "eval_runtime": 223.8856, - "eval_samples_per_second": 917.924, - "eval_steps_per_second": 57.373, + "eval_loss": 2.4658303260803223, + "eval_runtime": 224.4987, + "eval_samples_per_second": 915.417, + "eval_steps_per_second": 57.216, "step": 896000 }, { "epoch": 3.7, - "eval_loss": 2.811846971511841, - "eval_runtime": 226.2552, - "eval_samples_per_second": 908.31, - "eval_steps_per_second": 56.772, + "eval_loss": 2.4725189208984375, + "eval_runtime": 226.711, + "eval_samples_per_second": 906.484, + "eval_steps_per_second": 56.658, "step": 904000 }, { "epoch": 3.74, - "learning_rate": 6.20361877762028e-06, - "loss": 2.7896, + "learning_rate": 2.542e-07, + "loss": 2.5325, "step": 912000 }, { "epoch": 3.74, - "eval_loss": 2.805860757827759, - "eval_runtime": 224.7634, - "eval_samples_per_second": 914.339, - "eval_steps_per_second": 57.149, + "eval_loss": 2.46415114402771, + "eval_runtime": 225.3655, + "eval_samples_per_second": 911.896, + "eval_steps_per_second": 56.996, "step": 912000 }, { "epoch": 3.77, - "eval_loss": 2.8134682178497314, - "eval_runtime": 223.9747, - "eval_samples_per_second": 917.559, - "eval_steps_per_second": 57.35, + "eval_loss": 2.4599404335021973, + "eval_runtime": 224.7087, + "eval_samples_per_second": 914.562, + "eval_steps_per_second": 57.163, "step": 920000 }, { "epoch": 3.8, - "learning_rate": 6.1369131993662975e-06, - "loss": 2.8021, + "learning_rate": 2.5146666666666664e-07, + "loss": 2.5351, "step": 928000 }, { "epoch": 3.8, - "eval_loss": 2.810849905014038, - "eval_runtime": 226.3005, - "eval_samples_per_second": 908.129, - "eval_steps_per_second": 56.761, + "eval_loss": 2.4616599082946777, + "eval_runtime": 226.6966, + "eval_samples_per_second": 906.542, + "eval_steps_per_second": 56.662, "step": 928000 }, { "epoch": 3.84, - "eval_loss": 2.8164422512054443, - "eval_runtime": 227.0284, - "eval_samples_per_second": 905.217, - "eval_steps_per_second": 56.579, + "eval_loss": 2.464627265930176, + "eval_runtime": 224.9933, + "eval_samples_per_second": 913.405, + "eval_steps_per_second": 57.091, "step": 936000 }, { "epoch": 3.87, - "learning_rate": 6.070207621112316e-06, - "loss": 2.7931, + "learning_rate": 2.4873333333333335e-07, + "loss": 2.522, "step": 944000 }, { "epoch": 3.87, - "eval_loss": 2.8173489570617676, - "eval_runtime": 225.5541, - "eval_samples_per_second": 911.134, - "eval_steps_per_second": 56.949, + "eval_loss": 2.4665021896362305, + "eval_runtime": 225.0978, + "eval_samples_per_second": 912.981, + "eval_steps_per_second": 57.064, "step": 944000 }, { "epoch": 3.9, - "eval_loss": 2.829514741897583, - "eval_runtime": 225.3778, - "eval_samples_per_second": 911.847, - "eval_steps_per_second": 56.993, + "eval_loss": 2.4761972427368164, + "eval_runtime": 224.2641, + "eval_samples_per_second": 916.375, + "eval_steps_per_second": 57.276, "step": 952000 }, { "epoch": 3.93, - "learning_rate": 6.003502042858335e-06, - "loss": 2.8105, + "learning_rate": 2.46e-07, + "loss": 2.5331, "step": 960000 }, { "epoch": 3.93, - "eval_loss": 2.8248379230499268, - "eval_runtime": 224.9909, - "eval_samples_per_second": 913.415, - "eval_steps_per_second": 57.091, + "eval_loss": 2.4668779373168945, + "eval_runtime": 225.069, + "eval_samples_per_second": 913.098, + "eval_steps_per_second": 57.071, "step": 960000 }, { "epoch": 3.97, - "eval_loss": 2.812260627746582, - "eval_runtime": 225.6651, - "eval_samples_per_second": 910.686, - "eval_steps_per_second": 56.921, + "eval_loss": 2.4549825191497803, + "eval_runtime": 224.677, + "eval_samples_per_second": 914.691, + "eval_steps_per_second": 57.171, "step": 968000 }, { "epoch": 4.0, - "learning_rate": 5.936796464604353e-06, - "loss": 2.805, + "learning_rate": 2.4326666666666666e-07, + "loss": 2.5276, "step": 976000 }, { "epoch": 4.0, - "eval_loss": 2.829453468322754, - "eval_runtime": 224.9048, - "eval_samples_per_second": 913.764, - "eval_steps_per_second": 57.113, + "eval_loss": 2.466226577758789, + "eval_runtime": 224.6009, + "eval_samples_per_second": 915.001, + "eval_steps_per_second": 57.19, "step": 976000 }, { "epoch": 4.03, - "eval_loss": 2.8286798000335693, - "eval_runtime": 224.572, - "eval_samples_per_second": 915.119, - "eval_steps_per_second": 57.198, + "eval_loss": 2.464536190032959, + "eval_runtime": 224.76, + "eval_samples_per_second": 914.353, + "eval_steps_per_second": 57.15, "step": 984000 }, { "epoch": 4.06, - "learning_rate": 5.870090886350371e-06, - "loss": 2.7959, + "learning_rate": 2.405333333333333e-07, + "loss": 2.5206, "step": 992000 }, { "epoch": 4.06, - "eval_loss": 2.826554298400879, - "eval_runtime": 224.6548, - "eval_samples_per_second": 914.781, - "eval_steps_per_second": 57.177, + "eval_loss": 2.4586591720581055, + "eval_runtime": 225.6548, + "eval_samples_per_second": 910.727, + "eval_steps_per_second": 56.923, "step": 992000 }, { "epoch": 4.1, - "eval_loss": 2.844482898712158, - "eval_runtime": 225.9544, - "eval_samples_per_second": 909.52, - "eval_steps_per_second": 56.848, + "eval_loss": 2.47253680229187, + "eval_runtime": 225.9081, + "eval_samples_per_second": 909.706, + "eval_steps_per_second": 56.859, "step": 1000000 }, { "epoch": 4.13, - "learning_rate": 5.80338530809639e-06, - "loss": 2.8126, + "learning_rate": 2.3779999999999997e-07, + "loss": 2.5294, "step": 1008000 }, { "epoch": 4.13, - "eval_loss": 2.8259875774383545, - "eval_runtime": 226.7424, - "eval_samples_per_second": 906.359, - "eval_steps_per_second": 56.65, + "eval_loss": 2.458824634552002, + "eval_runtime": 224.8489, + "eval_samples_per_second": 913.991, + "eval_steps_per_second": 57.127, "step": 1008000 }, { "epoch": 4.16, - "eval_loss": 2.829375743865967, - "eval_runtime": 225.2876, - "eval_samples_per_second": 912.212, - "eval_steps_per_second": 57.016, + "eval_loss": 2.4591076374053955, + "eval_runtime": 225.8271, + "eval_samples_per_second": 910.032, + "eval_steps_per_second": 56.88, "step": 1016000 }, { "epoch": 4.2, - "learning_rate": 5.736679729842408e-06, - "loss": 2.8085, + "learning_rate": 2.3506666666666668e-07, + "loss": 2.5312, "step": 1024000 }, { "epoch": 4.2, - "eval_loss": 2.8393983840942383, - "eval_runtime": 224.8219, - "eval_samples_per_second": 914.101, - "eval_steps_per_second": 57.134, + "eval_loss": 2.4680891036987305, + "eval_runtime": 226.319, + "eval_samples_per_second": 908.055, + "eval_steps_per_second": 56.756, "step": 1024000 }, { "epoch": 4.23, - "eval_loss": 2.835977077484131, - "eval_runtime": 226.9491, - "eval_samples_per_second": 905.533, - "eval_steps_per_second": 56.599, + "eval_loss": 2.4624712467193604, + "eval_runtime": 226.6472, + "eval_samples_per_second": 906.74, + "eval_steps_per_second": 56.674, "step": 1032000 }, { "epoch": 4.26, - "learning_rate": 5.669974151588427e-06, - "loss": 2.8179, + "learning_rate": 2.3233333333333334e-07, + "loss": 2.525, "step": 1040000 }, { "epoch": 4.26, - "eval_loss": 2.8393590450286865, - "eval_runtime": 226.0543, - "eval_samples_per_second": 909.118, - "eval_steps_per_second": 56.823, + "eval_loss": 2.4659371376037598, + "eval_runtime": 225.5287, + "eval_samples_per_second": 911.237, + "eval_steps_per_second": 56.955, "step": 1040000 }, { "epoch": 4.29, - "eval_loss": 2.840928077697754, - "eval_runtime": 226.338, - "eval_samples_per_second": 907.978, - "eval_steps_per_second": 56.751, + "eval_loss": 2.460909843444824, + "eval_runtime": 225.5374, + "eval_samples_per_second": 911.201, + "eval_steps_per_second": 56.953, "step": 1048000 }, { "epoch": 4.33, - "learning_rate": 5.603268573334446e-06, - "loss": 2.8212, + "learning_rate": 2.2960000000000002e-07, + "loss": 2.5318, "step": 1056000 }, { "epoch": 4.33, - "eval_loss": 2.834209442138672, - "eval_runtime": 225.5106, - "eval_samples_per_second": 911.31, - "eval_steps_per_second": 56.96, + "eval_loss": 2.4571011066436768, + "eval_runtime": 225.1138, + "eval_samples_per_second": 912.916, + "eval_steps_per_second": 57.06, "step": 1056000 }, { "epoch": 4.36, - "eval_loss": 2.8416330814361572, - "eval_runtime": 226.856, - "eval_samples_per_second": 905.905, - "eval_steps_per_second": 56.622, + "eval_loss": 2.4581968784332275, + "eval_runtime": 226.7154, + "eval_samples_per_second": 906.467, + "eval_steps_per_second": 56.657, "step": 1064000 }, { "epoch": 4.39, - "learning_rate": 5.536562995080464e-06, - "loss": 2.8236, + "learning_rate": 2.2686666666666667e-07, + "loss": 2.5332, "step": 1072000 }, { "epoch": 4.39, - "eval_loss": 2.839299440383911, - "eval_runtime": 228.289, - "eval_samples_per_second": 900.218, - "eval_steps_per_second": 56.266, + "eval_loss": 2.456618547439575, + "eval_runtime": 225.422, + "eval_samples_per_second": 911.668, + "eval_steps_per_second": 56.982, "step": 1072000 }, { "epoch": 4.43, - "eval_loss": 2.8458597660064697, - "eval_runtime": 228.7211, - "eval_samples_per_second": 898.518, - "eval_steps_per_second": 56.16, + "eval_loss": 2.4587738513946533, + "eval_runtime": 226.533, + "eval_samples_per_second": 907.197, + "eval_steps_per_second": 56.703, "step": 1080000 }, { "epoch": 4.46, - "learning_rate": 5.469857416826483e-06, - "loss": 2.8134, + "learning_rate": 2.2413333333333333e-07, + "loss": 2.5168, "step": 1088000 }, { "epoch": 4.46, - "eval_loss": 2.844362497329712, - "eval_runtime": 227.78, - "eval_samples_per_second": 902.23, - "eval_steps_per_second": 56.392, + "eval_loss": 2.4606146812438965, + "eval_runtime": 226.9924, + "eval_samples_per_second": 905.361, + "eval_steps_per_second": 56.588, "step": 1088000 }, { "epoch": 4.49, - "eval_loss": 2.843454360961914, - "eval_runtime": 228.7834, - "eval_samples_per_second": 898.273, - "eval_steps_per_second": 56.145, + "eval_loss": 2.4597506523132324, + "eval_runtime": 228.0241, + "eval_samples_per_second": 901.264, + "eval_steps_per_second": 56.332, "step": 1096000 }, { "epoch": 4.52, - "learning_rate": 5.403151838572501e-06, - "loss": 2.8196, + "learning_rate": 2.214e-07, + "loss": 2.5181, "step": 1104000 }, { "epoch": 4.52, - "eval_loss": 2.848449945449829, - "eval_runtime": 228.2467, - "eval_samples_per_second": 900.385, - "eval_steps_per_second": 56.277, + "eval_loss": 2.454252004623413, + "eval_runtime": 228.6537, + "eval_samples_per_second": 898.783, + "eval_steps_per_second": 56.177, "step": 1104000 }, { "epoch": 4.56, - "eval_loss": 2.8537204265594482, - "eval_runtime": 231.8913, - "eval_samples_per_second": 886.234, - "eval_steps_per_second": 55.392, + "eval_loss": 2.4619953632354736, + "eval_runtime": 226.293, + "eval_samples_per_second": 908.159, + "eval_steps_per_second": 56.763, "step": 1112000 }, { "epoch": 4.59, - "learning_rate": 5.33644626031852e-06, - "loss": 2.8284, + "learning_rate": 2.1866666666666667e-07, + "loss": 2.5246, "step": 1120000 }, { "epoch": 4.59, - "eval_loss": 2.8540537357330322, - "eval_runtime": 230.1324, - "eval_samples_per_second": 893.008, - "eval_steps_per_second": 55.816, + "eval_loss": 2.4638657569885254, + "eval_runtime": 228.163, + "eval_samples_per_second": 900.716, + "eval_steps_per_second": 56.297, "step": 1120000 }, { "epoch": 4.62, - "eval_loss": 2.8400697708129883, - "eval_runtime": 230.0912, - "eval_samples_per_second": 893.167, - "eval_steps_per_second": 55.826, + "eval_loss": 2.4556171894073486, + "eval_runtime": 228.3656, + "eval_samples_per_second": 899.917, + "eval_steps_per_second": 56.248, "step": 1128000 }, { "epoch": 4.65, - "learning_rate": 5.269740682064538e-06, - "loss": 2.8349, + "learning_rate": 2.1593333333333332e-07, + "loss": 2.5318, "step": 1136000 }, { "epoch": 4.65, - "eval_loss": 2.847630262374878, - "eval_runtime": 227.545, - "eval_samples_per_second": 903.162, - "eval_steps_per_second": 56.45, + "eval_loss": 2.457075595855713, + "eval_runtime": 227.349, + "eval_samples_per_second": 903.941, + "eval_steps_per_second": 56.499, "step": 1136000 }, { "epoch": 4.69, - "eval_loss": 2.8476176261901855, - "eval_runtime": 227.3305, - "eval_samples_per_second": 904.014, - "eval_steps_per_second": 56.504, + "eval_loss": 2.4636013507843018, + "eval_runtime": 228.1517, + "eval_samples_per_second": 900.76, + "eval_steps_per_second": 56.3, "step": 1144000 }, { "epoch": 4.72, - "learning_rate": 5.203035103810556e-06, - "loss": 2.8171, + "learning_rate": 2.132e-07, + "loss": 2.512, "step": 1152000 }, { "epoch": 4.72, - "eval_loss": 2.843763589859009, - "eval_runtime": 232.1395, - "eval_samples_per_second": 885.287, - "eval_steps_per_second": 55.333, + "eval_loss": 2.4567556381225586, + "eval_runtime": 228.1066, + "eval_samples_per_second": 900.938, + "eval_steps_per_second": 56.311, "step": 1152000 }, { "epoch": 4.75, - "eval_loss": 2.853471040725708, - "eval_runtime": 229.6265, - "eval_samples_per_second": 894.975, - "eval_steps_per_second": 55.939, + "eval_loss": 2.4644010066986084, + "eval_runtime": 228.916, + "eval_samples_per_second": 897.753, + "eval_steps_per_second": 56.112, "step": 1160000 }, { "epoch": 4.79, - "learning_rate": 5.136329525556575e-06, - "loss": 2.8264, + "learning_rate": 2.1046666666666666e-07, + "loss": 2.5174, "step": 1168000 }, { "epoch": 4.79, - "eval_loss": 2.842829704284668, - "eval_runtime": 228.2848, - "eval_samples_per_second": 900.235, - "eval_steps_per_second": 56.267, + "eval_loss": 2.4528720378875732, + "eval_runtime": 228.2634, + "eval_samples_per_second": 900.32, + "eval_steps_per_second": 56.273, "step": 1168000 }, { "epoch": 4.82, - "eval_loss": 2.855221748352051, - "eval_runtime": 229.9347, - "eval_samples_per_second": 893.775, - "eval_steps_per_second": 55.864, + "eval_loss": 2.4613921642303467, + "eval_runtime": 228.3765, + "eval_samples_per_second": 899.874, + "eval_steps_per_second": 56.245, "step": 1176000 }, { "epoch": 4.85, - "learning_rate": 5.0696239473025935e-06, - "loss": 2.8335, + "learning_rate": 2.0773333333333334e-07, + "loss": 2.5196, "step": 1184000 }, { "epoch": 4.85, - "eval_loss": 2.857285261154175, - "eval_runtime": 229.2467, - "eval_samples_per_second": 896.458, - "eval_steps_per_second": 56.031, + "eval_loss": 2.463758707046509, + "eval_runtime": 227.0091, + "eval_samples_per_second": 905.294, + "eval_steps_per_second": 56.584, "step": 1184000 }, { "epoch": 4.88, - "eval_loss": 2.850496292114258, - "eval_runtime": 228.4453, - "eval_samples_per_second": 899.603, - "eval_steps_per_second": 56.228, + "eval_loss": 2.453406572341919, + "eval_runtime": 227.3538, + "eval_samples_per_second": 903.921, + "eval_steps_per_second": 56.498, "step": 1192000 }, { "epoch": 4.92, - "learning_rate": 5.002918369048611e-06, - "loss": 2.8351, + "learning_rate": 2.05e-07, + "loss": 2.5248, "step": 1200000 }, { "epoch": 4.92, - "eval_loss": 2.8511581420898438, - "eval_runtime": 228.7507, - "eval_samples_per_second": 898.401, - "eval_steps_per_second": 56.153, + "eval_loss": 2.4553115367889404, + "eval_runtime": 227.0142, + "eval_samples_per_second": 905.274, + "eval_steps_per_second": 56.582, "step": 1200000 }, { "epoch": 4.95, - "eval_loss": 2.8499553203582764, - "eval_runtime": 230.0891, - "eval_samples_per_second": 893.176, - "eval_steps_per_second": 55.826, + "eval_loss": 2.453683853149414, + "eval_runtime": 226.8684, + "eval_samples_per_second": 905.855, + "eval_steps_per_second": 56.619, "step": 1208000 }, { "epoch": 4.98, - "learning_rate": 4.936212790794631e-06, - "loss": 2.8366, + "learning_rate": 2.0226666666666668e-07, + "loss": 2.5201, "step": 1216000 }, { "epoch": 4.98, - "eval_loss": 2.8569633960723877, - "eval_runtime": 229.1877, - "eval_samples_per_second": 896.689, - "eval_steps_per_second": 56.046, + "eval_loss": 2.4578709602355957, + "eval_runtime": 226.9695, + "eval_samples_per_second": 905.452, + "eval_steps_per_second": 56.594, "step": 1216000 }, { "epoch": 5.02, - "eval_loss": 2.846999168395996, - "eval_runtime": 228.106, - "eval_samples_per_second": 900.941, - "eval_steps_per_second": 56.312, + "eval_loss": 2.4524765014648438, + "eval_runtime": 226.5657, + "eval_samples_per_second": 907.066, + "eval_steps_per_second": 56.694, "step": 1224000 }, { "epoch": 5.05, - "learning_rate": 4.869507212540649e-06, - "loss": 2.8257, + "learning_rate": 1.9953333333333333e-07, + "loss": 2.5164, "step": 1232000 }, { "epoch": 5.05, - "eval_loss": 2.8638033866882324, - "eval_runtime": 227.5474, - "eval_samples_per_second": 903.153, - "eval_steps_per_second": 56.45, + "eval_loss": 2.4645235538482666, + "eval_runtime": 227.3689, + "eval_samples_per_second": 903.861, + "eval_steps_per_second": 56.494, "step": 1232000 }, { "epoch": 5.08, - "eval_loss": 2.8511650562286377, - "eval_runtime": 227.331, - "eval_samples_per_second": 904.012, - "eval_steps_per_second": 56.504, + "eval_loss": 2.447993040084839, + "eval_runtime": 228.8072, + "eval_samples_per_second": 898.18, + "eval_steps_per_second": 56.139, "step": 1240000 }, { "epoch": 5.11, - "learning_rate": 4.802801634286667e-06, - "loss": 2.8351, + "learning_rate": 1.968e-07, + "loss": 2.5186, "step": 1248000 }, { "epoch": 5.11, - "eval_loss": 2.8640658855438232, - "eval_runtime": 227.1737, - "eval_samples_per_second": 904.638, - "eval_steps_per_second": 56.543, + "eval_loss": 2.4605581760406494, + "eval_runtime": 229.2086, + "eval_samples_per_second": 896.607, + "eval_steps_per_second": 56.041, "step": 1248000 }, { "epoch": 5.15, - "eval_loss": 2.8679511547088623, - "eval_runtime": 227.7339, - "eval_samples_per_second": 902.413, - "eval_steps_per_second": 56.404, + "eval_loss": 2.4623043537139893, + "eval_runtime": 229.8264, + "eval_samples_per_second": 894.197, + "eval_steps_per_second": 55.89, "step": 1256000 }, { "epoch": 5.18, - "learning_rate": 4.7360960560326865e-06, - "loss": 2.8272, + "learning_rate": 1.9406666666666667e-07, + "loss": 2.5123, "step": 1264000 }, { "epoch": 5.18, - "eval_loss": 2.8521032333374023, - "eval_runtime": 226.8252, - "eval_samples_per_second": 906.028, - "eval_steps_per_second": 56.63, + "eval_loss": 2.456583261489868, + "eval_runtime": 230.427, + "eval_samples_per_second": 891.866, + "eval_steps_per_second": 55.744, "step": 1264000 }, { "epoch": 5.21, - "eval_loss": 2.8616464138031006, - "eval_runtime": 226.0986, - "eval_samples_per_second": 908.94, - "eval_steps_per_second": 56.812, + "eval_loss": 2.464402437210083, + "eval_runtime": 229.1803, + "eval_samples_per_second": 896.717, + "eval_steps_per_second": 56.048, "step": 1272000 }, { "epoch": 5.24, - "learning_rate": 4.669390477778704e-06, - "loss": 2.8346, + "learning_rate": 1.9133333333333333e-07, + "loss": 2.5233, "step": 1280000 }, { "epoch": 5.24, - "eval_loss": 2.8545360565185547, - "eval_runtime": 226.6533, - "eval_samples_per_second": 906.715, - "eval_steps_per_second": 56.672, + "eval_loss": 2.457606792449951, + "eval_runtime": 227.9824, + "eval_samples_per_second": 901.429, + "eval_steps_per_second": 56.342, "step": 1280000 }, { "epoch": 5.28, - "eval_loss": 2.847703695297241, - "eval_runtime": 226.3926, - "eval_samples_per_second": 907.759, - "eval_steps_per_second": 56.738, + "eval_loss": 2.451943874359131, + "eval_runtime": 229.374, + "eval_samples_per_second": 895.96, + "eval_steps_per_second": 56.0, "step": 1288000 }, { "epoch": 5.31, - "learning_rate": 4.602684899524723e-06, - "loss": 2.8211, + "learning_rate": 1.886e-07, + "loss": 2.513, "step": 1296000 }, { "epoch": 5.31, - "eval_loss": 2.860240936279297, - "eval_runtime": 229.1022, - "eval_samples_per_second": 897.023, - "eval_steps_per_second": 56.067, + "eval_loss": 2.456979513168335, + "eval_runtime": 228.051, + "eval_samples_per_second": 901.158, + "eval_steps_per_second": 56.325, "step": 1296000 }, { "epoch": 5.34, - "eval_loss": 2.857428550720215, - "eval_runtime": 227.3841, - "eval_samples_per_second": 903.801, - "eval_steps_per_second": 56.49, + "eval_loss": 2.462719202041626, + "eval_runtime": 228.489, + "eval_samples_per_second": 899.43, + "eval_steps_per_second": 56.217, "step": 1304000 }, { "epoch": 5.38, - "learning_rate": 4.5359793212707415e-06, - "loss": 2.8302, + "learning_rate": 1.8586666666666666e-07, + "loss": 2.5226, "step": 1312000 }, { "epoch": 5.38, - "eval_loss": 2.8489556312561035, - "eval_runtime": 227.6538, - "eval_samples_per_second": 902.73, - "eval_steps_per_second": 56.423, + "eval_loss": 2.449977397918701, + "eval_runtime": 227.9952, + "eval_samples_per_second": 901.379, + "eval_steps_per_second": 56.339, "step": 1312000 }, { "epoch": 5.41, - "eval_loss": 2.8547353744506836, - "eval_runtime": 226.9648, - "eval_samples_per_second": 905.471, - "eval_steps_per_second": 56.595, + "eval_loss": 2.4563188552856445, + "eval_runtime": 227.2759, + "eval_samples_per_second": 904.231, + "eval_steps_per_second": 56.517, "step": 1320000 }, { "epoch": 5.44, - "learning_rate": 4.46927374301676e-06, - "loss": 2.8317, + "learning_rate": 1.8313333333333332e-07, + "loss": 2.5222, "step": 1328000 }, { "epoch": 5.44, - "eval_loss": 2.8535571098327637, - "eval_runtime": 228.1487, - "eval_samples_per_second": 900.772, - "eval_steps_per_second": 56.301, + "eval_loss": 2.4521265029907227, + "eval_runtime": 226.8418, + "eval_samples_per_second": 905.962, + "eval_steps_per_second": 56.625, "step": 1328000 }, { "epoch": 5.47, - "eval_loss": 2.855344295501709, - "eval_runtime": 227.3885, - "eval_samples_per_second": 903.783, - "eval_steps_per_second": 56.489, + "eval_loss": 2.4591453075408936, + "eval_runtime": 226.8374, + "eval_samples_per_second": 905.98, + "eval_steps_per_second": 56.626, "step": 1336000 }, { "epoch": 5.51, - "learning_rate": 4.402568164762779e-06, - "loss": 2.83, + "learning_rate": 1.804e-07, + "loss": 2.5191, "step": 1344000 }, { "epoch": 5.51, - "eval_loss": 2.853558301925659, - "eval_runtime": 226.8054, - "eval_samples_per_second": 906.107, - "eval_steps_per_second": 56.634, + "eval_loss": 2.4508602619171143, + "eval_runtime": 228.6931, + "eval_samples_per_second": 898.628, + "eval_steps_per_second": 56.167, "step": 1344000 }, { "epoch": 5.54, - "eval_loss": 2.8564646244049072, - "eval_runtime": 227.7928, - "eval_samples_per_second": 902.18, - "eval_steps_per_second": 56.389, + "eval_loss": 2.455850124359131, + "eval_runtime": 228.2295, + "eval_samples_per_second": 900.453, + "eval_steps_per_second": 56.281, "step": 1352000 }, { "epoch": 5.57, - "learning_rate": 4.335862586508797e-06, - "loss": 2.8347, + "learning_rate": 1.7766666666666666e-07, + "loss": 2.5243, "step": 1360000 }, { "epoch": 5.57, - "eval_loss": 2.8444929122924805, - "eval_runtime": 226.9232, - "eval_samples_per_second": 905.637, - "eval_steps_per_second": 56.605, + "eval_loss": 2.4501898288726807, + "eval_runtime": 228.7596, + "eval_samples_per_second": 898.367, + "eval_steps_per_second": 56.151, "step": 1360000 }, { "epoch": 5.61, - "eval_loss": 2.854001998901367, - "eval_runtime": 227.1661, - "eval_samples_per_second": 904.668, - "eval_steps_per_second": 56.545, + "eval_loss": 2.4514639377593994, + "eval_runtime": 227.2719, + "eval_samples_per_second": 904.247, + "eval_steps_per_second": 56.518, "step": 1368000 }, { "epoch": 5.64, - "learning_rate": 4.269157008254816e-06, - "loss": 2.8253, + "learning_rate": 1.7493333333333334e-07, + "loss": 2.5157, "step": 1376000 }, { "epoch": 5.64, - "eval_loss": 2.8630056381225586, - "eval_runtime": 228.1691, - "eval_samples_per_second": 900.692, - "eval_steps_per_second": 56.296, + "eval_loss": 2.4562854766845703, + "eval_runtime": 227.9532, + "eval_samples_per_second": 901.545, + "eval_steps_per_second": 56.349, "step": 1376000 }, { "epoch": 5.67, - "eval_loss": 2.8591771125793457, - "eval_runtime": 228.3462, - "eval_samples_per_second": 899.993, - "eval_steps_per_second": 56.252, + "eval_loss": 2.452606678009033, + "eval_runtime": 227.4532, + "eval_samples_per_second": 903.527, + "eval_steps_per_second": 56.473, "step": 1384000 }, { "epoch": 5.7, - "learning_rate": 4.202451430000834e-06, - "loss": 2.8237, + "learning_rate": 1.722e-07, + "loss": 2.5162, "step": 1392000 }, { "epoch": 5.7, - "eval_loss": 2.8634839057922363, - "eval_runtime": 227.2681, - "eval_samples_per_second": 904.262, - "eval_steps_per_second": 56.519, + "eval_loss": 2.458620071411133, + "eval_runtime": 228.2374, + "eval_samples_per_second": 900.422, + "eval_steps_per_second": 56.279, "step": 1392000 }, { "epoch": 5.74, - "eval_loss": 2.862060308456421, - "eval_runtime": 226.8521, - "eval_samples_per_second": 905.921, - "eval_steps_per_second": 56.623, + "eval_loss": 2.458387613296509, + "eval_runtime": 228.0105, + "eval_samples_per_second": 901.318, + "eval_steps_per_second": 56.335, "step": 1400000 }, { "epoch": 5.77, - "learning_rate": 4.135745851746852e-06, - "loss": 2.8364, + "learning_rate": 1.6946666666666668e-07, + "loss": 2.5169, "step": 1408000 }, { "epoch": 5.77, - "eval_loss": 2.854534864425659, - "eval_runtime": 228.0962, - "eval_samples_per_second": 900.98, - "eval_steps_per_second": 56.314, + "eval_loss": 2.454158067703247, + "eval_runtime": 227.4312, + "eval_samples_per_second": 903.614, + "eval_steps_per_second": 56.479, "step": 1408000 }, { "epoch": 5.8, - "eval_loss": 2.8681845664978027, - "eval_runtime": 228.9004, - "eval_samples_per_second": 897.814, - "eval_steps_per_second": 56.116, + "eval_loss": 2.460242986679077, + "eval_runtime": 228.5958, + "eval_samples_per_second": 899.01, + "eval_steps_per_second": 56.191, "step": 1416000 }, { "epoch": 5.84, - "learning_rate": 4.069040273492872e-06, - "loss": 2.8289, + "learning_rate": 1.6673333333333333e-07, + "loss": 2.5127, "step": 1424000 }, { "epoch": 5.84, - "eval_loss": 2.867513418197632, - "eval_runtime": 228.9233, - "eval_samples_per_second": 897.724, - "eval_steps_per_second": 56.111, + "eval_loss": 2.458707809448242, + "eval_runtime": 228.0452, + "eval_samples_per_second": 901.181, + "eval_steps_per_second": 56.327, "step": 1424000 }, { "epoch": 5.87, - "eval_loss": 2.859713077545166, - "eval_runtime": 228.0494, - "eval_samples_per_second": 901.164, - "eval_steps_per_second": 56.326, + "eval_loss": 2.452913284301758, + "eval_runtime": 227.4908, + "eval_samples_per_second": 903.377, + "eval_steps_per_second": 56.464, "step": 1432000 }, { "epoch": 5.9, - "learning_rate": 4.0023346952388895e-06, - "loss": 2.8327, + "learning_rate": 1.64e-07, + "loss": 2.5144, "step": 1440000 }, { "epoch": 5.9, - "eval_loss": 2.8727521896362305, - "eval_runtime": 228.1742, - "eval_samples_per_second": 900.672, - "eval_steps_per_second": 56.295, + "eval_loss": 2.462021827697754, + "eval_runtime": 229.4885, + "eval_samples_per_second": 895.513, + "eval_steps_per_second": 55.972, "step": 1440000 }, { "epoch": 5.93, - "eval_loss": 2.8643882274627686, - "eval_runtime": 228.4395, - "eval_samples_per_second": 899.626, - "eval_steps_per_second": 56.229, + "eval_loss": 2.450927972793579, + "eval_runtime": 227.9748, + "eval_samples_per_second": 901.459, + "eval_steps_per_second": 56.344, "step": 1448000 }, { "epoch": 5.97, - "learning_rate": 3.935629116984908e-06, - "loss": 2.8407, + "learning_rate": 1.6126666666666667e-07, + "loss": 2.5175, "step": 1456000 }, { "epoch": 5.97, - "eval_loss": 2.864025354385376, - "eval_runtime": 228.8691, - "eval_samples_per_second": 897.937, - "eval_steps_per_second": 56.124, + "eval_loss": 2.4503204822540283, + "eval_runtime": 227.5178, + "eval_samples_per_second": 903.27, + "eval_steps_per_second": 56.457, "step": 1456000 }, { "epoch": 6.0, - "eval_loss": 2.8669943809509277, - "eval_runtime": 227.4377, - "eval_samples_per_second": 903.588, - "eval_steps_per_second": 56.477, + "eval_loss": 2.4545462131500244, + "eval_runtime": 227.7963, + "eval_samples_per_second": 902.165, + "eval_steps_per_second": 56.388, "step": 1464000 }, { "epoch": 6.03, - "learning_rate": 3.868923538730927e-06, - "loss": 2.8349, + "learning_rate": 1.5853333333333332e-07, + "loss": 2.5147, "step": 1472000 }, { "epoch": 6.03, - "eval_loss": 2.8555285930633545, - "eval_runtime": 228.413, - "eval_samples_per_second": 899.73, - "eval_steps_per_second": 56.236, + "eval_loss": 2.4440090656280518, + "eval_runtime": 227.8162, + "eval_samples_per_second": 902.087, + "eval_steps_per_second": 56.383, "step": 1472000 }, { "epoch": 6.06, - "eval_loss": 2.8778481483459473, - "eval_runtime": 228.1335, - "eval_samples_per_second": 900.832, - "eval_steps_per_second": 56.305, + "eval_loss": 2.457670211791992, + "eval_runtime": 228.5245, + "eval_samples_per_second": 899.291, + "eval_steps_per_second": 56.208, "step": 1480000 }, { "epoch": 6.1, - "learning_rate": 3.8022179604769453e-06, - "loss": 2.8395, + "learning_rate": 1.558e-07, + "loss": 2.5128, "step": 1488000 }, { "epoch": 6.1, - "eval_loss": 2.8752517700195312, - "eval_runtime": 229.2602, - "eval_samples_per_second": 896.405, - "eval_steps_per_second": 56.028, + "eval_loss": 2.456602096557617, + "eval_runtime": 230.1502, + "eval_samples_per_second": 892.939, + "eval_steps_per_second": 55.811, "step": 1488000 }, { "epoch": 6.13, - "eval_loss": 2.865680694580078, - "eval_runtime": 228.1396, - "eval_samples_per_second": 900.808, - "eval_steps_per_second": 56.303, + "eval_loss": 2.449889659881592, + "eval_runtime": 228.3041, + "eval_samples_per_second": 900.159, + "eval_steps_per_second": 56.263, "step": 1496000 }, { "epoch": 6.16, - "learning_rate": 3.735512382222964e-06, - "loss": 2.8364, + "learning_rate": 1.5306666666666666e-07, + "loss": 2.5168, "step": 1504000 }, { "epoch": 6.16, - "eval_loss": 2.864389419555664, - "eval_runtime": 228.8163, - "eval_samples_per_second": 898.144, - "eval_steps_per_second": 56.137, + "eval_loss": 2.4480044841766357, + "eval_runtime": 228.2508, + "eval_samples_per_second": 900.369, + "eval_steps_per_second": 56.276, "step": 1504000 }, { "epoch": 6.2, - "eval_loss": 2.8669490814208984, - "eval_runtime": 230.4793, - "eval_samples_per_second": 891.663, - "eval_steps_per_second": 55.732, + "eval_loss": 2.4436299800872803, + "eval_runtime": 229.3638, + "eval_samples_per_second": 896.0, + "eval_steps_per_second": 56.003, "step": 1512000 }, { "epoch": 6.23, - "learning_rate": 3.668806803968982e-06, - "loss": 2.85, + "learning_rate": 1.5033333333333332e-07, + "loss": 2.5225, "step": 1520000 }, { "epoch": 6.23, - "eval_loss": 2.863649845123291, - "eval_runtime": 228.5034, - "eval_samples_per_second": 899.374, - "eval_steps_per_second": 56.214, + "eval_loss": 2.446739912033081, + "eval_runtime": 228.4899, + "eval_samples_per_second": 899.427, + "eval_steps_per_second": 56.217, "step": 1520000 }, { "epoch": 6.26, - "eval_loss": 2.868006944656372, - "eval_runtime": 228.7374, - "eval_samples_per_second": 898.454, - "eval_steps_per_second": 56.156, + "eval_loss": 2.4519920349121094, + "eval_runtime": 228.2075, + "eval_samples_per_second": 900.54, + "eval_steps_per_second": 56.286, "step": 1528000 }, { "epoch": 6.29, - "learning_rate": 3.6021012257150007e-06, - "loss": 2.8359, + "learning_rate": 1.476e-07, + "loss": 2.5135, "step": 1536000 }, { "epoch": 6.29, - "eval_loss": 2.8751797676086426, - "eval_runtime": 228.8775, - "eval_samples_per_second": 897.904, - "eval_steps_per_second": 56.122, + "eval_loss": 2.4535210132598877, + "eval_runtime": 228.7342, + "eval_samples_per_second": 898.466, + "eval_steps_per_second": 56.157, "step": 1536000 }, { "epoch": 6.33, - "eval_loss": 2.871007204055786, - "eval_runtime": 229.7636, - "eval_samples_per_second": 894.441, - "eval_steps_per_second": 55.905, + "eval_loss": 2.4462831020355225, + "eval_runtime": 229.9473, + "eval_samples_per_second": 893.727, + "eval_steps_per_second": 55.861, "step": 1544000 }, { "epoch": 6.36, - "learning_rate": 3.535395647461019e-06, - "loss": 2.8451, + "learning_rate": 1.4486666666666665e-07, + "loss": 2.5161, "step": 1552000 }, { "epoch": 6.36, - "eval_loss": 2.876699686050415, - "eval_runtime": 229.2961, - "eval_samples_per_second": 896.265, - "eval_steps_per_second": 56.019, + "eval_loss": 2.4556400775909424, + "eval_runtime": 228.5872, + "eval_samples_per_second": 899.044, + "eval_steps_per_second": 56.193, "step": 1552000 }, { "epoch": 6.39, - "eval_loss": 2.882432699203491, - "eval_runtime": 228.5189, - "eval_samples_per_second": 899.313, - "eval_steps_per_second": 56.21, + "eval_loss": 2.4604580402374268, + "eval_runtime": 229.1233, + "eval_samples_per_second": 896.941, + "eval_steps_per_second": 56.062, "step": 1560000 }, { "epoch": 6.43, - "learning_rate": 3.468690069207038e-06, - "loss": 2.8359, + "learning_rate": 1.4213333333333334e-07, + "loss": 2.5144, "step": 1568000 }, { "epoch": 6.43, - "eval_loss": 2.8723318576812744, - "eval_runtime": 229.3117, - "eval_samples_per_second": 896.204, - "eval_steps_per_second": 56.015, + "eval_loss": 2.4516451358795166, + "eval_runtime": 229.9726, + "eval_samples_per_second": 893.628, + "eval_steps_per_second": 55.854, "step": 1568000 }, { "epoch": 6.46, - "eval_loss": 2.87728214263916, - "eval_runtime": 230.0667, - "eval_samples_per_second": 893.263, - "eval_steps_per_second": 55.832, + "eval_loss": 2.4487648010253906, + "eval_runtime": 229.4253, + "eval_samples_per_second": 895.76, + "eval_steps_per_second": 55.988, "step": 1576000 }, { "epoch": 6.49, - "learning_rate": 3.4019844909530565e-06, - "loss": 2.8546, + "learning_rate": 1.3940000000000002e-07, + "loss": 2.5209, "step": 1584000 }, { "epoch": 6.49, - "eval_loss": 2.875946044921875, - "eval_runtime": 228.6285, - "eval_samples_per_second": 898.882, - "eval_steps_per_second": 56.183, + "eval_loss": 2.4525067806243896, + "eval_runtime": 228.8527, + "eval_samples_per_second": 898.001, + "eval_steps_per_second": 56.128, "step": 1584000 }, { "epoch": 6.52, - "eval_loss": 2.8732051849365234, - "eval_runtime": 231.6471, - "eval_samples_per_second": 887.168, - "eval_steps_per_second": 55.451, + "eval_loss": 2.450185537338257, + "eval_runtime": 230.8087, + "eval_samples_per_second": 890.391, + "eval_steps_per_second": 55.652, "step": 1592000 }, { "epoch": 6.56, - "learning_rate": 3.3352789126990747e-06, - "loss": 2.8395, + "learning_rate": 1.3666666666666665e-07, + "loss": 2.5102, "step": 1600000 }, { "epoch": 6.56, - "eval_loss": 2.8803162574768066, - "eval_runtime": 230.2291, - "eval_samples_per_second": 892.632, - "eval_steps_per_second": 55.792, + "eval_loss": 2.453780174255371, + "eval_runtime": 229.4733, + "eval_samples_per_second": 895.573, + "eval_steps_per_second": 55.976, "step": 1600000 }, { "epoch": 6.59, - "eval_loss": 2.8761353492736816, - "eval_runtime": 229.013, - "eval_samples_per_second": 897.373, - "eval_steps_per_second": 56.089, + "eval_loss": 2.4490787982940674, + "eval_runtime": 229.059, + "eval_samples_per_second": 897.192, + "eval_steps_per_second": 56.077, "step": 1608000 }, { "epoch": 6.62, - "learning_rate": 3.2685733344450933e-06, - "loss": 2.847, + "learning_rate": 1.3393333333333333e-07, + "loss": 2.5176, "step": 1616000 }, { "epoch": 6.62, - "eval_loss": 2.880068302154541, - "eval_runtime": 229.0344, - "eval_samples_per_second": 897.289, - "eval_steps_per_second": 56.083, + "eval_loss": 2.452752113342285, + "eval_runtime": 228.4962, + "eval_samples_per_second": 899.402, + "eval_steps_per_second": 56.215, "step": 1616000 }, { "epoch": 6.65, - "eval_loss": 2.873690366744995, - "eval_runtime": 228.9162, - "eval_samples_per_second": 897.752, - "eval_steps_per_second": 56.112, + "eval_loss": 2.44599986076355, + "eval_runtime": 228.5114, + "eval_samples_per_second": 899.342, + "eval_steps_per_second": 56.212, "step": 1624000 }, { "epoch": 6.69, - "learning_rate": 3.2018677561911115e-06, - "loss": 2.8555, + "learning_rate": 1.312e-07, + "loss": 2.5208, "step": 1632000 }, { "epoch": 6.69, - "eval_loss": 2.8797101974487305, - "eval_runtime": 228.8191, - "eval_samples_per_second": 898.133, - "eval_steps_per_second": 56.136, + "eval_loss": 2.4484992027282715, + "eval_runtime": 230.1605, + "eval_samples_per_second": 892.899, + "eval_steps_per_second": 55.809, "step": 1632000 }, { "epoch": 6.72, - "eval_loss": 2.8781728744506836, - "eval_runtime": 229.3564, - "eval_samples_per_second": 896.029, - "eval_steps_per_second": 56.005, + "eval_loss": 2.451284646987915, + "eval_runtime": 229.1401, + "eval_samples_per_second": 896.875, + "eval_steps_per_second": 56.057, "step": 1640000 }, { "epoch": 6.75, - "learning_rate": 3.1351621779371306e-06, - "loss": 2.8377, + "learning_rate": 1.2846666666666667e-07, + "loss": 2.5064, "step": 1648000 }, { "epoch": 6.75, - "eval_loss": 2.8825597763061523, - "eval_runtime": 229.3988, - "eval_samples_per_second": 895.863, - "eval_steps_per_second": 55.994, + "eval_loss": 2.451927900314331, + "eval_runtime": 229.3071, + "eval_samples_per_second": 896.222, + "eval_steps_per_second": 56.017, "step": 1648000 }, { "epoch": 6.79, - "eval_loss": 2.879844903945923, - "eval_runtime": 230.5279, - "eval_samples_per_second": 891.476, - "eval_steps_per_second": 55.72, + "eval_loss": 2.449305295944214, + "eval_runtime": 231.2204, + "eval_samples_per_second": 888.806, + "eval_steps_per_second": 55.553, "step": 1656000 }, { "epoch": 6.82, - "learning_rate": 3.0684565996831487e-06, - "loss": 2.8517, + "learning_rate": 1.2573333333333332e-07, + "loss": 2.5111, "step": 1664000 }, { "epoch": 6.82, - "eval_loss": 2.8799245357513428, - "eval_runtime": 230.1674, - "eval_samples_per_second": 892.872, - "eval_steps_per_second": 55.807, + "eval_loss": 2.4505178928375244, + "eval_runtime": 230.2462, + "eval_samples_per_second": 892.566, + "eval_steps_per_second": 55.788, "step": 1664000 }, { "epoch": 6.85, - "eval_loss": 2.8835196495056152, - "eval_runtime": 229.3852, - "eval_samples_per_second": 895.917, - "eval_steps_per_second": 55.998, + "eval_loss": 2.4501988887786865, + "eval_runtime": 229.973, + "eval_samples_per_second": 893.627, + "eval_steps_per_second": 55.854, "step": 1672000 }, { "epoch": 6.88, - "learning_rate": 3.0017510214291673e-06, - "loss": 2.8526, + "learning_rate": 1.23e-07, + "loss": 2.5141, "step": 1680000 }, { "epoch": 6.88, - "eval_loss": 2.8874735832214355, - "eval_runtime": 229.3731, - "eval_samples_per_second": 895.964, - "eval_steps_per_second": 56.0, + "eval_loss": 2.4560253620147705, + "eval_runtime": 229.6465, + "eval_samples_per_second": 894.897, + "eval_steps_per_second": 55.934, "step": 1680000 }, { "epoch": 6.92, - "eval_loss": 2.882905960083008, - "eval_runtime": 231.0041, - "eval_samples_per_second": 889.638, - "eval_steps_per_second": 55.605, + "eval_loss": 2.4499940872192383, + "eval_runtime": 229.0726, + "eval_samples_per_second": 897.139, + "eval_steps_per_second": 56.074, "step": 1688000 }, { "epoch": 6.95, - "learning_rate": 2.9350454431751855e-06, - "loss": 2.8511, + "learning_rate": 1.2026666666666666e-07, + "loss": 2.5089, "step": 1696000 }, { "epoch": 6.95, - "eval_loss": 2.8908419609069824, - "eval_runtime": 229.2389, - "eval_samples_per_second": 896.488, - "eval_steps_per_second": 56.033, + "eval_loss": 2.4512550830841064, + "eval_runtime": 228.4897, + "eval_samples_per_second": 899.428, + "eval_steps_per_second": 56.217, "step": 1696000 }, { "epoch": 6.98, - "eval_loss": 2.8755762577056885, - "eval_runtime": 230.9816, - "eval_samples_per_second": 889.724, - "eval_steps_per_second": 55.61, + "eval_loss": 2.4418201446533203, + "eval_runtime": 229.377, + "eval_samples_per_second": 895.949, + "eval_steps_per_second": 56.0, "step": 1704000 }, { "epoch": 7.02, - "learning_rate": 2.868339864921204e-06, - "loss": 2.8606, + "learning_rate": 1.1753333333333334e-07, + "loss": 2.5174, "step": 1712000 }, { "epoch": 7.02, - "eval_loss": 2.882680654525757, - "eval_runtime": 230.7858, - "eval_samples_per_second": 890.479, - "eval_steps_per_second": 55.658, + "eval_loss": 2.447690010070801, + "eval_runtime": 231.0137, + "eval_samples_per_second": 889.601, + "eval_steps_per_second": 55.603, "step": 1712000 }, { "epoch": 7.05, - "eval_loss": 2.884394407272339, - "eval_runtime": 230.0457, - "eval_samples_per_second": 893.344, - "eval_steps_per_second": 55.837, + "eval_loss": 2.450817584991455, + "eval_runtime": 231.6212, + "eval_samples_per_second": 887.268, + "eval_steps_per_second": 55.457, "step": 1720000 }, { "epoch": 7.08, - "learning_rate": 2.801634286667223e-06, - "loss": 2.852, + "learning_rate": 1.1480000000000001e-07, + "loss": 2.5198, "step": 1728000 }, { "epoch": 7.08, - "eval_loss": 2.8865275382995605, - "eval_runtime": 229.7673, - "eval_samples_per_second": 894.427, - "eval_steps_per_second": 55.904, + "eval_loss": 2.448648691177368, + "eval_runtime": 230.9308, + "eval_samples_per_second": 889.92, + "eval_steps_per_second": 55.623, "step": 1728000 }, { "epoch": 7.11, - "eval_loss": 2.8909904956817627, - "eval_runtime": 230.4684, - "eval_samples_per_second": 891.706, - "eval_steps_per_second": 55.734, + "eval_loss": 2.4577322006225586, + "eval_runtime": 230.4865, + "eval_samples_per_second": 891.636, + "eval_steps_per_second": 55.73, "step": 1736000 }, { "epoch": 7.15, - "learning_rate": 2.7349287084132413e-06, - "loss": 2.8319, + "learning_rate": 1.1206666666666666e-07, + "loss": 2.4974, "step": 1744000 }, { "epoch": 7.15, - "eval_loss": 2.884788751602173, - "eval_runtime": 229.622, - "eval_samples_per_second": 894.993, - "eval_steps_per_second": 55.94, + "eval_loss": 2.4416255950927734, + "eval_runtime": 229.8237, + "eval_samples_per_second": 894.207, + "eval_steps_per_second": 55.891, "step": 1744000 }, { "epoch": 7.18, - "eval_loss": 2.8916478157043457, - "eval_runtime": 231.4898, - "eval_samples_per_second": 887.771, - "eval_steps_per_second": 55.488, + "eval_loss": 2.4549336433410645, + "eval_runtime": 229.7571, + "eval_samples_per_second": 894.466, + "eval_steps_per_second": 55.907, "step": 1752000 }, { "epoch": 7.21, - "learning_rate": 2.66822313015926e-06, - "loss": 2.842, + "learning_rate": 1.0933333333333333e-07, + "loss": 2.5016, "step": 1760000 }, { "epoch": 7.21, - "eval_loss": 2.8830392360687256, - "eval_runtime": 230.1781, - "eval_samples_per_second": 892.83, - "eval_steps_per_second": 55.805, + "eval_loss": 2.455679416656494, + "eval_runtime": 230.1335, + "eval_samples_per_second": 893.003, + "eval_steps_per_second": 55.815, "step": 1760000 }, { "epoch": 7.24, - "eval_loss": 2.8850438594818115, - "eval_runtime": 230.145, - "eval_samples_per_second": 892.959, - "eval_steps_per_second": 55.813, + "eval_loss": 2.4531571865081787, + "eval_runtime": 231.3847, + "eval_samples_per_second": 888.175, + "eval_steps_per_second": 55.514, "step": 1768000 }, { "epoch": 7.28, - "learning_rate": 2.601517551905278e-06, - "loss": 2.8421, + "learning_rate": 1.066e-07, + "loss": 2.5112, "step": 1776000 }, { "epoch": 7.28, - "eval_loss": 2.875260591506958, - "eval_runtime": 230.4814, - "eval_samples_per_second": 891.655, - "eval_steps_per_second": 55.731, + "eval_loss": 2.445054531097412, + "eval_runtime": 231.2999, + "eval_samples_per_second": 888.5, + "eval_steps_per_second": 55.534, "step": 1776000 }, { "epoch": 7.31, - "eval_loss": 2.8958091735839844, - "eval_runtime": 230.6404, - "eval_samples_per_second": 891.041, - "eval_steps_per_second": 55.693, + "eval_loss": 2.460723638534546, + "eval_runtime": 230.196, + "eval_samples_per_second": 892.761, + "eval_steps_per_second": 55.8, "step": 1784000 }, { "epoch": 7.34, - "learning_rate": 2.5348119736512967e-06, - "loss": 2.8558, + "learning_rate": 1.0386666666666667e-07, + "loss": 2.5172, "step": 1792000 }, { "epoch": 7.34, - "eval_loss": 2.8712708950042725, - "eval_runtime": 229.8418, - "eval_samples_per_second": 894.137, - "eval_steps_per_second": 55.886, + "eval_loss": 2.4451537132263184, + "eval_runtime": 231.1406, + "eval_samples_per_second": 889.112, + "eval_steps_per_second": 55.572, "step": 1792000 }, { "epoch": 7.38, - "eval_loss": 2.8743996620178223, - "eval_runtime": 231.7487, - "eval_samples_per_second": 886.779, - "eval_steps_per_second": 55.426, + "eval_loss": 2.4426777362823486, + "eval_runtime": 230.7159, + "eval_samples_per_second": 890.749, + "eval_steps_per_second": 55.675, "step": 1800000 }, { "epoch": 7.41, - "learning_rate": 2.4681063953973154e-06, - "loss": 2.8382, + "learning_rate": 1.0113333333333334e-07, + "loss": 2.5089, "step": 1808000 }, { "epoch": 7.41, - "eval_loss": 2.890810966491699, - "eval_runtime": 230.3071, - "eval_samples_per_second": 892.33, - "eval_steps_per_second": 55.773, + "eval_loss": 2.4511077404022217, + "eval_runtime": 231.5975, + "eval_samples_per_second": 887.359, + "eval_steps_per_second": 55.463, "step": 1808000 }, { "epoch": 7.44, - "eval_loss": 2.8748860359191895, - "eval_runtime": 230.3829, - "eval_samples_per_second": 892.037, - "eval_steps_per_second": 55.755, + "eval_loss": 2.4440526962280273, + "eval_runtime": 231.4447, + "eval_samples_per_second": 887.944, + "eval_steps_per_second": 55.499, "step": 1816000 }, { "epoch": 7.47, - "learning_rate": 2.4014008171433335e-06, - "loss": 2.8508, + "learning_rate": 9.84e-08, + "loss": 2.5136, "step": 1824000 }, { "epoch": 7.47, - "eval_loss": 2.878951072692871, - "eval_runtime": 230.3626, - "eval_samples_per_second": 892.115, - "eval_steps_per_second": 55.76, + "eval_loss": 2.4492361545562744, + "eval_runtime": 231.7181, + "eval_samples_per_second": 886.896, + "eval_steps_per_second": 55.434, "step": 1824000 }, { "epoch": 7.51, - "eval_loss": 2.886600971221924, - "eval_runtime": 230.5556, - "eval_samples_per_second": 891.368, - "eval_steps_per_second": 55.713, + "eval_loss": 2.4523823261260986, + "eval_runtime": 231.3659, + "eval_samples_per_second": 888.247, + "eval_steps_per_second": 55.518, "step": 1832000 }, { "epoch": 7.54, - "learning_rate": 2.334695238889352e-06, - "loss": 2.8477, + "learning_rate": 9.566666666666666e-08, + "loss": 2.509, "step": 1840000 }, { "epoch": 7.54, - "eval_loss": 2.8806040287017822, - "eval_runtime": 230.3727, - "eval_samples_per_second": 892.076, - "eval_steps_per_second": 55.757, + "eval_loss": 2.451181411743164, + "eval_runtime": 230.8127, + "eval_samples_per_second": 890.376, + "eval_steps_per_second": 55.651, "step": 1840000 }, { "epoch": 7.57, - "eval_loss": 2.8820905685424805, - "eval_runtime": 230.4476, - "eval_samples_per_second": 891.786, - "eval_steps_per_second": 55.739, + "eval_loss": 2.4528069496154785, + "eval_runtime": 230.6096, + "eval_samples_per_second": 891.16, + "eval_steps_per_second": 55.7, "step": 1848000 }, { "epoch": 7.61, - "learning_rate": 2.2679896606353707e-06, - "loss": 2.8497, + "learning_rate": 9.293333333333333e-08, + "loss": 2.5157, "step": 1856000 }, { "epoch": 7.61, - "eval_loss": 2.877049684524536, - "eval_runtime": 230.7875, - "eval_samples_per_second": 890.473, - "eval_steps_per_second": 55.657, + "eval_loss": 2.4439537525177, + "eval_runtime": 233.382, + "eval_samples_per_second": 880.573, + "eval_steps_per_second": 55.039, "step": 1856000 }, { "epoch": 7.64, - "eval_loss": 2.873206377029419, - "eval_runtime": 230.6269, - "eval_samples_per_second": 891.093, - "eval_steps_per_second": 55.696, + "eval_loss": 2.4401602745056152, + "eval_runtime": 231.584, + "eval_samples_per_second": 887.41, + "eval_steps_per_second": 55.466, "step": 1864000 }, { "epoch": 7.67, - "learning_rate": 2.2012840823813894e-06, - "loss": 2.8566, + "learning_rate": 9.02e-08, + "loss": 2.5181, "step": 1872000 }, { "epoch": 7.67, - "eval_loss": 2.887915849685669, - "eval_runtime": 230.6589, - "eval_samples_per_second": 890.969, - "eval_steps_per_second": 55.688, + "eval_loss": 2.4537830352783203, + "eval_runtime": 230.4518, + "eval_samples_per_second": 891.77, + "eval_steps_per_second": 55.738, "step": 1872000 }, { "epoch": 7.7, - "eval_loss": 2.876028537750244, - "eval_runtime": 230.4372, - "eval_samples_per_second": 891.826, - "eval_steps_per_second": 55.742, + "eval_loss": 2.4480724334716797, + "eval_runtime": 229.9532, + "eval_samples_per_second": 893.703, + "eval_steps_per_second": 55.859, "step": 1880000 }, { "epoch": 7.74, - "learning_rate": 2.134578504127408e-06, - "loss": 2.8527, + "learning_rate": 8.746666666666667e-08, + "loss": 2.5145, "step": 1888000 }, { "epoch": 7.74, - "eval_loss": 2.8763537406921387, - "eval_runtime": 232.1508, - "eval_samples_per_second": 885.244, - "eval_steps_per_second": 55.33, + "eval_loss": 2.4417428970336914, + "eval_runtime": 231.464, + "eval_samples_per_second": 887.87, + "eval_steps_per_second": 55.495, "step": 1888000 }, { "epoch": 7.77, - "eval_loss": 2.8837802410125732, - "eval_runtime": 231.7421, - "eval_samples_per_second": 886.805, - "eval_steps_per_second": 55.428, + "eval_loss": 2.4512147903442383, + "eval_runtime": 231.0711, + "eval_samples_per_second": 889.38, + "eval_steps_per_second": 55.589, "step": 1896000 }, { "epoch": 7.8, - "learning_rate": 2.067872925873426e-06, - "loss": 2.8438, + "learning_rate": 8.473333333333334e-08, + "loss": 2.5013, "step": 1904000 }, { "epoch": 7.8, - "eval_loss": 2.8954691886901855, - "eval_runtime": 231.1122, - "eval_samples_per_second": 889.222, - "eval_steps_per_second": 55.579, + "eval_loss": 2.45603084564209, + "eval_runtime": 231.877, + "eval_samples_per_second": 886.289, + "eval_steps_per_second": 55.396, "step": 1904000 }, { "epoch": 7.83, - "eval_loss": 2.88919997215271, - "eval_runtime": 232.0892, - "eval_samples_per_second": 885.478, - "eval_steps_per_second": 55.345, + "eval_loss": 2.4508955478668213, + "eval_runtime": 230.4147, + "eval_samples_per_second": 891.913, + "eval_steps_per_second": 55.747, "step": 1912000 }, { "epoch": 7.87, - "learning_rate": 2.0011673476194448e-06, - "loss": 2.8422, + "learning_rate": 8.2e-08, + "loss": 2.5064, "step": 1920000 }, { "epoch": 7.87, - "eval_loss": 2.883701801300049, - "eval_runtime": 231.4957, - "eval_samples_per_second": 887.749, - "eval_steps_per_second": 55.487, + "eval_loss": 2.447256565093994, + "eval_runtime": 231.4505, + "eval_samples_per_second": 887.922, + "eval_steps_per_second": 55.498, "step": 1920000 }, { "epoch": 7.9, - "eval_loss": 2.89695143699646, - "eval_runtime": 230.9788, - "eval_samples_per_second": 889.735, - "eval_steps_per_second": 55.611, + "eval_loss": 2.457575559616089, + "eval_runtime": 232.2387, + "eval_samples_per_second": 884.908, + "eval_steps_per_second": 55.309, "step": 1928000 }, { "epoch": 7.93, - "learning_rate": 1.9344617693654634e-06, - "loss": 2.8521, + "learning_rate": 7.926666666666666e-08, + "loss": 2.5068, "step": 1936000 }, { "epoch": 7.93, - "eval_loss": 2.8805034160614014, - "eval_runtime": 231.7434, - "eval_samples_per_second": 886.8, - "eval_steps_per_second": 55.428, + "eval_loss": 2.4460949897766113, + "eval_runtime": 230.6448, + "eval_samples_per_second": 891.024, + "eval_steps_per_second": 55.692, "step": 1936000 }, { "epoch": 7.97, - "eval_loss": 2.8818843364715576, - "eval_runtime": 230.9357, - "eval_samples_per_second": 889.901, - "eval_steps_per_second": 55.622, + "eval_loss": 2.4451067447662354, + "eval_runtime": 231.6713, + "eval_samples_per_second": 887.076, + "eval_steps_per_second": 55.445, "step": 1944000 }, { "epoch": 8.0, - "learning_rate": 1.867756191111482e-06, - "loss": 2.8562, + "learning_rate": 7.653333333333333e-08, + "loss": 2.5152, "step": 1952000 }, { "epoch": 8.0, - "eval_loss": 2.877063751220703, - "eval_runtime": 230.6656, - "eval_samples_per_second": 890.943, - "eval_steps_per_second": 55.687, + "eval_loss": 2.442117214202881, + "eval_runtime": 231.4315, + "eval_samples_per_second": 887.995, + "eval_steps_per_second": 55.502, "step": 1952000 }, { "epoch": 8.03, - "eval_loss": 2.881901502609253, - "eval_runtime": 231.9508, - "eval_samples_per_second": 886.007, - "eval_steps_per_second": 55.378, + "eval_loss": 2.4458179473876953, + "eval_runtime": 230.6413, + "eval_samples_per_second": 891.037, + "eval_steps_per_second": 55.693, "step": 1960000 }, { "epoch": 8.06, - "learning_rate": 1.8010506128575004e-06, - "loss": 2.8417, + "learning_rate": 7.38e-08, + "loss": 2.5025, "step": 1968000 }, { "epoch": 8.06, - "eval_loss": 2.883180618286133, - "eval_runtime": 231.1061, - "eval_samples_per_second": 889.245, - "eval_steps_per_second": 55.581, + "eval_loss": 2.4532368183135986, + "eval_runtime": 230.9812, + "eval_samples_per_second": 889.726, + "eval_steps_per_second": 55.611, "step": 1968000 }, { "epoch": 8.1, - "eval_loss": 2.8927578926086426, - "eval_runtime": 232.292, - "eval_samples_per_second": 884.705, - "eval_steps_per_second": 55.297, + "eval_loss": 2.4541139602661133, + "eval_runtime": 231.1965, + "eval_samples_per_second": 888.898, + "eval_steps_per_second": 55.559, "step": 1976000 }, { "epoch": 8.13, - "learning_rate": 1.734345034603519e-06, - "loss": 2.8493, + "learning_rate": 7.106666666666667e-08, + "loss": 2.5151, "step": 1984000 }, { "epoch": 8.13, - "eval_loss": 2.8891210556030273, - "eval_runtime": 232.2356, - "eval_samples_per_second": 884.92, - "eval_steps_per_second": 55.31, + "eval_loss": 2.4499058723449707, + "eval_runtime": 231.2124, + "eval_samples_per_second": 888.836, + "eval_steps_per_second": 55.555, "step": 1984000 }, { "epoch": 8.16, - "eval_loss": 2.8862810134887695, - "eval_runtime": 233.5203, - "eval_samples_per_second": 880.052, - "eval_steps_per_second": 55.006, + "eval_loss": 2.4501264095306396, + "eval_runtime": 231.2241, + "eval_samples_per_second": 888.791, + "eval_steps_per_second": 55.552, "step": 1992000 }, { "epoch": 8.2, - "learning_rate": 1.6676394563495374e-06, - "loss": 2.8549, + "learning_rate": 6.833333333333332e-08, + "loss": 2.5138, "step": 2000000 }, { "epoch": 8.2, - "eval_loss": 2.876460313796997, - "eval_runtime": 232.4349, - "eval_samples_per_second": 884.162, - "eval_steps_per_second": 55.263, + "eval_loss": 2.444784641265869, + "eval_runtime": 231.6831, + "eval_samples_per_second": 887.031, + "eval_steps_per_second": 55.442, "step": 2000000 }, { "epoch": 8.23, - "eval_loss": 2.892146110534668, - "eval_runtime": 232.392, - "eval_samples_per_second": 884.325, - "eval_steps_per_second": 55.273, + "eval_loss": 2.4562456607818604, + "eval_runtime": 231.974, + "eval_samples_per_second": 885.918, + "eval_steps_per_second": 55.373, "step": 2008000 }, { "epoch": 8.26, - "learning_rate": 1.6009338780955558e-06, - "loss": 2.8421, + "learning_rate": 6.56e-08, + "loss": 2.5039, "step": 2016000 }, { "epoch": 8.26, - "eval_loss": 2.897282600402832, - "eval_runtime": 234.5662, - "eval_samples_per_second": 876.128, - "eval_steps_per_second": 54.761, + "eval_loss": 2.4612646102905273, + "eval_runtime": 234.4229, + "eval_samples_per_second": 876.663, + "eval_steps_per_second": 54.794, "step": 2016000 }, { "epoch": 8.29, - "eval_loss": 2.884657621383667, - "eval_runtime": 232.2059, - "eval_samples_per_second": 885.033, - "eval_steps_per_second": 55.317, + "eval_loss": 2.4471163749694824, + "eval_runtime": 233.3806, + "eval_samples_per_second": 880.579, + "eval_steps_per_second": 55.039, "step": 2024000 }, { "epoch": 8.33, - "learning_rate": 1.5342282998415744e-06, - "loss": 2.8451, + "learning_rate": 6.286666666666666e-08, + "loss": 2.5055, "step": 2032000 }, { "epoch": 8.33, - "eval_loss": 2.885942220687866, - "eval_runtime": 236.2429, - "eval_samples_per_second": 869.91, - "eval_steps_per_second": 54.372, + "eval_loss": 2.445026159286499, + "eval_runtime": 233.3418, + "eval_samples_per_second": 880.725, + "eval_steps_per_second": 55.048, "step": 2032000 }, { "epoch": 8.36, - "eval_loss": 2.8866655826568604, - "eval_runtime": 233.4019, - "eval_samples_per_second": 880.498, - "eval_steps_per_second": 55.034, + "eval_loss": 2.4492921829223633, + "eval_runtime": 232.3875, + "eval_samples_per_second": 884.342, + "eval_steps_per_second": 55.274, "step": 2040000 }, { "epoch": 8.39, - "learning_rate": 1.4675227215875928e-06, - "loss": 2.8465, + "learning_rate": 6.013333333333333e-08, + "loss": 2.5085, "step": 2048000 }, { "epoch": 8.39, - "eval_loss": 2.8853070735931396, - "eval_runtime": 233.5685, - "eval_samples_per_second": 879.87, - "eval_steps_per_second": 54.995, + "eval_loss": 2.448164224624634, + "eval_runtime": 233.4578, + "eval_samples_per_second": 880.288, + "eval_steps_per_second": 55.021, "step": 2048000 }, { "epoch": 8.42, - "eval_loss": 2.885338306427002, - "eval_runtime": 234.5164, - "eval_samples_per_second": 876.314, - "eval_steps_per_second": 54.772, + "eval_loss": 2.4571895599365234, + "eval_runtime": 235.4355, + "eval_samples_per_second": 872.893, + "eval_steps_per_second": 54.558, "step": 2056000 }, { "epoch": 8.46, - "learning_rate": 1.4008171433336116e-06, - "loss": 2.8516, + "learning_rate": 5.7400000000000004e-08, + "loss": 2.5114, "step": 2064000 }, { "epoch": 8.46, - "eval_loss": 2.8797264099121094, - "eval_runtime": 232.9203, - "eval_samples_per_second": 882.319, - "eval_steps_per_second": 55.148, + "eval_loss": 2.444307804107666, + "eval_runtime": 234.4924, + "eval_samples_per_second": 876.404, + "eval_steps_per_second": 54.778, "step": 2064000 }, { "epoch": 8.49, - "eval_loss": 2.882495880126953, - "eval_runtime": 232.5052, - "eval_samples_per_second": 883.894, - "eval_steps_per_second": 55.246, + "eval_loss": 2.445603132247925, + "eval_runtime": 234.6223, + "eval_samples_per_second": 875.919, + "eval_steps_per_second": 54.748, "step": 2072000 }, { "epoch": 8.52, - "learning_rate": 1.33411156507963e-06, - "loss": 2.8519, + "learning_rate": 5.4666666666666666e-08, + "loss": 2.5132, "step": 2080000 }, { "epoch": 8.52, - "eval_loss": 2.8863439559936523, - "eval_runtime": 232.936, - "eval_samples_per_second": 882.26, - "eval_steps_per_second": 55.144, + "eval_loss": 2.4528441429138184, + "eval_runtime": 234.3887, + "eval_samples_per_second": 876.791, + "eval_steps_per_second": 54.802, "step": 2080000 }, { "epoch": 8.56, - "eval_loss": 2.8823108673095703, - "eval_runtime": 234.3106, - "eval_samples_per_second": 877.084, - "eval_steps_per_second": 54.82, + "eval_loss": 2.449744939804077, + "eval_runtime": 233.1003, + "eval_samples_per_second": 881.638, + "eval_steps_per_second": 55.105, "step": 2088000 }, { "epoch": 8.59, - "learning_rate": 1.2674059868256484e-06, - "loss": 2.8454, + "learning_rate": 5.1933333333333335e-08, + "loss": 2.5072, "step": 2096000 }, { "epoch": 8.59, - "eval_loss": 2.887014865875244, - "eval_runtime": 236.1124, - "eval_samples_per_second": 870.391, - "eval_steps_per_second": 54.402, + "eval_loss": 2.4547877311706543, + "eval_runtime": 232.2237, + "eval_samples_per_second": 884.966, + "eval_steps_per_second": 55.313, "step": 2096000 }, { "epoch": 8.62, - "eval_loss": 2.8897511959075928, - "eval_runtime": 234.9051, - "eval_samples_per_second": 874.864, - "eval_steps_per_second": 54.682, + "eval_loss": 2.4547617435455322, + "eval_runtime": 232.0067, + "eval_samples_per_second": 885.794, + "eval_steps_per_second": 55.365, "step": 2104000 }, { "epoch": 8.65, - "learning_rate": 1.2007004085716668e-06, - "loss": 2.8428, + "learning_rate": 4.92e-08, + "loss": 2.504, "step": 2112000 }, { "epoch": 8.65, - "eval_loss": 2.8753533363342285, - "eval_runtime": 235.2777, - "eval_samples_per_second": 873.479, - "eval_steps_per_second": 54.595, + "eval_loss": 2.444261312484741, + "eval_runtime": 232.4079, + "eval_samples_per_second": 884.264, + "eval_steps_per_second": 55.269, "step": 2112000 }, { "epoch": 8.69, - "eval_loss": 2.8771519660949707, - "eval_runtime": 234.9372, - "eval_samples_per_second": 874.744, - "eval_steps_per_second": 54.674, + "eval_loss": 2.445204734802246, + "eval_runtime": 233.2645, + "eval_samples_per_second": 881.017, + "eval_steps_per_second": 55.066, "step": 2120000 }, { "epoch": 8.72, - "learning_rate": 1.1339948303176854e-06, - "loss": 2.85, + "learning_rate": 4.6466666666666666e-08, + "loss": 2.5128, "step": 2128000 }, { "epoch": 8.72, - "eval_loss": 2.8815581798553467, - "eval_runtime": 233.3269, - "eval_samples_per_second": 880.781, - "eval_steps_per_second": 55.052, + "eval_loss": 2.4509565830230713, + "eval_runtime": 233.1857, + "eval_samples_per_second": 881.315, + "eval_steps_per_second": 55.085, "step": 2128000 }, { "epoch": 8.75, - "eval_loss": 2.8723196983337402, - "eval_runtime": 233.3963, - "eval_samples_per_second": 880.52, - "eval_steps_per_second": 55.035, + "eval_loss": 2.447999954223633, + "eval_runtime": 233.2452, + "eval_samples_per_second": 881.09, + "eval_steps_per_second": 55.071, "step": 2136000 }, { "epoch": 8.79, - "learning_rate": 1.067289252063704e-06, - "loss": 2.8482, + "learning_rate": 4.3733333333333335e-08, + "loss": 2.5133, "step": 2144000 }, { "epoch": 8.79, - "eval_loss": 2.8833649158477783, - "eval_runtime": 233.2426, - "eval_samples_per_second": 881.1, - "eval_steps_per_second": 55.071, + "eval_loss": 2.4470479488372803, + "eval_runtime": 234.9529, + "eval_samples_per_second": 874.686, + "eval_steps_per_second": 54.671, "step": 2144000 }, { "epoch": 8.82, - "eval_loss": 2.8784103393554688, - "eval_runtime": 232.8953, - "eval_samples_per_second": 882.414, - "eval_steps_per_second": 55.154, + "eval_loss": 2.4436631202697754, + "eval_runtime": 234.9836, + "eval_samples_per_second": 874.572, + "eval_steps_per_second": 54.663, "step": 2152000 }, { "epoch": 8.85, - "learning_rate": 1.0005836738097224e-06, - "loss": 2.8417, + "learning_rate": 4.1e-08, + "loss": 2.5067, "step": 2160000 }, { "epoch": 8.85, - "eval_loss": 2.8758938312530518, - "eval_runtime": 233.0635, - "eval_samples_per_second": 881.777, - "eval_steps_per_second": 55.114, + "eval_loss": 2.444672107696533, + "eval_runtime": 234.3233, + "eval_samples_per_second": 877.036, + "eval_steps_per_second": 54.817, "step": 2160000 }, { "epoch": 8.88, - "eval_loss": 2.881685256958008, - "eval_runtime": 232.8737, - "eval_samples_per_second": 882.495, - "eval_steps_per_second": 55.159, + "eval_loss": 2.453118085861206, + "eval_runtime": 233.5384, + "eval_samples_per_second": 879.984, + "eval_steps_per_second": 55.002, "step": 2168000 }, { "epoch": 8.92, - "learning_rate": 9.33878095555741e-07, - "loss": 2.8336, + "learning_rate": 3.8266666666666665e-08, + "loss": 2.4996, "step": 2176000 }, { "epoch": 8.92, - "eval_loss": 2.8810818195343018, - "eval_runtime": 233.1976, - "eval_samples_per_second": 881.27, - "eval_steps_per_second": 55.082, + "eval_loss": 2.447479009628296, + "eval_runtime": 235.7844, + "eval_samples_per_second": 871.601, + "eval_steps_per_second": 54.478, "step": 2176000 }, { "epoch": 8.95, - "eval_loss": 2.8726937770843506, - "eval_runtime": 232.1542, - "eval_samples_per_second": 885.231, - "eval_steps_per_second": 55.33, + "eval_loss": 2.4438347816467285, + "eval_runtime": 233.6193, + "eval_samples_per_second": 879.679, + "eval_steps_per_second": 54.983, "step": 2184000 }, { "epoch": 8.98, - "learning_rate": 8.671725173017595e-07, - "loss": 2.8514, + "learning_rate": 3.5533333333333334e-08, + "loss": 2.5123, "step": 2192000 }, { "epoch": 8.98, - "eval_loss": 2.889397621154785, - "eval_runtime": 232.9383, - "eval_samples_per_second": 882.251, - "eval_steps_per_second": 55.143, + "eval_loss": 2.4552195072174072, + "eval_runtime": 235.201, + "eval_samples_per_second": 873.763, + "eval_steps_per_second": 54.613, "step": 2192000 }, { "epoch": 9.01, - "eval_loss": 2.8750553131103516, - "eval_runtime": 234.1088, - "eval_samples_per_second": 877.84, - "eval_steps_per_second": 54.868, + "eval_loss": 2.4441311359405518, + "eval_runtime": 234.6948, + "eval_samples_per_second": 875.648, + "eval_steps_per_second": 54.731, "step": 2200000 }, { "epoch": 9.05, - "learning_rate": 8.004669390477779e-07, - "loss": 2.8312, + "learning_rate": 3.28e-08, + "loss": 2.5044, "step": 2208000 }, { "epoch": 9.05, - "eval_loss": 2.877980947494507, - "eval_runtime": 234.0757, - "eval_samples_per_second": 877.964, - "eval_steps_per_second": 54.875, + "eval_loss": 2.4438366889953613, + "eval_runtime": 233.1145, + "eval_samples_per_second": 881.584, + "eval_steps_per_second": 55.102, "step": 2208000 }, { "epoch": 9.08, - "eval_loss": 2.8862829208374023, - "eval_runtime": 232.0198, - "eval_samples_per_second": 885.743, - "eval_steps_per_second": 55.362, + "eval_loss": 2.453371286392212, + "eval_runtime": 234.9783, + "eval_samples_per_second": 874.592, + "eval_steps_per_second": 54.665, "step": 2216000 }, { "epoch": 9.11, - "learning_rate": 7.337613607937964e-07, - "loss": 2.8315, + "learning_rate": 3.0066666666666665e-08, + "loss": 2.5068, "step": 2224000 }, { "epoch": 9.11, - "eval_loss": 2.881166696548462, - "eval_runtime": 232.6004, - "eval_samples_per_second": 883.533, - "eval_steps_per_second": 55.223, + "eval_loss": 2.449671745300293, + "eval_runtime": 232.7881, + "eval_samples_per_second": 882.82, + "eval_steps_per_second": 55.179, "step": 2224000 }, { "epoch": 9.15, - "eval_loss": 2.871530532836914, - "eval_runtime": 233.3854, - "eval_samples_per_second": 880.561, - "eval_steps_per_second": 55.038, + "eval_loss": 2.444044828414917, + "eval_runtime": 233.2255, + "eval_samples_per_second": 881.164, + "eval_steps_per_second": 55.075, "step": 2232000 }, { "epoch": 9.18, - "learning_rate": 6.67055782539815e-07, - "loss": 2.8509, + "learning_rate": 2.7333333333333333e-08, + "loss": 2.5165, "step": 2240000 }, { "epoch": 9.18, - "eval_loss": 2.890825033187866, - "eval_runtime": 233.6047, - "eval_samples_per_second": 879.734, - "eval_steps_per_second": 54.986, + "eval_loss": 2.457695722579956, + "eval_runtime": 234.2621, + "eval_samples_per_second": 877.265, + "eval_steps_per_second": 54.832, "step": 2240000 }, { "epoch": 9.21, - "eval_loss": 2.880777359008789, - "eval_runtime": 232.8121, - "eval_samples_per_second": 882.729, - "eval_steps_per_second": 55.173, + "eval_loss": 2.4506990909576416, + "eval_runtime": 232.4001, + "eval_samples_per_second": 884.294, + "eval_steps_per_second": 55.271, "step": 2248000 }, { "epoch": 9.24, - "learning_rate": 6.003502042858334e-07, - "loss": 2.8394, + "learning_rate": 2.46e-08, + "loss": 2.5087, "step": 2256000 }, { "epoch": 9.24, - "eval_loss": 2.8801651000976562, - "eval_runtime": 235.058, - "eval_samples_per_second": 874.295, - "eval_steps_per_second": 54.646, + "eval_loss": 2.4494166374206543, + "eval_runtime": 233.0606, + "eval_samples_per_second": 881.788, + "eval_steps_per_second": 55.114, "step": 2256000 }, { "epoch": 9.28, - "eval_loss": 2.8692407608032227, - "eval_runtime": 234.8843, - "eval_samples_per_second": 874.941, - "eval_steps_per_second": 54.686, + "eval_loss": 2.4393150806427, + "eval_runtime": 234.1923, + "eval_samples_per_second": 877.527, + "eval_steps_per_second": 54.848, "step": 2264000 }, { "epoch": 9.31, - "learning_rate": 5.33644626031852e-07, - "loss": 2.8332, + "learning_rate": 2.1866666666666667e-08, + "loss": 2.5036, "step": 2272000 }, { "epoch": 9.31, - "eval_loss": 2.871225357055664, - "eval_runtime": 233.9349, - "eval_samples_per_second": 878.492, - "eval_steps_per_second": 54.908, + "eval_loss": 2.4486756324768066, + "eval_runtime": 233.5876, + "eval_samples_per_second": 879.798, + "eval_steps_per_second": 54.99, "step": 2272000 }, { "epoch": 9.34, - "eval_loss": 2.8688416481018066, - "eval_runtime": 233.3243, - "eval_samples_per_second": 880.791, - "eval_steps_per_second": 55.052, + "eval_loss": 2.442298173904419, + "eval_runtime": 233.1053, + "eval_samples_per_second": 881.619, + "eval_steps_per_second": 55.104, "step": 2280000 }, { "epoch": 9.38, - "learning_rate": 4.669390477778705e-07, - "loss": 2.837, + "learning_rate": 1.9133333333333333e-08, + "loss": 2.5086, "step": 2288000 }, { "epoch": 9.38, - "eval_loss": 2.8778717517852783, - "eval_runtime": 233.117, - "eval_samples_per_second": 881.574, - "eval_steps_per_second": 55.101, + "eval_loss": 2.4455623626708984, + "eval_runtime": 232.7856, + "eval_samples_per_second": 882.83, + "eval_steps_per_second": 55.18, "step": 2288000 }, { "epoch": 9.41, - "eval_loss": 2.879408597946167, - "eval_runtime": 233.2294, - "eval_samples_per_second": 881.149, - "eval_steps_per_second": 55.075, + "eval_loss": 2.449575185775757, + "eval_runtime": 234.5471, + "eval_samples_per_second": 876.199, + "eval_steps_per_second": 54.765, "step": 2296000 }, { "epoch": 9.44, - "learning_rate": 4.0023346952388894e-07, - "loss": 2.8344, + "learning_rate": 1.64e-08, + "loss": 2.5034, "step": 2304000 }, { "epoch": 9.44, - "eval_loss": 2.8751115798950195, - "eval_runtime": 233.3516, - "eval_samples_per_second": 880.688, - "eval_steps_per_second": 55.046, + "eval_loss": 2.4498891830444336, + "eval_runtime": 232.7935, + "eval_samples_per_second": 882.8, + "eval_steps_per_second": 55.178, "step": 2304000 }, { "epoch": 9.47, - "eval_loss": 2.874969482421875, - "eval_runtime": 233.1427, - "eval_samples_per_second": 881.477, - "eval_steps_per_second": 55.095, + "eval_loss": 2.4432790279388428, + "eval_runtime": 233.6332, + "eval_samples_per_second": 879.627, + "eval_steps_per_second": 54.979, "step": 2312000 }, { "epoch": 9.51, - "learning_rate": 3.335278912699075e-07, - "loss": 2.8376, + "learning_rate": 1.3666666666666667e-08, + "loss": 2.5099, "step": 2320000 }, { "epoch": 9.51, - "eval_loss": 2.8837876319885254, - "eval_runtime": 233.3471, - "eval_samples_per_second": 880.705, - "eval_steps_per_second": 55.047, + "eval_loss": 2.4534084796905518, + "eval_runtime": 233.6601, + "eval_samples_per_second": 879.525, + "eval_steps_per_second": 54.973, "step": 2320000 }, { "epoch": 9.54, - "eval_loss": 2.8825130462646484, - "eval_runtime": 233.8169, - "eval_samples_per_second": 878.935, - "eval_steps_per_second": 54.936, + "eval_loss": 2.4494857788085938, + "eval_runtime": 233.4959, + "eval_samples_per_second": 880.144, + "eval_steps_per_second": 55.012, "step": 2328000 }, { "epoch": 9.57, - "learning_rate": 2.66822313015926e-07, - "loss": 2.8329, + "learning_rate": 1.0933333333333334e-08, + "loss": 2.5065, "step": 2336000 }, { "epoch": 9.57, - "eval_loss": 2.880941390991211, - "eval_runtime": 234.5491, - "eval_samples_per_second": 876.192, - "eval_steps_per_second": 54.765, + "eval_loss": 2.4510202407836914, + "eval_runtime": 233.7041, + "eval_samples_per_second": 879.36, + "eval_steps_per_second": 54.963, "step": 2336000 }, { "epoch": 9.6, - "eval_loss": 2.8842968940734863, - "eval_runtime": 233.344, - "eval_samples_per_second": 880.717, - "eval_steps_per_second": 55.047, + "eval_loss": 2.4512877464294434, + "eval_runtime": 237.5888, + "eval_samples_per_second": 864.982, + "eval_steps_per_second": 54.064, "step": 2344000 }, { "epoch": 9.64, - "learning_rate": 2.0011673476194447e-07, - "loss": 2.8264, + "learning_rate": 8.2e-09, + "loss": 2.502, "step": 2352000 }, { "epoch": 9.64, - "eval_loss": 2.8784091472625732, - "eval_runtime": 233.7157, - "eval_samples_per_second": 879.316, - "eval_steps_per_second": 54.96, + "eval_loss": 2.451225996017456, + "eval_runtime": 233.4207, + "eval_samples_per_second": 880.427, + "eval_steps_per_second": 55.029, "step": 2352000 }, { "epoch": 9.67, - "eval_loss": 2.8687503337860107, - "eval_runtime": 233.7026, - "eval_samples_per_second": 879.365, - "eval_steps_per_second": 54.963, + "eval_loss": 2.4469268321990967, + "eval_runtime": 234.7938, + "eval_samples_per_second": 875.279, + "eval_steps_per_second": 54.708, "step": 2360000 }, { "epoch": 9.7, - "learning_rate": 1.33411156507963e-07, - "loss": 2.8317, + "learning_rate": 5.466666666666667e-09, + "loss": 2.5043, "step": 2368000 }, { "epoch": 9.7, - "eval_loss": 2.8792617321014404, - "eval_runtime": 235.256, - "eval_samples_per_second": 873.559, - "eval_steps_per_second": 54.6, + "eval_loss": 2.4544479846954346, + "eval_runtime": 234.2869, + "eval_samples_per_second": 877.173, + "eval_steps_per_second": 54.826, "step": 2368000 }, { "epoch": 9.74, - "eval_loss": 2.8814611434936523, - "eval_runtime": 235.5823, - "eval_samples_per_second": 872.349, - "eval_steps_per_second": 54.524, + "eval_loss": 2.4492740631103516, + "eval_runtime": 234.1805, + "eval_samples_per_second": 877.571, + "eval_steps_per_second": 54.851, "step": 2376000 }, { "epoch": 9.77, - "learning_rate": 6.67055782539815e-08, - "loss": 2.8328, + "learning_rate": 2.7333333333333334e-09, + "loss": 2.5068, "step": 2384000 }, { "epoch": 9.77, - "eval_loss": 2.8755900859832764, - "eval_runtime": 233.6052, - "eval_samples_per_second": 879.732, - "eval_steps_per_second": 54.986, + "eval_loss": 2.453711748123169, + "eval_runtime": 233.0608, + "eval_samples_per_second": 881.787, + "eval_steps_per_second": 55.114, "step": 2384000 }, { "epoch": 9.8, - "eval_loss": 2.869050979614258, - "eval_runtime": 233.5136, - "eval_samples_per_second": 880.077, - "eval_steps_per_second": 55.007, + "eval_loss": 2.4386837482452393, + "eval_runtime": 234.1662, + "eval_samples_per_second": 877.625, + "eval_steps_per_second": 54.854, "step": 2392000 }, { "epoch": 9.83, "learning_rate": 0.0, - "loss": 2.841, + "loss": 2.5118, "step": 2400000 }, { "epoch": 9.83, - "eval_loss": 2.867439031600952, - "eval_runtime": 233.911, - "eval_samples_per_second": 878.582, - "eval_steps_per_second": 54.914, + "eval_loss": 2.4494030475616455, + "eval_runtime": 234.0187, + "eval_samples_per_second": 878.178, + "eval_steps_per_second": 54.889, "step": 2400000 }, { "epoch": 9.83, "step": 2400000, "total_flos": 7.305293129309786e+17, - "train_loss": 2.773840421549479, - "train_runtime": 219744.0708, - "train_samples_per_second": 174.749, - "train_steps_per_second": 10.922 + "train_loss": 2.5438934391276042, + "train_runtime": 220778.1092, + "train_samples_per_second": 173.93, + "train_steps_per_second": 10.871 } ], "logging_steps": 16000,