diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,7 +1,7 @@ { - "best_metric": 2.2634286880493164, - "best_model_checkpoint": "./model_tweets_2020_Q4_25/checkpoint-2016000", - "epoch": 6.736955569778018, + "best_metric": 2.256277322769165, + "best_model_checkpoint": "./model_tweets_2020_Q4_25/checkpoint-1952000", + "epoch": 6.73682319488226, "eval_steps": 8000, "global_step": 2400000, "is_hyper_param_search": false, @@ -10,3319 +10,3319 @@ "log_history": [ { "epoch": 0.02, - "eval_loss": 2.592593193054199, - "eval_runtime": 340.1804, - "eval_samples_per_second": 881.867, - "eval_steps_per_second": 55.118, + "eval_loss": 2.580249547958374, + "eval_runtime": 320.9634, + "eval_samples_per_second": 934.686, + "eval_steps_per_second": 58.418, "step": 8000 }, { "epoch": 0.04, "learning_rate": 4.0726666666666665e-07, - "loss": 2.7864, + "loss": 2.8151, "step": 16000 }, { "epoch": 0.04, - "eval_loss": 2.5070722103118896, - "eval_runtime": 334.504, - "eval_samples_per_second": 896.832, - "eval_steps_per_second": 56.053, + "eval_loss": 2.488163471221924, + "eval_runtime": 321.5644, + "eval_samples_per_second": 932.939, + "eval_steps_per_second": 58.309, "step": 16000 }, { "epoch": 0.07, - "eval_loss": 2.468975782394409, - "eval_runtime": 337.9946, - "eval_samples_per_second": 887.57, - "eval_steps_per_second": 55.474, + "eval_loss": 2.429165840148926, + "eval_runtime": 321.2729, + "eval_samples_per_second": 933.786, + "eval_steps_per_second": 58.362, "step": 24000 }, { "epoch": 0.09, "learning_rate": 4.0453333333333336e-07, - "loss": 2.5937, + "loss": 2.5636, "step": 32000 }, { "epoch": 0.09, - "eval_loss": 2.435482978820801, - "eval_runtime": 338.3798, - "eval_samples_per_second": 886.56, - "eval_steps_per_second": 55.411, + "eval_loss": 2.3980140686035156, + "eval_runtime": 321.6728, + "eval_samples_per_second": 932.625, + "eval_steps_per_second": 58.289, "step": 32000 }, { "epoch": 0.11, - "eval_loss": 2.4124932289123535, - "eval_runtime": 330.9228, - "eval_samples_per_second": 906.538, - "eval_steps_per_second": 56.66, + "eval_loss": 2.3799262046813965, + "eval_runtime": 323.0695, + "eval_samples_per_second": 928.593, + "eval_steps_per_second": 58.037, "step": 40000 }, { "epoch": 0.13, "learning_rate": 4.018e-07, - "loss": 2.55, + "loss": 2.4947, "step": 48000 }, { "epoch": 0.13, - "eval_loss": 2.400876760482788, - "eval_runtime": 334.2849, - "eval_samples_per_second": 897.42, - "eval_steps_per_second": 56.09, + "eval_loss": 2.3665478229522705, + "eval_runtime": 322.6194, + "eval_samples_per_second": 929.888, + "eval_steps_per_second": 58.118, "step": 48000 }, { "epoch": 0.16, - "eval_loss": 2.389232873916626, - "eval_runtime": 333.216, - "eval_samples_per_second": 900.299, - "eval_steps_per_second": 56.27, + "eval_loss": 2.345531940460205, + "eval_runtime": 322.4114, + "eval_samples_per_second": 930.488, + "eval_steps_per_second": 58.156, "step": 56000 }, { "epoch": 0.18, "learning_rate": 3.9906666666666667e-07, - "loss": 2.5159, + "loss": 2.473, "step": 64000 }, { "epoch": 0.18, - "eval_loss": 2.3736019134521484, - "eval_runtime": 333.0778, - "eval_samples_per_second": 900.672, - "eval_steps_per_second": 56.293, + "eval_loss": 2.341932773590088, + "eval_runtime": 324.4552, + "eval_samples_per_second": 924.627, + "eval_steps_per_second": 57.789, "step": 64000 }, { "epoch": 0.2, - "eval_loss": 2.3712990283966064, - "eval_runtime": 332.3155, - "eval_samples_per_second": 902.738, - "eval_steps_per_second": 56.422, + "eval_loss": 2.3307127952575684, + "eval_runtime": 322.5941, + "eval_samples_per_second": 929.961, + "eval_steps_per_second": 58.123, "step": 72000 }, { "epoch": 0.22, "learning_rate": 3.963333333333333e-07, - "loss": 2.495, + "loss": 2.4512, "step": 80000 }, { "epoch": 0.22, - "eval_loss": 2.364145278930664, - "eval_runtime": 329.5546, - "eval_samples_per_second": 910.301, - "eval_steps_per_second": 56.895, + "eval_loss": 2.3288769721984863, + "eval_runtime": 322.3925, + "eval_samples_per_second": 930.543, + "eval_steps_per_second": 58.159, "step": 80000 }, { "epoch": 0.25, - "eval_loss": 2.357400417327881, - "eval_runtime": 330.227, - "eval_samples_per_second": 908.448, - "eval_steps_per_second": 56.779, + "eval_loss": 2.325032949447632, + "eval_runtime": 322.736, + "eval_samples_per_second": 929.552, + "eval_steps_per_second": 58.097, "step": 88000 }, { "epoch": 0.27, "learning_rate": 3.936e-07, - "loss": 2.4845, + "loss": 2.4421, "step": 96000 }, { "epoch": 0.27, - "eval_loss": 2.349066734313965, - "eval_runtime": 332.2917, - "eval_samples_per_second": 902.803, - "eval_steps_per_second": 56.426, + "eval_loss": 2.318911075592041, + "eval_runtime": 323.2095, + "eval_samples_per_second": 928.19, + "eval_steps_per_second": 58.012, "step": 96000 }, { "epoch": 0.29, - "eval_loss": 2.3484702110290527, - "eval_runtime": 330.9315, - "eval_samples_per_second": 906.514, - "eval_steps_per_second": 56.658, + "eval_loss": 2.3199880123138428, + "eval_runtime": 323.2363, + "eval_samples_per_second": 928.114, + "eval_steps_per_second": 58.007, "step": 104000 }, { "epoch": 0.31, "learning_rate": 3.908666666666667e-07, - "loss": 2.4765, + "loss": 2.4354, "step": 112000 }, { "epoch": 0.31, - "eval_loss": 2.3432772159576416, - "eval_runtime": 331.9574, - "eval_samples_per_second": 903.712, - "eval_steps_per_second": 56.483, + "eval_loss": 2.3154587745666504, + "eval_runtime": 323.95, + "eval_samples_per_second": 926.069, + "eval_steps_per_second": 57.879, "step": 112000 }, { "epoch": 0.34, - "eval_loss": 2.337597131729126, - "eval_runtime": 328.3652, - "eval_samples_per_second": 913.599, - "eval_steps_per_second": 57.101, + "eval_loss": 2.313781976699829, + "eval_runtime": 324.5922, + "eval_samples_per_second": 924.237, + "eval_steps_per_second": 57.765, "step": 120000 }, { "epoch": 0.36, "learning_rate": 3.8813333333333334e-07, - "loss": 2.472, + "loss": 2.4324, "step": 128000 }, { "epoch": 0.36, - "eval_loss": 2.339606285095215, - "eval_runtime": 331.483, - "eval_samples_per_second": 905.006, - "eval_steps_per_second": 56.564, + "eval_loss": 2.305436372756958, + "eval_runtime": 323.2003, + "eval_samples_per_second": 928.217, + "eval_steps_per_second": 58.014, "step": 128000 }, { "epoch": 0.38, - "eval_loss": 2.3326175212860107, - "eval_runtime": 331.9588, - "eval_samples_per_second": 903.709, - "eval_steps_per_second": 56.483, + "eval_loss": 2.302849054336548, + "eval_runtime": 323.3577, + "eval_samples_per_second": 927.765, + "eval_steps_per_second": 57.985, "step": 136000 }, { "epoch": 0.4, "learning_rate": 3.854e-07, - "loss": 2.467, + "loss": 2.4253, "step": 144000 }, { "epoch": 0.4, - "eval_loss": 2.3383891582489014, - "eval_runtime": 330.5258, - "eval_samples_per_second": 907.626, - "eval_steps_per_second": 56.728, + "eval_loss": 2.3029212951660156, + "eval_runtime": 324.8316, + "eval_samples_per_second": 923.555, + "eval_steps_per_second": 57.722, "step": 144000 }, { "epoch": 0.43, - "eval_loss": 2.3349926471710205, - "eval_runtime": 331.6599, - "eval_samples_per_second": 904.523, - "eval_steps_per_second": 56.534, + "eval_loss": 2.3006043434143066, + "eval_runtime": 323.2225, + "eval_samples_per_second": 928.153, + "eval_steps_per_second": 58.01, "step": 152000 }, { "epoch": 0.45, "learning_rate": 3.8266666666666665e-07, - "loss": 2.46, + "loss": 2.4156, "step": 160000 }, { "epoch": 0.45, - "eval_loss": 2.3262743949890137, - "eval_runtime": 332.3863, - "eval_samples_per_second": 902.546, - "eval_steps_per_second": 56.41, + "eval_loss": 2.300135612487793, + "eval_runtime": 323.6582, + "eval_samples_per_second": 926.904, + "eval_steps_per_second": 57.931, "step": 160000 }, { "epoch": 0.47, - "eval_loss": 2.323078155517578, - "eval_runtime": 328.7829, - "eval_samples_per_second": 912.438, - "eval_steps_per_second": 57.029, + "eval_loss": 2.298043727874756, + "eval_runtime": 322.8658, + "eval_samples_per_second": 929.179, + "eval_steps_per_second": 58.074, "step": 168000 }, { "epoch": 0.49, "learning_rate": 3.799333333333333e-07, - "loss": 2.4593, + "loss": 2.4165, "step": 176000 }, { "epoch": 0.49, - "eval_loss": 2.32228946685791, - "eval_runtime": 329.1085, - "eval_samples_per_second": 911.535, - "eval_steps_per_second": 56.972, + "eval_loss": 2.291269063949585, + "eval_runtime": 323.9312, + "eval_samples_per_second": 926.122, + "eval_steps_per_second": 57.883, "step": 176000 }, { "epoch": 0.52, - "eval_loss": 2.32503604888916, - "eval_runtime": 329.9498, - "eval_samples_per_second": 909.211, - "eval_steps_per_second": 56.827, + "eval_loss": 2.297363519668579, + "eval_runtime": 323.2402, + "eval_samples_per_second": 928.102, + "eval_steps_per_second": 58.006, "step": 184000 }, { "epoch": 0.54, "learning_rate": 3.772e-07, - "loss": 2.4552, + "loss": 2.4131, "step": 192000 }, { "epoch": 0.54, - "eval_loss": 2.319518566131592, - "eval_runtime": 328.4451, - "eval_samples_per_second": 913.376, - "eval_steps_per_second": 57.087, + "eval_loss": 2.2906086444854736, + "eval_runtime": 323.5876, + "eval_samples_per_second": 927.106, + "eval_steps_per_second": 57.944, "step": 192000 }, { "epoch": 0.56, - "eval_loss": 2.3236451148986816, - "eval_runtime": 329.6628, - "eval_samples_per_second": 910.003, - "eval_steps_per_second": 56.876, + "eval_loss": 2.2908411026000977, + "eval_runtime": 324.835, + "eval_samples_per_second": 923.546, + "eval_steps_per_second": 57.722, "step": 200000 }, { "epoch": 0.58, "learning_rate": 3.7446666666666667e-07, - "loss": 2.4558, + "loss": 2.407, "step": 208000 }, { "epoch": 0.58, - "eval_loss": 2.322141170501709, - "eval_runtime": 328.8408, - "eval_samples_per_second": 912.277, - "eval_steps_per_second": 57.018, + "eval_loss": 2.289541482925415, + "eval_runtime": 323.2737, + "eval_samples_per_second": 928.006, + "eval_steps_per_second": 58.0, "step": 208000 }, { "epoch": 0.61, - "eval_loss": 2.3194377422332764, - "eval_runtime": 329.1871, - "eval_samples_per_second": 911.318, - "eval_steps_per_second": 56.958, + "eval_loss": 2.2865185737609863, + "eval_runtime": 323.7161, + "eval_samples_per_second": 926.738, + "eval_steps_per_second": 57.921, "step": 216000 }, { "epoch": 0.63, "learning_rate": 3.7173333333333333e-07, - "loss": 2.4487, + "loss": 2.4153, "step": 224000 }, { "epoch": 0.63, - "eval_loss": 2.3224573135375977, - "eval_runtime": 328.2655, - "eval_samples_per_second": 913.876, - "eval_steps_per_second": 57.118, + "eval_loss": 2.2913596630096436, + "eval_runtime": 323.8117, + "eval_samples_per_second": 926.464, + "eval_steps_per_second": 57.904, "step": 224000 }, { "epoch": 0.65, - "eval_loss": 2.3220534324645996, - "eval_runtime": 329.5297, - "eval_samples_per_second": 910.37, - "eval_steps_per_second": 56.899, + "eval_loss": 2.280600070953369, + "eval_runtime": 324.7681, + "eval_samples_per_second": 923.736, + "eval_steps_per_second": 57.734, "step": 232000 }, { "epoch": 0.67, "learning_rate": 3.69e-07, - "loss": 2.4485, + "loss": 2.4011, "step": 240000 }, { "epoch": 0.67, - "eval_loss": 2.3135385513305664, - "eval_runtime": 328.6734, - "eval_samples_per_second": 912.742, - "eval_steps_per_second": 57.048, + "eval_loss": 2.2818994522094727, + "eval_runtime": 324.8269, + "eval_samples_per_second": 923.569, + "eval_steps_per_second": 57.723, "step": 240000 }, { "epoch": 0.7, - "eval_loss": 2.3109307289123535, - "eval_runtime": 330.0109, - "eval_samples_per_second": 909.043, - "eval_steps_per_second": 56.816, + "eval_loss": 2.2854413986206055, + "eval_runtime": 324.8244, + "eval_samples_per_second": 923.576, + "eval_steps_per_second": 57.724, "step": 248000 }, { "epoch": 0.72, "learning_rate": 3.6626666666666664e-07, - "loss": 2.4461, + "loss": 2.4087, "step": 256000 }, { "epoch": 0.72, - "eval_loss": 2.3133792877197266, - "eval_runtime": 330.0512, - "eval_samples_per_second": 908.932, - "eval_steps_per_second": 56.809, + "eval_loss": 2.283675193786621, + "eval_runtime": 326.3862, + "eval_samples_per_second": 919.157, + "eval_steps_per_second": 57.447, "step": 256000 }, { "epoch": 0.74, - "eval_loss": 2.317667007446289, - "eval_runtime": 328.6469, - "eval_samples_per_second": 912.816, - "eval_steps_per_second": 57.052, + "eval_loss": 2.286595106124878, + "eval_runtime": 327.6717, + "eval_samples_per_second": 915.551, + "eval_steps_per_second": 57.222, "step": 264000 }, { "epoch": 0.76, "learning_rate": 3.6353333333333335e-07, - "loss": 2.4513, + "loss": 2.4059, "step": 272000 }, { "epoch": 0.76, - "eval_loss": 2.310190200805664, - "eval_runtime": 329.3646, - "eval_samples_per_second": 910.826, - "eval_steps_per_second": 56.928, + "eval_loss": 2.285534143447876, + "eval_runtime": 326.6584, + "eval_samples_per_second": 918.391, + "eval_steps_per_second": 57.399, "step": 272000 }, { "epoch": 0.79, - "eval_loss": 2.3051719665527344, - "eval_runtime": 329.4854, - "eval_samples_per_second": 910.492, - "eval_steps_per_second": 56.907, + "eval_loss": 2.28678560256958, + "eval_runtime": 329.4988, + "eval_samples_per_second": 910.474, + "eval_steps_per_second": 56.905, "step": 280000 }, { "epoch": 0.81, "learning_rate": 3.608e-07, - "loss": 2.4488, + "loss": 2.4086, "step": 288000 }, { "epoch": 0.81, - "eval_loss": 2.3044047355651855, - "eval_runtime": 329.2769, - "eval_samples_per_second": 911.069, - "eval_steps_per_second": 56.943, + "eval_loss": 2.277035713195801, + "eval_runtime": 327.402, + "eval_samples_per_second": 916.305, + "eval_steps_per_second": 57.269, "step": 288000 }, { "epoch": 0.83, - "eval_loss": 2.3117146492004395, - "eval_runtime": 328.6955, - "eval_samples_per_second": 912.681, - "eval_steps_per_second": 57.044, + "eval_loss": 2.2788984775543213, + "eval_runtime": 328.7295, + "eval_samples_per_second": 912.604, + "eval_steps_per_second": 57.038, "step": 296000 }, { "epoch": 0.85, "learning_rate": 3.5806666666666666e-07, - "loss": 2.4447, + "loss": 2.4093, "step": 304000 }, { "epoch": 0.85, - "eval_loss": 2.3050835132598877, - "eval_runtime": 329.2136, - "eval_samples_per_second": 911.244, - "eval_steps_per_second": 56.954, + "eval_loss": 2.2792067527770996, + "eval_runtime": 328.6343, + "eval_samples_per_second": 912.869, + "eval_steps_per_second": 57.054, "step": 304000 }, { "epoch": 0.88, - "eval_loss": 2.3112313747406006, - "eval_runtime": 329.9672, - "eval_samples_per_second": 909.163, - "eval_steps_per_second": 56.824, + "eval_loss": 2.2796542644500732, + "eval_runtime": 328.9041, + "eval_samples_per_second": 912.12, + "eval_steps_per_second": 57.007, "step": 312000 }, { "epoch": 0.9, "learning_rate": 3.553333333333333e-07, - "loss": 2.4485, + "loss": 2.4036, "step": 320000 }, { "epoch": 0.9, - "eval_loss": 2.3064165115356445, - "eval_runtime": 328.707, - "eval_samples_per_second": 912.649, - "eval_steps_per_second": 57.042, + "eval_loss": 2.2794368267059326, + "eval_runtime": 327.0881, + "eval_samples_per_second": 917.184, + "eval_steps_per_second": 57.324, "step": 320000 }, { "epoch": 0.92, - "eval_loss": 2.3099164962768555, - "eval_runtime": 329.8653, - "eval_samples_per_second": 909.444, - "eval_steps_per_second": 56.841, + "eval_loss": 2.2767865657806396, + "eval_runtime": 325.4813, + "eval_samples_per_second": 921.712, + "eval_steps_per_second": 57.607, "step": 328000 }, { "epoch": 0.94, "learning_rate": 3.5259999999999997e-07, - "loss": 2.4475, + "loss": 2.4063, "step": 336000 }, { "epoch": 0.94, - "eval_loss": 2.310988664627075, - "eval_runtime": 329.0328, - "eval_samples_per_second": 911.745, - "eval_steps_per_second": 56.985, + "eval_loss": 2.28360652923584, + "eval_runtime": 326.0539, + "eval_samples_per_second": 920.093, + "eval_steps_per_second": 57.506, "step": 336000 }, { "epoch": 0.97, - "eval_loss": 2.3013858795166016, - "eval_runtime": 329.2936, - "eval_samples_per_second": 911.023, - "eval_steps_per_second": 56.94, + "eval_loss": 2.2808754444122314, + "eval_runtime": 324.6753, + "eval_samples_per_second": 924.0, + "eval_steps_per_second": 57.75, "step": 344000 }, { "epoch": 0.99, "learning_rate": 3.498666666666667e-07, - "loss": 2.4464, + "loss": 2.4047, "step": 352000 }, { "epoch": 0.99, - "eval_loss": 2.3032288551330566, - "eval_runtime": 329.4779, - "eval_samples_per_second": 910.513, - "eval_steps_per_second": 56.908, + "eval_loss": 2.280778408050537, + "eval_runtime": 325.3269, + "eval_samples_per_second": 922.149, + "eval_steps_per_second": 57.634, "step": 352000 }, { "epoch": 1.01, - "eval_loss": 2.3035871982574463, - "eval_runtime": 329.3314, - "eval_samples_per_second": 910.918, - "eval_steps_per_second": 56.934, + "eval_loss": 2.28403377532959, + "eval_runtime": 325.2468, + "eval_samples_per_second": 922.377, + "eval_steps_per_second": 57.649, "step": 360000 }, { "epoch": 1.03, "learning_rate": 3.4713333333333333e-07, - "loss": 2.4478, + "loss": 2.4084, "step": 368000 }, { "epoch": 1.03, - "eval_loss": 2.305039882659912, - "eval_runtime": 330.1998, - "eval_samples_per_second": 908.523, - "eval_steps_per_second": 56.784, + "eval_loss": 2.279930591583252, + "eval_runtime": 327.9631, + "eval_samples_per_second": 914.737, + "eval_steps_per_second": 57.171, "step": 368000 }, { "epoch": 1.06, - "eval_loss": 2.3078055381774902, - "eval_runtime": 329.8754, - "eval_samples_per_second": 909.416, - "eval_steps_per_second": 56.84, + "eval_loss": 2.272570848464966, + "eval_runtime": 327.8275, + "eval_samples_per_second": 915.115, + "eval_steps_per_second": 57.195, "step": 376000 }, { "epoch": 1.08, "learning_rate": 3.444e-07, - "loss": 2.4416, + "loss": 2.4041, "step": 384000 }, { "epoch": 1.08, - "eval_loss": 2.302764892578125, - "eval_runtime": 328.8958, - "eval_samples_per_second": 912.125, - "eval_steps_per_second": 57.009, + "eval_loss": 2.2823517322540283, + "eval_runtime": 328.4584, + "eval_samples_per_second": 913.358, + "eval_steps_per_second": 57.085, "step": 384000 }, { "epoch": 1.1, - "eval_loss": 2.3016645908355713, - "eval_runtime": 329.422, - "eval_samples_per_second": 910.668, - "eval_steps_per_second": 56.918, + "eval_loss": 2.278149127960205, + "eval_runtime": 326.9556, + "eval_samples_per_second": 917.556, + "eval_steps_per_second": 57.347, "step": 392000 }, { "epoch": 1.12, "learning_rate": 3.416666666666667e-07, - "loss": 2.4374, + "loss": 2.4034, "step": 400000 }, { "epoch": 1.12, - "eval_loss": 2.3012008666992188, - "eval_runtime": 329.1475, - "eval_samples_per_second": 911.427, - "eval_steps_per_second": 56.965, + "eval_loss": 2.275142192840576, + "eval_runtime": 326.8439, + "eval_samples_per_second": 917.869, + "eval_steps_per_second": 57.367, "step": 400000 }, { "epoch": 1.15, - "eval_loss": 2.3017375469207764, - "eval_runtime": 329.6739, - "eval_samples_per_second": 909.972, - "eval_steps_per_second": 56.874, + "eval_loss": 2.2760984897613525, + "eval_runtime": 325.9846, + "eval_samples_per_second": 920.289, + "eval_steps_per_second": 57.518, "step": 408000 }, { "epoch": 1.17, "learning_rate": 3.3893333333333335e-07, - "loss": 2.4406, + "loss": 2.3951, "step": 416000 }, { "epoch": 1.17, - "eval_loss": 2.3042545318603516, - "eval_runtime": 329.4772, - "eval_samples_per_second": 910.515, - "eval_steps_per_second": 56.908, + "eval_loss": 2.2731635570526123, + "eval_runtime": 326.1395, + "eval_samples_per_second": 919.852, + "eval_steps_per_second": 57.491, "step": 416000 }, { "epoch": 1.19, - "eval_loss": 2.3058371543884277, - "eval_runtime": 329.3774, - "eval_samples_per_second": 910.791, - "eval_steps_per_second": 56.926, + "eval_loss": 2.2709577083587646, + "eval_runtime": 326.1973, + "eval_samples_per_second": 919.689, + "eval_steps_per_second": 57.481, "step": 424000 }, { "epoch": 1.21, "learning_rate": 3.3619999999999995e-07, - "loss": 2.4434, + "loss": 2.409, "step": 432000 }, { "epoch": 1.21, - "eval_loss": 2.2937967777252197, - "eval_runtime": 328.8166, - "eval_samples_per_second": 912.345, - "eval_steps_per_second": 57.023, + "eval_loss": 2.277972936630249, + "eval_runtime": 325.3949, + "eval_samples_per_second": 921.957, + "eval_steps_per_second": 57.622, "step": 432000 }, { "epoch": 1.24, - "eval_loss": 2.2971138954162598, - "eval_runtime": 330.0079, - "eval_samples_per_second": 909.051, - "eval_steps_per_second": 56.817, + "eval_loss": 2.2714641094207764, + "eval_runtime": 325.6353, + "eval_samples_per_second": 921.276, + "eval_steps_per_second": 57.58, "step": 440000 }, { "epoch": 1.26, "learning_rate": 3.3346666666666666e-07, - "loss": 2.4421, + "loss": 2.3985, "step": 448000 }, { "epoch": 1.26, - "eval_loss": 2.3025450706481934, - "eval_runtime": 329.6817, - "eval_samples_per_second": 909.95, - "eval_steps_per_second": 56.873, + "eval_loss": 2.279003620147705, + "eval_runtime": 326.3983, + "eval_samples_per_second": 919.122, + "eval_steps_per_second": 57.445, "step": 448000 }, { "epoch": 1.28, - "eval_loss": 2.2950313091278076, - "eval_runtime": 329.2997, - "eval_samples_per_second": 911.006, - "eval_steps_per_second": 56.939, + "eval_loss": 2.276561737060547, + "eval_runtime": 326.5381, + "eval_samples_per_second": 918.729, + "eval_steps_per_second": 57.421, "step": 456000 }, { "epoch": 1.3, "learning_rate": 3.307333333333333e-07, - "loss": 2.443, + "loss": 2.4016, "step": 464000 }, { "epoch": 1.3, - "eval_loss": 2.2986950874328613, - "eval_runtime": 329.2432, - "eval_samples_per_second": 911.162, - "eval_steps_per_second": 56.949, + "eval_loss": 2.2744641304016113, + "eval_runtime": 326.438, + "eval_samples_per_second": 919.011, + "eval_steps_per_second": 57.438, "step": 464000 }, { "epoch": 1.32, - "eval_loss": 2.294912099838257, - "eval_runtime": 329.1309, - "eval_samples_per_second": 911.473, - "eval_steps_per_second": 56.968, + "eval_loss": 2.2719147205352783, + "eval_runtime": 326.1182, + "eval_samples_per_second": 919.912, + "eval_steps_per_second": 57.494, "step": 472000 }, { "epoch": 1.35, "learning_rate": 3.28e-07, - "loss": 2.4357, + "loss": 2.3978, "step": 480000 }, { "epoch": 1.35, - "eval_loss": 2.3026058673858643, - "eval_runtime": 329.2084, - "eval_samples_per_second": 911.259, - "eval_steps_per_second": 56.955, + "eval_loss": 2.2755250930786133, + "eval_runtime": 326.0946, + "eval_samples_per_second": 919.978, + "eval_steps_per_second": 57.499, "step": 480000 }, { "epoch": 1.37, - "eval_loss": 2.2961277961730957, - "eval_runtime": 329.3643, - "eval_samples_per_second": 910.827, - "eval_steps_per_second": 56.928, + "eval_loss": 2.269918203353882, + "eval_runtime": 326.8772, + "eval_samples_per_second": 917.776, + "eval_steps_per_second": 57.361, "step": 488000 }, { "epoch": 1.39, "learning_rate": 3.252666666666667e-07, - "loss": 2.4366, + "loss": 2.406, "step": 496000 }, { "epoch": 1.39, - "eval_loss": 2.3002982139587402, - "eval_runtime": 331.8417, - "eval_samples_per_second": 904.027, - "eval_steps_per_second": 56.503, + "eval_loss": 2.282317876815796, + "eval_runtime": 325.8019, + "eval_samples_per_second": 920.805, + "eval_steps_per_second": 57.55, "step": 496000 }, { "epoch": 1.41, - "eval_loss": 2.2953805923461914, - "eval_runtime": 330.1726, - "eval_samples_per_second": 908.598, - "eval_steps_per_second": 56.788, + "eval_loss": 2.2735817432403564, + "eval_runtime": 326.0969, + "eval_samples_per_second": 919.972, + "eval_steps_per_second": 57.498, "step": 504000 }, { "epoch": 1.44, "learning_rate": 3.2253333333333334e-07, - "loss": 2.4528, + "loss": 2.3958, "step": 512000 }, { "epoch": 1.44, - "eval_loss": 2.2882533073425293, - "eval_runtime": 332.2968, - "eval_samples_per_second": 902.789, - "eval_steps_per_second": 56.425, + "eval_loss": 2.2728230953216553, + "eval_runtime": 326.2067, + "eval_samples_per_second": 919.662, + "eval_steps_per_second": 57.479, "step": 512000 }, { "epoch": 1.46, - "eval_loss": 2.2999649047851562, - "eval_runtime": 329.9715, - "eval_samples_per_second": 909.151, - "eval_steps_per_second": 56.823, + "eval_loss": 2.2762703895568848, + "eval_runtime": 326.3243, + "eval_samples_per_second": 919.331, + "eval_steps_per_second": 57.458, "step": 520000 }, { "epoch": 1.48, "learning_rate": 3.198e-07, - "loss": 2.4389, + "loss": 2.406, "step": 528000 }, { "epoch": 1.48, - "eval_loss": 2.2938716411590576, - "eval_runtime": 330.7123, - "eval_samples_per_second": 907.115, - "eval_steps_per_second": 56.696, + "eval_loss": 2.2780961990356445, + "eval_runtime": 325.8653, + "eval_samples_per_second": 920.626, + "eval_steps_per_second": 57.539, "step": 528000 }, { "epoch": 1.5, - "eval_loss": 2.29899525642395, - "eval_runtime": 329.622, - "eval_samples_per_second": 910.115, - "eval_steps_per_second": 56.883, + "eval_loss": 2.2722842693328857, + "eval_runtime": 326.0044, + "eval_samples_per_second": 920.233, + "eval_steps_per_second": 57.515, "step": 536000 }, { "epoch": 1.53, "learning_rate": 3.1706666666666665e-07, - "loss": 2.441, + "loss": 2.4, "step": 544000 }, { "epoch": 1.53, - "eval_loss": 2.2915515899658203, - "eval_runtime": 331.8875, - "eval_samples_per_second": 903.903, - "eval_steps_per_second": 56.495, + "eval_loss": 2.273293972015381, + "eval_runtime": 326.966, + "eval_samples_per_second": 917.527, + "eval_steps_per_second": 57.345, "step": 544000 }, { "epoch": 1.55, - "eval_loss": 2.2906155586242676, - "eval_runtime": 331.8429, - "eval_samples_per_second": 904.024, - "eval_steps_per_second": 56.503, + "eval_loss": 2.271476984024048, + "eval_runtime": 326.8892, + "eval_samples_per_second": 917.742, + "eval_steps_per_second": 57.359, "step": 552000 }, { "epoch": 1.57, "learning_rate": 3.1433333333333336e-07, - "loss": 2.4372, + "loss": 2.3998, "step": 560000 }, { "epoch": 1.57, - "eval_loss": 2.2884891033172607, - "eval_runtime": 329.9724, - "eval_samples_per_second": 909.149, - "eval_steps_per_second": 56.823, + "eval_loss": 2.271629810333252, + "eval_runtime": 326.5264, + "eval_samples_per_second": 918.762, + "eval_steps_per_second": 57.423, "step": 560000 }, { "epoch": 1.59, - "eval_loss": 2.300299882888794, - "eval_runtime": 329.9035, - "eval_samples_per_second": 909.339, - "eval_steps_per_second": 56.835, + "eval_loss": 2.27506422996521, + "eval_runtime": 326.712, + "eval_samples_per_second": 918.24, + "eval_steps_per_second": 57.39, "step": 568000 }, { "epoch": 1.62, "learning_rate": 3.116e-07, - "loss": 2.4379, + "loss": 2.4017, "step": 576000 }, { "epoch": 1.62, - "eval_loss": 2.298779249191284, - "eval_runtime": 329.3281, - "eval_samples_per_second": 910.927, - "eval_steps_per_second": 56.934, + "eval_loss": 2.274268865585327, + "eval_runtime": 326.6112, + "eval_samples_per_second": 918.523, + "eval_steps_per_second": 57.408, "step": 576000 }, { "epoch": 1.64, - "eval_loss": 2.2923216819763184, - "eval_runtime": 330.0728, - "eval_samples_per_second": 908.872, - "eval_steps_per_second": 56.806, + "eval_loss": 2.2739031314849854, + "eval_runtime": 326.4511, + "eval_samples_per_second": 918.974, + "eval_steps_per_second": 57.436, "step": 584000 }, { "epoch": 1.66, "learning_rate": 3.0886666666666667e-07, - "loss": 2.4347, + "loss": 2.4019, "step": 592000 }, { "epoch": 1.66, - "eval_loss": 2.2936880588531494, - "eval_runtime": 331.2101, - "eval_samples_per_second": 905.751, - "eval_steps_per_second": 56.611, + "eval_loss": 2.275505542755127, + "eval_runtime": 329.4605, + "eval_samples_per_second": 910.58, + "eval_steps_per_second": 56.911, "step": 592000 }, { "epoch": 1.68, - "eval_loss": 2.2957868576049805, - "eval_runtime": 330.8976, - "eval_samples_per_second": 906.607, - "eval_steps_per_second": 56.664, + "eval_loss": 2.269094228744507, + "eval_runtime": 327.3789, + "eval_samples_per_second": 916.369, + "eval_steps_per_second": 57.273, "step": 600000 }, { "epoch": 1.71, "learning_rate": 3.061333333333333e-07, - "loss": 2.4311, + "loss": 2.398, "step": 608000 }, { "epoch": 1.71, - "eval_loss": 2.299522638320923, - "eval_runtime": 330.1467, - "eval_samples_per_second": 908.669, - "eval_steps_per_second": 56.793, + "eval_loss": 2.2705538272857666, + "eval_runtime": 327.1271, + "eval_samples_per_second": 917.075, + "eval_steps_per_second": 57.317, "step": 608000 }, { "epoch": 1.73, - "eval_loss": 2.2941486835479736, - "eval_runtime": 329.8116, - "eval_samples_per_second": 909.592, - "eval_steps_per_second": 56.851, + "eval_loss": 2.270341634750366, + "eval_runtime": 326.9286, + "eval_samples_per_second": 917.632, + "eval_steps_per_second": 57.352, "step": 616000 }, { "epoch": 1.75, "learning_rate": 3.034e-07, - "loss": 2.4437, + "loss": 2.4027, "step": 624000 }, { "epoch": 1.75, - "eval_loss": 2.2949397563934326, - "eval_runtime": 330.2069, - "eval_samples_per_second": 908.503, - "eval_steps_per_second": 56.783, + "eval_loss": 2.2657225131988525, + "eval_runtime": 326.8016, + "eval_samples_per_second": 917.988, + "eval_steps_per_second": 57.374, "step": 624000 }, { "epoch": 1.77, - "eval_loss": 2.2877914905548096, - "eval_runtime": 331.6123, - "eval_samples_per_second": 904.653, - "eval_steps_per_second": 56.542, + "eval_loss": 2.267418146133423, + "eval_runtime": 326.6227, + "eval_samples_per_second": 918.491, + "eval_steps_per_second": 57.406, "step": 632000 }, { "epoch": 1.8, "learning_rate": 3.0066666666666663e-07, - "loss": 2.4306, + "loss": 2.4, "step": 640000 }, { "epoch": 1.8, - "eval_loss": 2.28951096534729, - "eval_runtime": 331.7636, - "eval_samples_per_second": 904.24, - "eval_steps_per_second": 56.516, + "eval_loss": 2.2748591899871826, + "eval_runtime": 326.8527, + "eval_samples_per_second": 917.845, + "eval_steps_per_second": 57.365, "step": 640000 }, { "epoch": 1.82, - "eval_loss": 2.2930004596710205, - "eval_runtime": 332.1218, - "eval_samples_per_second": 903.265, - "eval_steps_per_second": 56.455, + "eval_loss": 2.2713701725006104, + "eval_runtime": 326.3767, + "eval_samples_per_second": 919.183, + "eval_steps_per_second": 57.449, "step": 648000 }, { "epoch": 1.84, "learning_rate": 2.9793333333333334e-07, - "loss": 2.4341, + "loss": 2.4046, "step": 656000 }, { "epoch": 1.84, - "eval_loss": 2.2894575595855713, - "eval_runtime": 330.5104, - "eval_samples_per_second": 907.669, - "eval_steps_per_second": 56.73, + "eval_loss": 2.2694690227508545, + "eval_runtime": 326.9136, + "eval_samples_per_second": 917.674, + "eval_steps_per_second": 57.355, "step": 656000 }, { "epoch": 1.86, - "eval_loss": 2.290764808654785, - "eval_runtime": 333.5634, - "eval_samples_per_second": 899.361, - "eval_steps_per_second": 56.211, + "eval_loss": 2.2724227905273438, + "eval_runtime": 326.9654, + "eval_samples_per_second": 917.528, + "eval_steps_per_second": 57.346, "step": 664000 }, { "epoch": 1.89, "learning_rate": 2.952e-07, - "loss": 2.4333, + "loss": 2.4033, "step": 672000 }, { "epoch": 1.89, - "eval_loss": 2.284210681915283, - "eval_runtime": 331.1722, - "eval_samples_per_second": 905.855, - "eval_steps_per_second": 56.617, + "eval_loss": 2.2697391510009766, + "eval_runtime": 326.8958, + "eval_samples_per_second": 917.724, + "eval_steps_per_second": 57.358, "step": 672000 }, { "epoch": 1.91, - "eval_loss": 2.29123592376709, - "eval_runtime": 331.1751, - "eval_samples_per_second": 905.847, - "eval_steps_per_second": 56.617, + "eval_loss": 2.2697041034698486, + "eval_runtime": 326.8461, + "eval_samples_per_second": 917.863, + "eval_steps_per_second": 57.366, "step": 680000 }, { "epoch": 1.93, "learning_rate": 2.9246666666666665e-07, - "loss": 2.4403, + "loss": 2.3981, "step": 688000 }, { "epoch": 1.93, - "eval_loss": 2.290036916732788, - "eval_runtime": 331.0957, - "eval_samples_per_second": 906.064, - "eval_steps_per_second": 56.63, + "eval_loss": 2.267427444458008, + "eval_runtime": 327.9149, + "eval_samples_per_second": 914.872, + "eval_steps_per_second": 57.179, "step": 688000 }, { "epoch": 1.95, - "eval_loss": 2.2862110137939453, - "eval_runtime": 332.3754, - "eval_samples_per_second": 902.576, - "eval_steps_per_second": 56.412, + "eval_loss": 2.266889810562134, + "eval_runtime": 327.4325, + "eval_samples_per_second": 916.219, + "eval_steps_per_second": 57.264, "step": 696000 }, { "epoch": 1.98, "learning_rate": 2.897333333333333e-07, - "loss": 2.4396, + "loss": 2.4029, "step": 704000 }, { "epoch": 1.98, - "eval_loss": 2.287149667739868, - "eval_runtime": 332.2121, - "eval_samples_per_second": 903.019, - "eval_steps_per_second": 56.44, + "eval_loss": 2.275509834289551, + "eval_runtime": 327.0353, + "eval_samples_per_second": 917.332, + "eval_steps_per_second": 57.333, "step": 704000 }, { "epoch": 2.0, - "eval_loss": 2.294781446456909, - "eval_runtime": 331.5538, - "eval_samples_per_second": 904.813, - "eval_steps_per_second": 56.552, + "eval_loss": 2.2664170265197754, + "eval_runtime": 329.3443, + "eval_samples_per_second": 910.901, + "eval_steps_per_second": 56.931, "step": 712000 }, { "epoch": 2.02, "learning_rate": 2.8699999999999996e-07, - "loss": 2.441, + "loss": 2.4046, "step": 720000 }, { "epoch": 2.02, - "eval_loss": 2.294177293777466, - "eval_runtime": 332.0516, - "eval_samples_per_second": 903.456, - "eval_steps_per_second": 56.467, + "eval_loss": 2.2758920192718506, + "eval_runtime": 328.0111, + "eval_samples_per_second": 914.603, + "eval_steps_per_second": 57.163, "step": 720000 }, { "epoch": 2.04, - "eval_loss": 2.2828338146209717, - "eval_runtime": 331.6742, - "eval_samples_per_second": 904.484, - "eval_steps_per_second": 56.531, + "eval_loss": 2.2689473628997803, + "eval_runtime": 327.8597, + "eval_samples_per_second": 915.026, + "eval_steps_per_second": 57.189, "step": 728000 }, { "epoch": 2.07, "learning_rate": 2.8426666666666667e-07, - "loss": 2.434, + "loss": 2.4056, "step": 736000 }, { "epoch": 2.07, - "eval_loss": 2.2808279991149902, - "eval_runtime": 332.944, - "eval_samples_per_second": 901.034, - "eval_steps_per_second": 56.316, + "eval_loss": 2.2710442543029785, + "eval_runtime": 327.6707, + "eval_samples_per_second": 915.553, + "eval_steps_per_second": 57.222, "step": 736000 }, { "epoch": 2.09, - "eval_loss": 2.2883412837982178, - "eval_runtime": 334.8461, - "eval_samples_per_second": 895.916, - "eval_steps_per_second": 55.996, + "eval_loss": 2.2743895053863525, + "eval_runtime": 326.938, + "eval_samples_per_second": 917.605, + "eval_steps_per_second": 57.35, "step": 744000 }, { "epoch": 2.11, "learning_rate": 2.815333333333333e-07, - "loss": 2.4387, + "loss": 2.4036, "step": 752000 }, { "epoch": 2.11, - "eval_loss": 2.292271375656128, - "eval_runtime": 332.2552, - "eval_samples_per_second": 902.902, - "eval_steps_per_second": 56.433, + "eval_loss": 2.265347719192505, + "eval_runtime": 327.8639, + "eval_samples_per_second": 915.014, + "eval_steps_per_second": 57.188, "step": 752000 }, { "epoch": 2.13, - "eval_loss": 2.284794569015503, - "eval_runtime": 335.1059, - "eval_samples_per_second": 895.222, - "eval_steps_per_second": 55.952, + "eval_loss": 2.264220952987671, + "eval_runtime": 328.2384, + "eval_samples_per_second": 913.97, + "eval_steps_per_second": 57.123, "step": 760000 }, { "epoch": 2.16, "learning_rate": 2.7880000000000003e-07, - "loss": 2.4342, + "loss": 2.3961, "step": 768000 }, { "epoch": 2.16, - "eval_loss": 2.2847986221313477, - "eval_runtime": 331.6425, - "eval_samples_per_second": 904.57, - "eval_steps_per_second": 56.537, + "eval_loss": 2.2702980041503906, + "eval_runtime": 328.1483, + "eval_samples_per_second": 914.221, + "eval_steps_per_second": 57.139, "step": 768000 }, { "epoch": 2.18, - "eval_loss": 2.2864720821380615, - "eval_runtime": 332.0862, - "eval_samples_per_second": 903.362, - "eval_steps_per_second": 56.461, + "eval_loss": 2.2682902812957764, + "eval_runtime": 327.4533, + "eval_samples_per_second": 916.161, + "eval_steps_per_second": 57.26, "step": 776000 }, { "epoch": 2.2, "learning_rate": 2.7606666666666664e-07, - "loss": 2.4389, + "loss": 2.3939, "step": 784000 }, { "epoch": 2.2, - "eval_loss": 2.288520574569702, - "eval_runtime": 332.0566, - "eval_samples_per_second": 903.442, - "eval_steps_per_second": 56.466, + "eval_loss": 2.2746386528015137, + "eval_runtime": 327.8678, + "eval_samples_per_second": 915.003, + "eval_steps_per_second": 57.188, "step": 784000 }, { "epoch": 2.22, - "eval_loss": 2.2794013023376465, - "eval_runtime": 331.7528, - "eval_samples_per_second": 904.27, - "eval_steps_per_second": 56.518, + "eval_loss": 2.2666993141174316, + "eval_runtime": 329.1807, + "eval_samples_per_second": 911.353, + "eval_steps_per_second": 56.96, "step": 792000 }, { "epoch": 2.25, "learning_rate": 2.733333333333333e-07, - "loss": 2.4318, + "loss": 2.3998, "step": 800000 }, { "epoch": 2.25, - "eval_loss": 2.2861220836639404, - "eval_runtime": 332.6258, - "eval_samples_per_second": 901.896, - "eval_steps_per_second": 56.37, + "eval_loss": 2.268972396850586, + "eval_runtime": 328.4073, + "eval_samples_per_second": 913.5, + "eval_steps_per_second": 57.094, "step": 800000 }, { "epoch": 2.27, - "eval_loss": 2.2875726222991943, - "eval_runtime": 332.0265, - "eval_samples_per_second": 903.524, - "eval_steps_per_second": 56.471, + "eval_loss": 2.2696826457977295, + "eval_runtime": 329.554, + "eval_samples_per_second": 910.321, + "eval_steps_per_second": 56.895, "step": 808000 }, { "epoch": 2.29, "learning_rate": 2.706e-07, - "loss": 2.4343, + "loss": 2.3921, "step": 816000 }, { "epoch": 2.29, - "eval_loss": 2.2820258140563965, - "eval_runtime": 332.3446, - "eval_samples_per_second": 902.659, - "eval_steps_per_second": 56.417, + "eval_loss": 2.268064498901367, + "eval_runtime": 328.2902, + "eval_samples_per_second": 913.826, + "eval_steps_per_second": 57.114, "step": 816000 }, { "epoch": 2.31, - "eval_loss": 2.283487319946289, - "eval_runtime": 332.0052, - "eval_samples_per_second": 903.582, - "eval_steps_per_second": 56.475, + "eval_loss": 2.27397084236145, + "eval_runtime": 328.4539, + "eval_samples_per_second": 913.37, + "eval_steps_per_second": 57.086, "step": 824000 }, { "epoch": 2.34, "learning_rate": 2.6786666666666666e-07, - "loss": 2.4335, + "loss": 2.4011, "step": 832000 }, { "epoch": 2.34, - "eval_loss": 2.278824806213379, - "eval_runtime": 334.3682, - "eval_samples_per_second": 897.196, - "eval_steps_per_second": 56.076, + "eval_loss": 2.270357608795166, + "eval_runtime": 328.9931, + "eval_samples_per_second": 911.873, + "eval_steps_per_second": 56.992, "step": 832000 }, { "epoch": 2.36, - "eval_loss": 2.281332015991211, - "eval_runtime": 334.2594, - "eval_samples_per_second": 897.489, - "eval_steps_per_second": 56.094, + "eval_loss": 2.2666330337524414, + "eval_runtime": 328.6018, + "eval_samples_per_second": 912.959, + "eval_steps_per_second": 57.06, "step": 840000 }, { "epoch": 2.38, "learning_rate": 2.651333333333333e-07, - "loss": 2.4428, + "loss": 2.3948, "step": 848000 }, { "epoch": 2.38, - "eval_loss": 2.2788710594177246, - "eval_runtime": 335.0984, - "eval_samples_per_second": 895.242, - "eval_steps_per_second": 55.954, + "eval_loss": 2.2689247131347656, + "eval_runtime": 328.0791, + "eval_samples_per_second": 914.414, + "eval_steps_per_second": 57.151, "step": 848000 }, { "epoch": 2.4, - "eval_loss": 2.285792589187622, - "eval_runtime": 333.8076, - "eval_samples_per_second": 898.703, - "eval_steps_per_second": 56.17, + "eval_loss": 2.2741663455963135, + "eval_runtime": 329.8118, + "eval_samples_per_second": 909.61, + "eval_steps_per_second": 56.851, "step": 856000 }, { "epoch": 2.43, "learning_rate": 2.624e-07, - "loss": 2.4272, + "loss": 2.3957, "step": 864000 }, { "epoch": 2.43, - "eval_loss": 2.288302183151245, - "eval_runtime": 334.3033, - "eval_samples_per_second": 897.371, - "eval_steps_per_second": 56.087, + "eval_loss": 2.2755067348480225, + "eval_runtime": 329.6482, + "eval_samples_per_second": 910.061, + "eval_steps_per_second": 56.879, "step": 864000 }, { "epoch": 2.45, - "eval_loss": 2.280890941619873, - "eval_runtime": 335.6751, - "eval_samples_per_second": 893.704, - "eval_steps_per_second": 55.858, + "eval_loss": 2.268922805786133, + "eval_runtime": 328.948, + "eval_samples_per_second": 911.998, + "eval_steps_per_second": 57.0, "step": 872000 }, { "epoch": 2.47, "learning_rate": 2.596666666666667e-07, - "loss": 2.4331, + "loss": 2.3971, "step": 880000 }, { "epoch": 2.47, - "eval_loss": 2.288017988204956, - "eval_runtime": 340.6851, - "eval_samples_per_second": 880.561, - "eval_steps_per_second": 55.036, + "eval_loss": 2.271690607070923, + "eval_runtime": 328.8273, + "eval_samples_per_second": 912.333, + "eval_steps_per_second": 57.021, "step": 880000 }, { "epoch": 2.49, - "eval_loss": 2.283820867538452, - "eval_runtime": 339.1018, - "eval_samples_per_second": 884.672, - "eval_steps_per_second": 55.293, + "eval_loss": 2.2689971923828125, + "eval_runtime": 329.7312, + "eval_samples_per_second": 909.832, + "eval_steps_per_second": 56.864, "step": 888000 }, { "epoch": 2.52, "learning_rate": 2.5693333333333333e-07, - "loss": 2.4326, + "loss": 2.3982, "step": 896000 }, { "epoch": 2.52, - "eval_loss": 2.2804486751556396, - "eval_runtime": 335.9214, - "eval_samples_per_second": 893.048, - "eval_steps_per_second": 55.817, + "eval_loss": 2.264453649520874, + "eval_runtime": 329.0657, + "eval_samples_per_second": 911.672, + "eval_steps_per_second": 56.98, "step": 896000 }, { "epoch": 2.54, - "eval_loss": 2.2831339836120605, - "eval_runtime": 337.559, - "eval_samples_per_second": 888.716, - "eval_steps_per_second": 55.546, + "eval_loss": 2.2726194858551025, + "eval_runtime": 328.4591, + "eval_samples_per_second": 913.356, + "eval_steps_per_second": 57.085, "step": 904000 }, { "epoch": 2.56, "learning_rate": 2.542e-07, - "loss": 2.436, + "loss": 2.4005, "step": 912000 }, { "epoch": 2.56, - "eval_loss": 2.286670446395874, - "eval_runtime": 334.8087, - "eval_samples_per_second": 896.016, - "eval_steps_per_second": 56.002, + "eval_loss": 2.262789011001587, + "eval_runtime": 329.0087, + "eval_samples_per_second": 911.83, + "eval_steps_per_second": 56.989, "step": 912000 }, { "epoch": 2.58, - "eval_loss": 2.28481125831604, - "eval_runtime": 336.2362, - "eval_samples_per_second": 892.212, - "eval_steps_per_second": 55.764, + "eval_loss": 2.2725658416748047, + "eval_runtime": 331.131, + "eval_samples_per_second": 905.986, + "eval_steps_per_second": 56.624, "step": 920000 }, { "epoch": 2.6, "learning_rate": 2.5146666666666664e-07, - "loss": 2.435, + "loss": 2.4037, "step": 928000 }, { "epoch": 2.6, - "eval_loss": 2.287050485610962, - "eval_runtime": 335.1109, - "eval_samples_per_second": 895.208, - "eval_steps_per_second": 55.952, + "eval_loss": 2.2759974002838135, + "eval_runtime": 329.3386, + "eval_samples_per_second": 910.917, + "eval_steps_per_second": 56.932, "step": 928000 }, { "epoch": 2.63, - "eval_loss": 2.2828164100646973, - "eval_runtime": 333.979, - "eval_samples_per_second": 898.242, - "eval_steps_per_second": 56.141, + "eval_loss": 2.2662434577941895, + "eval_runtime": 331.0495, + "eval_samples_per_second": 906.209, + "eval_steps_per_second": 56.638, "step": 936000 }, { "epoch": 2.65, "learning_rate": 2.4873333333333335e-07, - "loss": 2.44, + "loss": 2.4031, "step": 944000 }, { "epoch": 2.65, - "eval_loss": 2.2807881832122803, - "eval_runtime": 334.0843, - "eval_samples_per_second": 897.959, - "eval_steps_per_second": 56.124, + "eval_loss": 2.272948741912842, + "eval_runtime": 329.451, + "eval_samples_per_second": 910.606, + "eval_steps_per_second": 56.913, "step": 944000 }, { "epoch": 2.67, - "eval_loss": 2.2852513790130615, - "eval_runtime": 334.6701, - "eval_samples_per_second": 896.387, - "eval_steps_per_second": 56.025, + "eval_loss": 2.270596742630005, + "eval_runtime": 328.9394, + "eval_samples_per_second": 912.022, + "eval_steps_per_second": 57.001, "step": 952000 }, { "epoch": 2.69, "learning_rate": 2.46e-07, - "loss": 2.4285, + "loss": 2.4025, "step": 960000 }, { "epoch": 2.69, - "eval_loss": 2.279860258102417, - "eval_runtime": 338.1471, - "eval_samples_per_second": 887.17, - "eval_steps_per_second": 55.449, + "eval_loss": 2.2684247493743896, + "eval_runtime": 328.8064, + "eval_samples_per_second": 912.391, + "eval_steps_per_second": 57.024, "step": 960000 }, { "epoch": 2.72, - "eval_loss": 2.282912015914917, - "eval_runtime": 335.4921, - "eval_samples_per_second": 894.191, - "eval_steps_per_second": 55.888, + "eval_loss": 2.2634849548339844, + "eval_runtime": 329.3927, + "eval_samples_per_second": 910.767, + "eval_steps_per_second": 56.923, "step": 968000 }, { "epoch": 2.74, "learning_rate": 2.4326666666666666e-07, - "loss": 2.423, + "loss": 2.409, "step": 976000 }, { "epoch": 2.74, - "eval_loss": 2.2761270999908447, - "eval_runtime": 335.1528, - "eval_samples_per_second": 895.096, - "eval_steps_per_second": 55.945, + "eval_loss": 2.2605979442596436, + "eval_runtime": 330.2691, + "eval_samples_per_second": 908.35, + "eval_steps_per_second": 56.772, "step": 976000 }, { "epoch": 2.76, - "eval_loss": 2.276808738708496, - "eval_runtime": 336.6204, - "eval_samples_per_second": 891.194, - "eval_steps_per_second": 55.701, + "eval_loss": 2.2664294242858887, + "eval_runtime": 334.9875, + "eval_samples_per_second": 895.556, + "eval_steps_per_second": 55.972, "step": 984000 }, { "epoch": 2.78, "learning_rate": 2.405333333333333e-07, - "loss": 2.4353, + "loss": 2.4085, "step": 992000 }, { "epoch": 2.78, - "eval_loss": 2.2844138145446777, - "eval_runtime": 335.677, - "eval_samples_per_second": 893.698, - "eval_steps_per_second": 55.857, + "eval_loss": 2.2646701335906982, + "eval_runtime": 332.6305, + "eval_samples_per_second": 901.902, + "eval_steps_per_second": 56.369, "step": 992000 }, { "epoch": 2.81, - "eval_loss": 2.2828099727630615, - "eval_runtime": 335.9499, - "eval_samples_per_second": 892.972, - "eval_steps_per_second": 55.812, + "eval_loss": 2.265587329864502, + "eval_runtime": 330.3094, + "eval_samples_per_second": 908.239, + "eval_steps_per_second": 56.765, "step": 1000000 }, { "epoch": 2.83, "learning_rate": 2.3779999999999997e-07, - "loss": 2.4301, + "loss": 2.3971, "step": 1008000 }, { "epoch": 2.83, - "eval_loss": 2.2806167602539062, - "eval_runtime": 334.273, - "eval_samples_per_second": 897.452, - "eval_steps_per_second": 56.092, + "eval_loss": 2.265507221221924, + "eval_runtime": 332.5509, + "eval_samples_per_second": 902.118, + "eval_steps_per_second": 56.382, "step": 1008000 }, { "epoch": 2.85, - "eval_loss": 2.281301975250244, - "eval_runtime": 335.4442, - "eval_samples_per_second": 894.319, - "eval_steps_per_second": 55.896, + "eval_loss": 2.2681467533111572, + "eval_runtime": 329.8973, + "eval_samples_per_second": 909.374, + "eval_steps_per_second": 56.836, "step": 1016000 }, { "epoch": 2.87, "learning_rate": 2.3506666666666668e-07, - "loss": 2.4284, + "loss": 2.3946, "step": 1024000 }, { "epoch": 2.87, - "eval_loss": 2.2789454460144043, - "eval_runtime": 334.3, - "eval_samples_per_second": 897.38, - "eval_steps_per_second": 56.087, + "eval_loss": 2.267101526260376, + "eval_runtime": 329.91, + "eval_samples_per_second": 909.339, + "eval_steps_per_second": 56.834, "step": 1024000 }, { "epoch": 2.9, - "eval_loss": 2.2769646644592285, - "eval_runtime": 333.9542, - "eval_samples_per_second": 898.309, - "eval_steps_per_second": 56.145, + "eval_loss": 2.2659785747528076, + "eval_runtime": 332.3096, + "eval_samples_per_second": 902.772, + "eval_steps_per_second": 56.423, "step": 1032000 }, { "epoch": 2.92, "learning_rate": 2.3233333333333334e-07, - "loss": 2.4252, + "loss": 2.4063, "step": 1040000 }, { "epoch": 2.92, - "eval_loss": 2.2762739658355713, - "eval_runtime": 334.5705, - "eval_samples_per_second": 896.654, - "eval_steps_per_second": 56.042, + "eval_loss": 2.2696707248687744, + "eval_runtime": 329.9244, + "eval_samples_per_second": 909.299, + "eval_steps_per_second": 56.831, "step": 1040000 }, { "epoch": 2.94, - "eval_loss": 2.276264190673828, - "eval_runtime": 334.7693, - "eval_samples_per_second": 896.122, - "eval_steps_per_second": 56.009, + "eval_loss": 2.2705624103546143, + "eval_runtime": 330.8986, + "eval_samples_per_second": 906.622, + "eval_steps_per_second": 56.664, "step": 1048000 }, { "epoch": 2.96, "learning_rate": 2.2960000000000002e-07, - "loss": 2.4289, + "loss": 2.399, "step": 1056000 }, { "epoch": 2.96, - "eval_loss": 2.277852773666382, - "eval_runtime": 334.9735, - "eval_samples_per_second": 895.575, - "eval_steps_per_second": 55.975, + "eval_loss": 2.2625114917755127, + "eval_runtime": 330.8268, + "eval_samples_per_second": 906.819, + "eval_steps_per_second": 56.676, "step": 1056000 }, { "epoch": 2.99, - "eval_loss": 2.2811880111694336, - "eval_runtime": 334.7215, - "eval_samples_per_second": 896.25, - "eval_steps_per_second": 56.017, + "eval_loss": 2.26986026763916, + "eval_runtime": 330.4632, + "eval_samples_per_second": 907.817, + "eval_steps_per_second": 56.739, "step": 1064000 }, { "epoch": 3.01, "learning_rate": 2.2686666666666667e-07, - "loss": 2.4349, + "loss": 2.4024, "step": 1072000 }, { "epoch": 3.01, - "eval_loss": 2.2881336212158203, - "eval_runtime": 335.0398, - "eval_samples_per_second": 895.398, - "eval_steps_per_second": 55.963, + "eval_loss": 2.2622313499450684, + "eval_runtime": 331.2446, + "eval_samples_per_second": 905.675, + "eval_steps_per_second": 56.605, "step": 1072000 }, { "epoch": 3.03, - "eval_loss": 2.2804529666900635, - "eval_runtime": 336.2168, - "eval_samples_per_second": 892.264, - "eval_steps_per_second": 55.768, + "eval_loss": 2.269458293914795, + "eval_runtime": 330.8485, + "eval_samples_per_second": 906.759, + "eval_steps_per_second": 56.672, "step": 1080000 }, { "epoch": 3.05, "learning_rate": 2.2413333333333333e-07, - "loss": 2.4365, + "loss": 2.4035, "step": 1088000 }, { "epoch": 3.05, - "eval_loss": 2.2758219242095947, - "eval_runtime": 335.853, - "eval_samples_per_second": 893.23, - "eval_steps_per_second": 55.828, + "eval_loss": 2.2699954509735107, + "eval_runtime": 332.9859, + "eval_samples_per_second": 900.939, + "eval_steps_per_second": 56.309, "step": 1088000 }, { "epoch": 3.08, - "eval_loss": 2.2732620239257812, - "eval_runtime": 334.8017, - "eval_samples_per_second": 896.035, - "eval_steps_per_second": 56.003, + "eval_loss": 2.262361526489258, + "eval_runtime": 333.2535, + "eval_samples_per_second": 900.216, + "eval_steps_per_second": 56.263, "step": 1096000 }, { "epoch": 3.1, "learning_rate": 2.214e-07, - "loss": 2.4274, + "loss": 2.4061, "step": 1104000 }, { "epoch": 3.1, - "eval_loss": 2.2842442989349365, - "eval_runtime": 337.9471, - "eval_samples_per_second": 887.695, - "eval_steps_per_second": 55.482, + "eval_loss": 2.2690372467041016, + "eval_runtime": 332.4767, + "eval_samples_per_second": 902.319, + "eval_steps_per_second": 56.395, "step": 1104000 }, { "epoch": 3.12, - "eval_loss": 2.280796766281128, - "eval_runtime": 336.7832, - "eval_samples_per_second": 890.763, - "eval_steps_per_second": 55.674, + "eval_loss": 2.265334367752075, + "eval_runtime": 333.384, + "eval_samples_per_second": 899.863, + "eval_steps_per_second": 56.241, "step": 1112000 }, { "epoch": 3.14, "learning_rate": 2.1866666666666667e-07, - "loss": 2.4326, + "loss": 2.4044, "step": 1120000 }, { "epoch": 3.14, - "eval_loss": 2.2752950191497803, - "eval_runtime": 335.1287, - "eval_samples_per_second": 895.161, - "eval_steps_per_second": 55.949, + "eval_loss": 2.267867088317871, + "eval_runtime": 332.4491, + "eval_samples_per_second": 902.394, + "eval_steps_per_second": 56.4, "step": 1120000 }, { "epoch": 3.17, - "eval_loss": 2.2791523933410645, - "eval_runtime": 335.4799, - "eval_samples_per_second": 894.224, - "eval_steps_per_second": 55.89, + "eval_loss": 2.2657666206359863, + "eval_runtime": 337.9264, + "eval_samples_per_second": 887.767, + "eval_steps_per_second": 55.485, "step": 1128000 }, { "epoch": 3.19, "learning_rate": 2.1593333333333332e-07, - "loss": 2.4244, + "loss": 2.3996, "step": 1136000 }, { "epoch": 3.19, - "eval_loss": 2.27884578704834, - "eval_runtime": 335.4217, - "eval_samples_per_second": 894.379, - "eval_steps_per_second": 55.9, + "eval_loss": 2.2680134773254395, + "eval_runtime": 335.9795, + "eval_samples_per_second": 892.912, + "eval_steps_per_second": 55.807, "step": 1136000 }, { "epoch": 3.21, - "eval_loss": 2.282371997833252, - "eval_runtime": 336.2876, - "eval_samples_per_second": 892.076, - "eval_steps_per_second": 55.756, + "eval_loss": 2.26682186126709, + "eval_runtime": 332.0277, + "eval_samples_per_second": 903.539, + "eval_steps_per_second": 56.471, "step": 1144000 }, { "epoch": 3.23, "learning_rate": 2.132e-07, - "loss": 2.4285, + "loss": 2.3943, "step": 1152000 }, { "epoch": 3.23, - "eval_loss": 2.2799980640411377, - "eval_runtime": 335.867, - "eval_samples_per_second": 893.193, - "eval_steps_per_second": 55.826, + "eval_loss": 2.2689149379730225, + "eval_runtime": 332.3397, + "eval_samples_per_second": 902.691, + "eval_steps_per_second": 56.418, "step": 1152000 }, { "epoch": 3.26, - "eval_loss": 2.2783775329589844, - "eval_runtime": 335.6089, - "eval_samples_per_second": 893.88, - "eval_steps_per_second": 55.869, + "eval_loss": 2.2701900005340576, + "eval_runtime": 333.2287, + "eval_samples_per_second": 900.283, + "eval_steps_per_second": 56.268, "step": 1160000 }, { "epoch": 3.28, "learning_rate": 2.1046666666666666e-07, - "loss": 2.4371, + "loss": 2.3948, "step": 1168000 }, { "epoch": 3.28, - "eval_loss": 2.2675371170043945, - "eval_runtime": 335.7808, - "eval_samples_per_second": 893.422, - "eval_steps_per_second": 55.84, + "eval_loss": 2.2652790546417236, + "eval_runtime": 332.3733, + "eval_samples_per_second": 902.6, + "eval_steps_per_second": 56.412, "step": 1168000 }, { "epoch": 3.3, - "eval_loss": 2.2739932537078857, - "eval_runtime": 336.4769, - "eval_samples_per_second": 891.574, - "eval_steps_per_second": 55.724, + "eval_loss": 2.262141466140747, + "eval_runtime": 332.7579, + "eval_samples_per_second": 901.556, + "eval_steps_per_second": 56.347, "step": 1176000 }, { "epoch": 3.32, "learning_rate": 2.0773333333333334e-07, - "loss": 2.4273, + "loss": 2.4047, "step": 1184000 }, { "epoch": 3.32, - "eval_loss": 2.2804715633392334, - "eval_runtime": 339.219, - "eval_samples_per_second": 884.367, - "eval_steps_per_second": 55.274, + "eval_loss": 2.272305488586426, + "eval_runtime": 332.21, + "eval_samples_per_second": 903.043, + "eval_steps_per_second": 56.44, "step": 1184000 }, { "epoch": 3.35, - "eval_loss": 2.2848641872406006, - "eval_runtime": 336.5161, - "eval_samples_per_second": 891.47, - "eval_steps_per_second": 55.718, + "eval_loss": 2.271768808364868, + "eval_runtime": 334.301, + "eval_samples_per_second": 897.395, + "eval_steps_per_second": 56.087, "step": 1192000 }, { "epoch": 3.37, "learning_rate": 2.05e-07, - "loss": 2.4359, + "loss": 2.4057, "step": 1200000 }, { "epoch": 3.37, - "eval_loss": 2.2807633876800537, - "eval_runtime": 336.9191, - "eval_samples_per_second": 890.404, - "eval_steps_per_second": 55.651, + "eval_loss": 2.266768217086792, + "eval_runtime": 331.8859, + "eval_samples_per_second": 903.925, + "eval_steps_per_second": 56.495, "step": 1200000 }, { "epoch": 3.39, - "eval_loss": 2.2790510654449463, - "eval_runtime": 336.1666, - "eval_samples_per_second": 892.397, - "eval_steps_per_second": 55.776, + "eval_loss": 2.264948844909668, + "eval_runtime": 333.4261, + "eval_samples_per_second": 899.75, + "eval_steps_per_second": 56.234, "step": 1208000 }, { "epoch": 3.41, "learning_rate": 2.0226666666666668e-07, - "loss": 2.4303, + "loss": 2.3901, "step": 1216000 }, { "epoch": 3.41, - "eval_loss": 2.2729713916778564, - "eval_runtime": 337.949, - "eval_samples_per_second": 887.69, - "eval_steps_per_second": 55.482, + "eval_loss": 2.2699382305145264, + "eval_runtime": 334.7905, + "eval_samples_per_second": 896.083, + "eval_steps_per_second": 56.005, "step": 1216000 }, { "epoch": 3.44, - "eval_loss": 2.273223638534546, - "eval_runtime": 338.6051, - "eval_samples_per_second": 885.97, - "eval_steps_per_second": 55.374, + "eval_loss": 2.2682831287384033, + "eval_runtime": 335.082, + "eval_samples_per_second": 895.303, + "eval_steps_per_second": 55.956, "step": 1224000 }, { "epoch": 3.46, "learning_rate": 1.9953333333333333e-07, - "loss": 2.4306, + "loss": 2.3942, "step": 1232000 }, { "epoch": 3.46, - "eval_loss": 2.2784602642059326, - "eval_runtime": 338.5063, - "eval_samples_per_second": 886.229, - "eval_steps_per_second": 55.39, + "eval_loss": 2.2679033279418945, + "eval_runtime": 333.2769, + "eval_samples_per_second": 900.152, + "eval_steps_per_second": 56.26, "step": 1232000 }, { "epoch": 3.48, - "eval_loss": 2.2763924598693848, - "eval_runtime": 336.2284, - "eval_samples_per_second": 892.233, - "eval_steps_per_second": 55.766, + "eval_loss": 2.264688014984131, + "eval_runtime": 335.8312, + "eval_samples_per_second": 893.306, + "eval_steps_per_second": 55.832, "step": 1240000 }, { "epoch": 3.5, "learning_rate": 1.968e-07, - "loss": 2.4267, + "loss": 2.4052, "step": 1248000 }, { "epoch": 3.5, - "eval_loss": 2.2739803791046143, - "eval_runtime": 337.8728, - "eval_samples_per_second": 887.89, - "eval_steps_per_second": 55.494, + "eval_loss": 2.265596866607666, + "eval_runtime": 333.6068, + "eval_samples_per_second": 899.262, + "eval_steps_per_second": 56.204, "step": 1248000 }, { "epoch": 3.53, - "eval_loss": 2.2789418697357178, - "eval_runtime": 337.6123, - "eval_samples_per_second": 888.576, - "eval_steps_per_second": 55.537, + "eval_loss": 2.267854690551758, + "eval_runtime": 333.2939, + "eval_samples_per_second": 900.107, + "eval_steps_per_second": 56.257, "step": 1256000 }, { "epoch": 3.55, "learning_rate": 1.9406666666666667e-07, - "loss": 2.4271, + "loss": 2.401, "step": 1264000 }, { "epoch": 3.55, - "eval_loss": 2.277411937713623, - "eval_runtime": 337.0558, - "eval_samples_per_second": 890.043, - "eval_steps_per_second": 55.629, + "eval_loss": 2.268515110015869, + "eval_runtime": 332.5102, + "eval_samples_per_second": 902.228, + "eval_steps_per_second": 56.389, "step": 1264000 }, { "epoch": 3.57, - "eval_loss": 2.276827335357666, - "eval_runtime": 337.1407, - "eval_samples_per_second": 889.818, - "eval_steps_per_second": 55.615, + "eval_loss": 2.26540207862854, + "eval_runtime": 332.9978, + "eval_samples_per_second": 900.907, + "eval_steps_per_second": 56.307, "step": 1272000 }, { "epoch": 3.59, "learning_rate": 1.9133333333333333e-07, - "loss": 2.4263, + "loss": 2.4012, "step": 1280000 }, { "epoch": 3.59, - "eval_loss": 2.279576539993286, - "eval_runtime": 338.6094, - "eval_samples_per_second": 885.959, - "eval_steps_per_second": 55.374, + "eval_loss": 2.260671854019165, + "eval_runtime": 333.82, + "eval_samples_per_second": 898.688, + "eval_steps_per_second": 56.168, "step": 1280000 }, { "epoch": 3.62, - "eval_loss": 2.2759058475494385, - "eval_runtime": 337.2844, - "eval_samples_per_second": 889.439, - "eval_steps_per_second": 55.591, + "eval_loss": 2.2668306827545166, + "eval_runtime": 334.7781, + "eval_samples_per_second": 896.116, + "eval_steps_per_second": 56.007, "step": 1288000 }, { "epoch": 3.64, "learning_rate": 1.886e-07, - "loss": 2.431, + "loss": 2.4015, "step": 1296000 }, { "epoch": 3.64, - "eval_loss": 2.274071216583252, - "eval_runtime": 337.5903, - "eval_samples_per_second": 888.633, - "eval_steps_per_second": 55.541, + "eval_loss": 2.267199754714966, + "eval_runtime": 333.9129, + "eval_samples_per_second": 898.438, + "eval_steps_per_second": 56.152, "step": 1296000 }, { "epoch": 3.66, - "eval_loss": 2.282141923904419, - "eval_runtime": 337.5885, - "eval_samples_per_second": 888.638, - "eval_steps_per_second": 55.541, + "eval_loss": 2.268502712249756, + "eval_runtime": 334.246, + "eval_samples_per_second": 897.542, + "eval_steps_per_second": 56.096, "step": 1304000 }, { "epoch": 3.68, "learning_rate": 1.8586666666666666e-07, - "loss": 2.4273, + "loss": 2.4039, "step": 1312000 }, { "epoch": 3.68, - "eval_loss": 2.2739663124084473, - "eval_runtime": 337.7029, - "eval_samples_per_second": 888.337, - "eval_steps_per_second": 55.522, + "eval_loss": 2.267529010772705, + "eval_runtime": 333.8135, + "eval_samples_per_second": 898.705, + "eval_steps_per_second": 56.169, "step": 1312000 }, { "epoch": 3.71, - "eval_loss": 2.2712557315826416, - "eval_runtime": 337.356, - "eval_samples_per_second": 889.251, - "eval_steps_per_second": 55.579, + "eval_loss": 2.2702226638793945, + "eval_runtime": 336.4463, + "eval_samples_per_second": 891.673, + "eval_steps_per_second": 55.73, "step": 1320000 }, { "epoch": 3.73, "learning_rate": 1.8313333333333332e-07, - "loss": 2.4371, + "loss": 2.3927, "step": 1328000 }, { "epoch": 3.73, - "eval_loss": 2.2704272270202637, - "eval_runtime": 339.5591, - "eval_samples_per_second": 883.481, - "eval_steps_per_second": 55.219, + "eval_loss": 2.268892526626587, + "eval_runtime": 334.6454, + "eval_samples_per_second": 896.471, + "eval_steps_per_second": 56.029, "step": 1328000 }, { "epoch": 3.75, - "eval_loss": 2.273430109024048, - "eval_runtime": 339.8184, - "eval_samples_per_second": 882.807, - "eval_steps_per_second": 55.177, + "eval_loss": 2.2673678398132324, + "eval_runtime": 334.3792, + "eval_samples_per_second": 897.185, + "eval_steps_per_second": 56.074, "step": 1336000 }, { "epoch": 3.77, "learning_rate": 1.804e-07, - "loss": 2.4273, + "loss": 2.3998, "step": 1344000 }, { "epoch": 3.77, - "eval_loss": 2.2745580673217773, - "eval_runtime": 338.4911, - "eval_samples_per_second": 886.268, - "eval_steps_per_second": 55.393, + "eval_loss": 2.2693703174591064, + "eval_runtime": 336.7748, + "eval_samples_per_second": 890.803, + "eval_steps_per_second": 55.675, "step": 1344000 }, { "epoch": 3.8, - "eval_loss": 2.284024953842163, - "eval_runtime": 337.8579, - "eval_samples_per_second": 887.929, - "eval_steps_per_second": 55.497, + "eval_loss": 2.264862298965454, + "eval_runtime": 336.6189, + "eval_samples_per_second": 891.216, + "eval_steps_per_second": 55.701, "step": 1352000 }, { "epoch": 3.82, "learning_rate": 1.7766666666666666e-07, - "loss": 2.4246, + "loss": 2.404, "step": 1360000 }, { "epoch": 3.82, - "eval_loss": 2.2764360904693604, - "eval_runtime": 337.6899, - "eval_samples_per_second": 888.371, - "eval_steps_per_second": 55.524, + "eval_loss": 2.263476848602295, + "eval_runtime": 333.0441, + "eval_samples_per_second": 900.782, + "eval_steps_per_second": 56.299, "step": 1360000 }, { "epoch": 3.84, - "eval_loss": 2.274030923843384, - "eval_runtime": 338.4156, - "eval_samples_per_second": 886.466, - "eval_steps_per_second": 55.405, + "eval_loss": 2.2680845260620117, + "eval_runtime": 333.2221, + "eval_samples_per_second": 900.301, + "eval_steps_per_second": 56.269, "step": 1368000 }, { "epoch": 3.86, "learning_rate": 1.7493333333333334e-07, - "loss": 2.4308, + "loss": 2.4023, "step": 1376000 }, { "epoch": 3.86, - "eval_loss": 2.2730188369750977, - "eval_runtime": 338.8191, - "eval_samples_per_second": 885.411, - "eval_steps_per_second": 55.339, + "eval_loss": 2.260050058364868, + "eval_runtime": 333.6835, + "eval_samples_per_second": 899.056, + "eval_steps_per_second": 56.191, "step": 1376000 }, { "epoch": 3.88, - "eval_loss": 2.2750706672668457, - "eval_runtime": 338.5243, - "eval_samples_per_second": 886.182, - "eval_steps_per_second": 55.387, + "eval_loss": 2.2660913467407227, + "eval_runtime": 334.5678, + "eval_samples_per_second": 896.679, + "eval_steps_per_second": 56.042, "step": 1384000 }, { "epoch": 3.91, "learning_rate": 1.722e-07, - "loss": 2.4341, + "loss": 2.393, "step": 1392000 }, { "epoch": 3.91, - "eval_loss": 2.277709484100342, - "eval_runtime": 338.1761, - "eval_samples_per_second": 887.094, - "eval_steps_per_second": 55.444, + "eval_loss": 2.261288642883301, + "eval_runtime": 334.5524, + "eval_samples_per_second": 896.721, + "eval_steps_per_second": 56.045, "step": 1392000 }, { "epoch": 3.93, - "eval_loss": 2.267932653427124, - "eval_runtime": 340.4821, - "eval_samples_per_second": 881.086, - "eval_steps_per_second": 55.069, + "eval_loss": 2.271660327911377, + "eval_runtime": 334.4275, + "eval_samples_per_second": 897.055, + "eval_steps_per_second": 56.066, "step": 1400000 }, { "epoch": 3.95, "learning_rate": 1.6946666666666668e-07, - "loss": 2.4266, + "loss": 2.402, "step": 1408000 }, { "epoch": 3.95, - "eval_loss": 2.277672052383423, - "eval_runtime": 338.3287, - "eval_samples_per_second": 886.694, - "eval_steps_per_second": 55.419, + "eval_loss": 2.2671592235565186, + "eval_runtime": 333.6753, + "eval_samples_per_second": 899.078, + "eval_steps_per_second": 56.192, "step": 1408000 }, { "epoch": 3.97, - "eval_loss": 2.2783212661743164, - "eval_runtime": 338.5149, - "eval_samples_per_second": 886.206, - "eval_steps_per_second": 55.389, + "eval_loss": 2.263709545135498, + "eval_runtime": 333.67, + "eval_samples_per_second": 899.092, + "eval_steps_per_second": 56.193, "step": 1416000 }, { "epoch": 4.0, "learning_rate": 1.6673333333333333e-07, - "loss": 2.4344, + "loss": 2.4047, "step": 1424000 }, { "epoch": 4.0, - "eval_loss": 2.2742812633514404, - "eval_runtime": 339.2997, - "eval_samples_per_second": 884.156, - "eval_steps_per_second": 55.261, + "eval_loss": 2.2704622745513916, + "eval_runtime": 336.6456, + "eval_samples_per_second": 891.145, + "eval_steps_per_second": 55.697, "step": 1424000 }, { "epoch": 4.02, - "eval_loss": 2.2690906524658203, - "eval_runtime": 338.4672, - "eval_samples_per_second": 886.331, - "eval_steps_per_second": 55.397, + "eval_loss": 2.2682485580444336, + "eval_runtime": 337.2045, + "eval_samples_per_second": 889.668, + "eval_steps_per_second": 55.604, "step": 1432000 }, { "epoch": 4.04, "learning_rate": 1.64e-07, - "loss": 2.431, + "loss": 2.4045, "step": 1440000 }, { "epoch": 4.04, - "eval_loss": 2.2713911533355713, - "eval_runtime": 339.3847, - "eval_samples_per_second": 883.935, - "eval_steps_per_second": 55.247, + "eval_loss": 2.2630040645599365, + "eval_runtime": 335.66, + "eval_samples_per_second": 893.761, + "eval_steps_per_second": 55.86, "step": 1440000 }, { "epoch": 4.06, - "eval_loss": 2.2693912982940674, - "eval_runtime": 339.8425, - "eval_samples_per_second": 882.744, - "eval_steps_per_second": 55.173, + "eval_loss": 2.269909143447876, + "eval_runtime": 336.6708, + "eval_samples_per_second": 891.078, + "eval_steps_per_second": 55.692, "step": 1448000 }, { "epoch": 4.09, "learning_rate": 1.6126666666666667e-07, - "loss": 2.4296, + "loss": 2.3973, "step": 1456000 }, { "epoch": 4.09, - "eval_loss": 2.274890422821045, - "eval_runtime": 341.4559, - "eval_samples_per_second": 878.573, - "eval_steps_per_second": 54.912, + "eval_loss": 2.2578797340393066, + "eval_runtime": 335.7138, + "eval_samples_per_second": 893.618, + "eval_steps_per_second": 55.851, "step": 1456000 }, { "epoch": 4.11, - "eval_loss": 2.280956268310547, - "eval_runtime": 340.0142, - "eval_samples_per_second": 882.298, - "eval_steps_per_second": 55.145, + "eval_loss": 2.2601444721221924, + "eval_runtime": 334.2559, + "eval_samples_per_second": 897.516, + "eval_steps_per_second": 56.095, "step": 1464000 }, { "epoch": 4.13, "learning_rate": 1.5853333333333332e-07, - "loss": 2.4265, + "loss": 2.399, "step": 1472000 }, { "epoch": 4.13, - "eval_loss": 2.2743868827819824, - "eval_runtime": 341.6626, - "eval_samples_per_second": 878.042, - "eval_steps_per_second": 54.879, + "eval_loss": 2.26086688041687, + "eval_runtime": 334.4066, + "eval_samples_per_second": 897.112, + "eval_steps_per_second": 56.069, "step": 1472000 }, { "epoch": 4.15, - "eval_loss": 2.271418571472168, - "eval_runtime": 339.1644, - "eval_samples_per_second": 884.509, - "eval_steps_per_second": 55.283, + "eval_loss": 2.269728660583496, + "eval_runtime": 334.0805, + "eval_samples_per_second": 897.987, + "eval_steps_per_second": 56.124, "step": 1480000 }, { "epoch": 4.18, "learning_rate": 1.558e-07, - "loss": 2.4266, + "loss": 2.399, "step": 1488000 }, { "epoch": 4.18, - "eval_loss": 2.273254871368408, - "eval_runtime": 339.6628, - "eval_samples_per_second": 883.211, - "eval_steps_per_second": 55.202, + "eval_loss": 2.2630419731140137, + "eval_runtime": 334.5552, + "eval_samples_per_second": 896.713, + "eval_steps_per_second": 56.045, "step": 1488000 }, { "epoch": 4.2, - "eval_loss": 2.278977632522583, - "eval_runtime": 339.3751, - "eval_samples_per_second": 883.96, - "eval_steps_per_second": 55.249, + "eval_loss": 2.2658443450927734, + "eval_runtime": 336.5508, + "eval_samples_per_second": 891.396, + "eval_steps_per_second": 55.712, "step": 1496000 }, { "epoch": 4.22, "learning_rate": 1.5306666666666666e-07, - "loss": 2.4253, + "loss": 2.3995, "step": 1504000 }, { "epoch": 4.22, - "eval_loss": 2.27659273147583, - "eval_runtime": 339.267, - "eval_samples_per_second": 884.242, - "eval_steps_per_second": 55.266, + "eval_loss": 2.265606641769409, + "eval_runtime": 335.2841, + "eval_samples_per_second": 894.763, + "eval_steps_per_second": 55.923, "step": 1504000 }, { "epoch": 4.24, - "eval_loss": 2.276425361633301, - "eval_runtime": 339.521, - "eval_samples_per_second": 883.58, - "eval_steps_per_second": 55.225, + "eval_loss": 2.2688894271850586, + "eval_runtime": 337.311, + "eval_samples_per_second": 889.387, + "eval_steps_per_second": 55.587, "step": 1512000 }, { "epoch": 4.27, "learning_rate": 1.5033333333333332e-07, - "loss": 2.4303, + "loss": 2.3929, "step": 1520000 }, { "epoch": 4.27, - "eval_loss": 2.269226312637329, - "eval_runtime": 340.2122, - "eval_samples_per_second": 881.785, - "eval_steps_per_second": 55.113, + "eval_loss": 2.2678134441375732, + "eval_runtime": 337.3214, + "eval_samples_per_second": 889.359, + "eval_steps_per_second": 55.585, "step": 1520000 }, { "epoch": 4.29, - "eval_loss": 2.268404006958008, - "eval_runtime": 340.6392, - "eval_samples_per_second": 880.68, - "eval_steps_per_second": 55.044, + "eval_loss": 2.2694430351257324, + "eval_runtime": 336.6085, + "eval_samples_per_second": 891.243, + "eval_steps_per_second": 55.703, "step": 1528000 }, { "epoch": 4.31, "learning_rate": 1.476e-07, - "loss": 2.4373, + "loss": 2.404, "step": 1536000 }, { "epoch": 4.31, - "eval_loss": 2.275193929672241, - "eval_runtime": 341.5541, - "eval_samples_per_second": 878.321, - "eval_steps_per_second": 54.896, + "eval_loss": 2.2631914615631104, + "eval_runtime": 337.5687, + "eval_samples_per_second": 888.708, + "eval_steps_per_second": 55.544, "step": 1536000 }, { "epoch": 4.33, - "eval_loss": 2.270094633102417, - "eval_runtime": 341.2941, - "eval_samples_per_second": 878.99, - "eval_steps_per_second": 54.938, + "eval_loss": 2.2656803131103516, + "eval_runtime": 336.4606, + "eval_samples_per_second": 891.635, + "eval_steps_per_second": 55.727, "step": 1544000 }, { "epoch": 4.36, "learning_rate": 1.4486666666666665e-07, - "loss": 2.4346, + "loss": 2.3932, "step": 1552000 }, { "epoch": 4.36, - "eval_loss": 2.2757863998413086, - "eval_runtime": 340.0188, - "eval_samples_per_second": 882.287, - "eval_steps_per_second": 55.144, + "eval_loss": 2.2641873359680176, + "eval_runtime": 335.6292, + "eval_samples_per_second": 893.844, + "eval_steps_per_second": 55.865, "step": 1552000 }, { "epoch": 4.38, - "eval_loss": 2.2727184295654297, - "eval_runtime": 340.3133, - "eval_samples_per_second": 881.523, - "eval_steps_per_second": 55.096, + "eval_loss": 2.260714054107666, + "eval_runtime": 335.5993, + "eval_samples_per_second": 893.923, + "eval_steps_per_second": 55.87, "step": 1560000 }, { "epoch": 4.4, "learning_rate": 1.4213333333333334e-07, - "loss": 2.4294, + "loss": 2.3985, "step": 1568000 }, { "epoch": 4.4, - "eval_loss": 2.2752585411071777, - "eval_runtime": 340.0045, - "eval_samples_per_second": 882.324, - "eval_steps_per_second": 55.146, + "eval_loss": 2.2634730339050293, + "eval_runtime": 335.566, + "eval_samples_per_second": 894.012, + "eval_steps_per_second": 55.876, "step": 1568000 }, { "epoch": 4.42, - "eval_loss": 2.2686994075775146, - "eval_runtime": 340.2528, - "eval_samples_per_second": 881.68, - "eval_steps_per_second": 55.106, + "eval_loss": 2.2645463943481445, + "eval_runtime": 337.3641, + "eval_samples_per_second": 889.247, + "eval_steps_per_second": 55.578, "step": 1576000 }, { "epoch": 4.45, "learning_rate": 1.3940000000000002e-07, - "loss": 2.439, + "loss": 2.3997, "step": 1584000 }, { "epoch": 4.45, - "eval_loss": 2.2775542736053467, - "eval_runtime": 342.6447, - "eval_samples_per_second": 875.525, - "eval_steps_per_second": 54.721, + "eval_loss": 2.2654054164886475, + "eval_runtime": 336.173, + "eval_samples_per_second": 892.398, + "eval_steps_per_second": 55.775, "step": 1584000 }, { "epoch": 4.47, - "eval_loss": 2.274559497833252, - "eval_runtime": 342.3787, - "eval_samples_per_second": 876.205, - "eval_steps_per_second": 54.764, + "eval_loss": 2.2672231197357178, + "eval_runtime": 336.1452, + "eval_samples_per_second": 892.472, + "eval_steps_per_second": 55.779, "step": 1592000 }, { "epoch": 4.49, "learning_rate": 1.3666666666666665e-07, - "loss": 2.4337, + "loss": 2.396, "step": 1600000 }, { "epoch": 4.49, - "eval_loss": 2.2730839252471924, - "eval_runtime": 340.9451, - "eval_samples_per_second": 879.889, - "eval_steps_per_second": 54.994, + "eval_loss": 2.2665934562683105, + "eval_runtime": 336.5057, + "eval_samples_per_second": 891.515, + "eval_steps_per_second": 55.72, "step": 1600000 }, { "epoch": 4.51, - "eval_loss": 2.2721805572509766, - "eval_runtime": 342.0012, - "eval_samples_per_second": 877.172, - "eval_steps_per_second": 54.824, + "eval_loss": 2.2708349227905273, + "eval_runtime": 335.6471, + "eval_samples_per_second": 893.796, + "eval_steps_per_second": 55.862, "step": 1608000 }, { "epoch": 4.54, "learning_rate": 1.3393333333333333e-07, - "loss": 2.4273, + "loss": 2.4012, "step": 1616000 }, { "epoch": 4.54, - "eval_loss": 2.270340919494629, - "eval_runtime": 341.2582, - "eval_samples_per_second": 879.082, - "eval_steps_per_second": 54.944, + "eval_loss": 2.2706656455993652, + "eval_runtime": 335.6113, + "eval_samples_per_second": 893.891, + "eval_steps_per_second": 55.868, "step": 1616000 }, { "epoch": 4.56, - "eval_loss": 2.2802205085754395, - "eval_runtime": 341.9475, - "eval_samples_per_second": 877.31, - "eval_steps_per_second": 54.833, + "eval_loss": 2.2683677673339844, + "eval_runtime": 335.9133, + "eval_samples_per_second": 893.087, + "eval_steps_per_second": 55.818, "step": 1624000 }, { "epoch": 4.58, "learning_rate": 1.312e-07, - "loss": 2.4275, + "loss": 2.4074, "step": 1632000 }, { "epoch": 4.58, - "eval_loss": 2.2707149982452393, - "eval_runtime": 341.3433, - "eval_samples_per_second": 878.863, - "eval_steps_per_second": 54.93, + "eval_loss": 2.2676126956939697, + "eval_runtime": 336.2793, + "eval_samples_per_second": 892.116, + "eval_steps_per_second": 55.757, "step": 1632000 }, { "epoch": 4.6, - "eval_loss": 2.270657777786255, - "eval_runtime": 342.3544, - "eval_samples_per_second": 876.267, - "eval_steps_per_second": 54.768, + "eval_loss": 2.2657711505889893, + "eval_runtime": 336.5159, + "eval_samples_per_second": 891.488, + "eval_steps_per_second": 55.718, "step": 1640000 }, { "epoch": 4.63, "learning_rate": 1.2846666666666667e-07, - "loss": 2.4201, + "loss": 2.3965, "step": 1648000 }, { "epoch": 4.63, - "eval_loss": 2.268555164337158, - "eval_runtime": 343.0641, - "eval_samples_per_second": 874.455, - "eval_steps_per_second": 54.655, + "eval_loss": 2.2716164588928223, + "eval_runtime": 335.6672, + "eval_samples_per_second": 893.742, + "eval_steps_per_second": 55.859, "step": 1648000 }, { "epoch": 4.65, - "eval_loss": 2.2706844806671143, - "eval_runtime": 343.4762, - "eval_samples_per_second": 873.405, - "eval_steps_per_second": 54.589, + "eval_loss": 2.2655858993530273, + "eval_runtime": 335.9521, + "eval_samples_per_second": 892.984, + "eval_steps_per_second": 55.812, "step": 1656000 }, { "epoch": 4.67, "learning_rate": 1.2573333333333332e-07, - "loss": 2.4319, + "loss": 2.4021, "step": 1664000 }, { "epoch": 4.67, - "eval_loss": 2.2739858627319336, - "eval_runtime": 346.4999, - "eval_samples_per_second": 865.784, - "eval_steps_per_second": 54.113, + "eval_loss": 2.2689690589904785, + "eval_runtime": 336.4235, + "eval_samples_per_second": 891.733, + "eval_steps_per_second": 55.733, "step": 1664000 }, { "epoch": 4.69, - "eval_loss": 2.2696831226348877, - "eval_runtime": 344.1357, - "eval_samples_per_second": 871.732, - "eval_steps_per_second": 54.484, + "eval_loss": 2.265604257583618, + "eval_runtime": 337.1771, + "eval_samples_per_second": 889.74, + "eval_steps_per_second": 55.609, "step": 1672000 }, { "epoch": 4.72, "learning_rate": 1.23e-07, - "loss": 2.4314, + "loss": 2.3981, "step": 1680000 }, { "epoch": 4.72, - "eval_loss": 2.2746658325195312, - "eval_runtime": 342.9467, - "eval_samples_per_second": 874.754, - "eval_steps_per_second": 54.673, + "eval_loss": 2.2659354209899902, + "eval_runtime": 337.0582, + "eval_samples_per_second": 890.054, + "eval_steps_per_second": 55.628, "step": 1680000 }, { "epoch": 4.74, - "eval_loss": 2.2693660259246826, - "eval_runtime": 345.5224, - "eval_samples_per_second": 868.233, - "eval_steps_per_second": 54.266, + "eval_loss": 2.2666890621185303, + "eval_runtime": 336.7986, + "eval_samples_per_second": 890.74, + "eval_steps_per_second": 55.671, "step": 1688000 }, { "epoch": 4.76, "learning_rate": 1.2026666666666666e-07, - "loss": 2.4242, + "loss": 2.3974, "step": 1696000 }, { "epoch": 4.76, - "eval_loss": 2.2731966972351074, - "eval_runtime": 346.3015, - "eval_samples_per_second": 866.28, - "eval_steps_per_second": 54.144, + "eval_loss": 2.2654528617858887, + "eval_runtime": 338.6552, + "eval_samples_per_second": 885.857, + "eval_steps_per_second": 55.366, "step": 1696000 }, { "epoch": 4.78, - "eval_loss": 2.272595167160034, - "eval_runtime": 348.6322, - "eval_samples_per_second": 860.488, - "eval_steps_per_second": 53.782, + "eval_loss": 2.2675693035125732, + "eval_runtime": 336.4191, + "eval_samples_per_second": 891.745, + "eval_steps_per_second": 55.734, "step": 1704000 }, { "epoch": 4.81, "learning_rate": 1.1753333333333334e-07, - "loss": 2.4302, + "loss": 2.3964, "step": 1712000 }, { "epoch": 4.81, - "eval_loss": 2.2703990936279297, - "eval_runtime": 344.7163, - "eval_samples_per_second": 870.263, - "eval_steps_per_second": 54.393, + "eval_loss": 2.265490770339966, + "eval_runtime": 338.7304, + "eval_samples_per_second": 885.66, + "eval_steps_per_second": 55.354, "step": 1712000 }, { "epoch": 4.83, - "eval_loss": 2.2755091190338135, - "eval_runtime": 342.7534, - "eval_samples_per_second": 875.247, - "eval_steps_per_second": 54.704, + "eval_loss": 2.2635693550109863, + "eval_runtime": 337.2341, + "eval_samples_per_second": 889.59, + "eval_steps_per_second": 55.599, "step": 1720000 }, { "epoch": 4.85, "learning_rate": 1.1480000000000001e-07, - "loss": 2.4375, + "loss": 2.3933, "step": 1728000 }, { "epoch": 4.85, - "eval_loss": 2.270075559616089, - "eval_runtime": 342.9559, - "eval_samples_per_second": 874.73, - "eval_steps_per_second": 54.672, + "eval_loss": 2.267894983291626, + "eval_runtime": 337.1638, + "eval_samples_per_second": 889.775, + "eval_steps_per_second": 55.611, "step": 1728000 }, { "epoch": 4.87, - "eval_loss": 2.2719573974609375, - "eval_runtime": 342.2711, - "eval_samples_per_second": 876.481, - "eval_steps_per_second": 54.781, + "eval_loss": 2.266650438308716, + "eval_runtime": 337.1959, + "eval_samples_per_second": 889.69, + "eval_steps_per_second": 55.606, "step": 1736000 }, { "epoch": 4.9, "learning_rate": 1.1206666666666666e-07, - "loss": 2.4305, + "loss": 2.4066, "step": 1744000 }, { "epoch": 4.9, - "eval_loss": 2.2697696685791016, - "eval_runtime": 342.9739, - "eval_samples_per_second": 874.685, - "eval_steps_per_second": 54.669, + "eval_loss": 2.264688730239868, + "eval_runtime": 338.0924, + "eval_samples_per_second": 887.331, + "eval_steps_per_second": 55.458, "step": 1744000 }, { "epoch": 4.92, - "eval_loss": 2.272111415863037, - "eval_runtime": 344.9176, - "eval_samples_per_second": 869.755, - "eval_steps_per_second": 54.361, + "eval_loss": 2.265735149383545, + "eval_runtime": 338.8846, + "eval_samples_per_second": 885.257, + "eval_steps_per_second": 55.329, "step": 1752000 }, { "epoch": 4.94, "learning_rate": 1.0933333333333333e-07, - "loss": 2.4353, + "loss": 2.4027, "step": 1760000 }, { "epoch": 4.94, - "eval_loss": 2.2751970291137695, - "eval_runtime": 344.6056, - "eval_samples_per_second": 870.543, - "eval_steps_per_second": 54.41, + "eval_loss": 2.2628121376037598, + "eval_runtime": 337.9881, + "eval_samples_per_second": 887.605, + "eval_steps_per_second": 55.475, "step": 1760000 }, { "epoch": 4.96, - "eval_loss": 2.2763051986694336, - "eval_runtime": 344.4917, - "eval_samples_per_second": 870.831, - "eval_steps_per_second": 54.428, + "eval_loss": 2.2642323970794678, + "eval_runtime": 339.1796, + "eval_samples_per_second": 884.487, + "eval_steps_per_second": 55.28, "step": 1768000 }, { "epoch": 4.99, "learning_rate": 1.066e-07, - "loss": 2.4274, + "loss": 2.4029, "step": 1776000 }, { "epoch": 4.99, - "eval_loss": 2.2746589183807373, - "eval_runtime": 345.8714, - "eval_samples_per_second": 867.357, - "eval_steps_per_second": 54.211, + "eval_loss": 2.2676889896392822, + "eval_runtime": 338.3313, + "eval_samples_per_second": 886.705, + "eval_steps_per_second": 55.419, "step": 1776000 }, { "epoch": 5.01, - "eval_loss": 2.277564764022827, - "eval_runtime": 344.7831, - "eval_samples_per_second": 870.095, - "eval_steps_per_second": 54.382, + "eval_loss": 2.2704169750213623, + "eval_runtime": 340.3735, + "eval_samples_per_second": 881.385, + "eval_steps_per_second": 55.087, "step": 1784000 }, { "epoch": 5.03, "learning_rate": 1.0386666666666667e-07, - "loss": 2.4234, + "loss": 2.3958, "step": 1792000 }, { "epoch": 5.03, - "eval_loss": 2.2705652713775635, - "eval_runtime": 345.9447, - "eval_samples_per_second": 867.173, - "eval_steps_per_second": 54.199, + "eval_loss": 2.2650022506713867, + "eval_runtime": 337.884, + "eval_samples_per_second": 887.879, + "eval_steps_per_second": 55.492, "step": 1792000 }, { "epoch": 5.05, - "eval_loss": 2.2719192504882812, - "eval_runtime": 345.7061, - "eval_samples_per_second": 867.772, - "eval_steps_per_second": 54.237, + "eval_loss": 2.265009880065918, + "eval_runtime": 339.0311, + "eval_samples_per_second": 884.875, + "eval_steps_per_second": 55.305, "step": 1800000 }, { "epoch": 5.08, "learning_rate": 1.0113333333333334e-07, - "loss": 2.4304, + "loss": 2.4054, "step": 1808000 }, { "epoch": 5.08, - "eval_loss": 2.2667484283447266, - "eval_runtime": 344.5475, - "eval_samples_per_second": 870.69, - "eval_steps_per_second": 54.419, + "eval_loss": 2.2680423259735107, + "eval_runtime": 338.3773, + "eval_samples_per_second": 886.584, + "eval_steps_per_second": 55.412, "step": 1808000 }, { "epoch": 5.1, - "eval_loss": 2.276196241378784, - "eval_runtime": 342.6831, - "eval_samples_per_second": 875.427, - "eval_steps_per_second": 54.715, + "eval_loss": 2.2601048946380615, + "eval_runtime": 338.8902, + "eval_samples_per_second": 885.243, + "eval_steps_per_second": 55.328, "step": 1816000 }, { "epoch": 5.12, "learning_rate": 9.84e-08, - "loss": 2.4308, + "loss": 2.3984, "step": 1824000 }, { "epoch": 5.12, - "eval_loss": 2.27565860748291, - "eval_runtime": 344.3771, - "eval_samples_per_second": 871.121, - "eval_steps_per_second": 54.446, + "eval_loss": 2.267129898071289, + "eval_runtime": 341.218, + "eval_samples_per_second": 879.203, + "eval_steps_per_second": 54.95, "step": 1824000 }, { "epoch": 5.14, - "eval_loss": 2.27123761177063, - "eval_runtime": 343.6662, - "eval_samples_per_second": 872.923, - "eval_steps_per_second": 54.559, + "eval_loss": 2.263897657394409, + "eval_runtime": 339.0811, + "eval_samples_per_second": 884.744, + "eval_steps_per_second": 55.296, "step": 1832000 }, { "epoch": 5.16, "learning_rate": 9.566666666666666e-08, - "loss": 2.4342, + "loss": 2.4005, "step": 1840000 }, { "epoch": 5.16, - "eval_loss": 2.267634868621826, - "eval_runtime": 343.6527, - "eval_samples_per_second": 872.957, - "eval_steps_per_second": 54.561, + "eval_loss": 2.262948989868164, + "eval_runtime": 338.4625, + "eval_samples_per_second": 886.361, + "eval_steps_per_second": 55.398, "step": 1840000 }, { "epoch": 5.19, - "eval_loss": 2.273836851119995, - "eval_runtime": 346.2597, - "eval_samples_per_second": 866.384, - "eval_steps_per_second": 54.15, + "eval_loss": 2.2656354904174805, + "eval_runtime": 339.1914, + "eval_samples_per_second": 884.456, + "eval_steps_per_second": 55.279, "step": 1848000 }, { "epoch": 5.21, "learning_rate": 9.293333333333333e-08, - "loss": 2.4342, + "loss": 2.3962, "step": 1856000 }, { "epoch": 5.21, - "eval_loss": 2.2754852771759033, - "eval_runtime": 343.1416, - "eval_samples_per_second": 874.257, - "eval_steps_per_second": 54.642, + "eval_loss": 2.2646210193634033, + "eval_runtime": 339.4764, + "eval_samples_per_second": 883.714, + "eval_steps_per_second": 55.232, "step": 1856000 }, { "epoch": 5.23, - "eval_loss": 2.274082899093628, - "eval_runtime": 343.2282, - "eval_samples_per_second": 874.037, - "eval_steps_per_second": 54.628, + "eval_loss": 2.2571327686309814, + "eval_runtime": 340.4494, + "eval_samples_per_second": 881.188, + "eval_steps_per_second": 55.074, "step": 1864000 }, { "epoch": 5.25, "learning_rate": 9.02e-08, - "loss": 2.4329, + "loss": 2.4033, "step": 1872000 }, { "epoch": 5.25, - "eval_loss": 2.2734124660491943, - "eval_runtime": 346.8274, - "eval_samples_per_second": 864.966, - "eval_steps_per_second": 54.061, + "eval_loss": 2.2689077854156494, + "eval_runtime": 339.6348, + "eval_samples_per_second": 883.302, + "eval_steps_per_second": 55.206, "step": 1872000 }, { "epoch": 5.28, - "eval_loss": 2.27142596244812, - "eval_runtime": 344.6072, - "eval_samples_per_second": 870.539, - "eval_steps_per_second": 54.41, + "eval_loss": 2.263167381286621, + "eval_runtime": 340.3091, + "eval_samples_per_second": 881.552, + "eval_steps_per_second": 55.097, "step": 1880000 }, { "epoch": 5.3, "learning_rate": 8.746666666666667e-08, - "loss": 2.4306, + "loss": 2.4064, "step": 1888000 }, { "epoch": 5.3, - "eval_loss": 2.272188663482666, - "eval_runtime": 345.5379, - "eval_samples_per_second": 868.194, - "eval_steps_per_second": 54.263, + "eval_loss": 2.2632765769958496, + "eval_runtime": 342.5582, + "eval_samples_per_second": 875.764, + "eval_steps_per_second": 54.735, "step": 1888000 }, { "epoch": 5.32, - "eval_loss": 2.270195484161377, - "eval_runtime": 344.8128, - "eval_samples_per_second": 870.02, - "eval_steps_per_second": 54.377, + "eval_loss": 2.2693655490875244, + "eval_runtime": 342.7491, + "eval_samples_per_second": 875.276, + "eval_steps_per_second": 54.705, "step": 1896000 }, { "epoch": 5.34, "learning_rate": 8.473333333333334e-08, - "loss": 2.4302, + "loss": 2.3967, "step": 1904000 }, { "epoch": 5.34, - "eval_loss": 2.276052713394165, - "eval_runtime": 343.8201, - "eval_samples_per_second": 872.532, - "eval_steps_per_second": 54.534, + "eval_loss": 2.2685184478759766, + "eval_runtime": 342.158, + "eval_samples_per_second": 876.788, + "eval_steps_per_second": 54.799, "step": 1904000 }, { "epoch": 5.37, - "eval_loss": 2.2747642993927, - "eval_runtime": 343.5822, - "eval_samples_per_second": 873.136, - "eval_steps_per_second": 54.572, + "eval_loss": 2.2636401653289795, + "eval_runtime": 341.2652, + "eval_samples_per_second": 879.082, + "eval_steps_per_second": 54.943, "step": 1912000 }, { "epoch": 5.39, "learning_rate": 8.2e-08, - "loss": 2.4303, + "loss": 2.4002, "step": 1920000 }, { "epoch": 5.39, - "eval_loss": 2.2763144969940186, - "eval_runtime": 343.8699, - "eval_samples_per_second": 872.406, - "eval_steps_per_second": 54.526, + "eval_loss": 2.268721103668213, + "eval_runtime": 343.2554, + "eval_samples_per_second": 873.985, + "eval_steps_per_second": 54.624, "step": 1920000 }, { "epoch": 5.41, - "eval_loss": 2.2730941772460938, - "eval_runtime": 343.83, - "eval_samples_per_second": 872.507, - "eval_steps_per_second": 54.533, + "eval_loss": 2.263157844543457, + "eval_runtime": 341.2197, + "eval_samples_per_second": 879.199, + "eval_steps_per_second": 54.95, "step": 1928000 }, { "epoch": 5.43, "learning_rate": 7.926666666666666e-08, - "loss": 2.4234, + "loss": 2.4045, "step": 1936000 }, { "epoch": 5.43, - "eval_loss": 2.2676327228546143, - "eval_runtime": 346.6045, - "eval_samples_per_second": 865.523, - "eval_steps_per_second": 54.096, + "eval_loss": 2.262470006942749, + "eval_runtime": 342.6853, + "eval_samples_per_second": 875.439, + "eval_steps_per_second": 54.715, "step": 1936000 }, { "epoch": 5.46, - "eval_loss": 2.275022268295288, - "eval_runtime": 343.8317, - "eval_samples_per_second": 872.502, - "eval_steps_per_second": 54.532, + "eval_loss": 2.267735242843628, + "eval_runtime": 346.6665, + "eval_samples_per_second": 865.385, + "eval_steps_per_second": 54.087, "step": 1944000 }, { "epoch": 5.48, "learning_rate": 7.653333333333333e-08, - "loss": 2.4349, + "loss": 2.4096, "step": 1952000 }, { "epoch": 5.48, - "eval_loss": 2.276860475540161, - "eval_runtime": 344.8812, - "eval_samples_per_second": 869.847, - "eval_steps_per_second": 54.367, + "eval_loss": 2.256277322769165, + "eval_runtime": 340.6214, + "eval_samples_per_second": 880.743, + "eval_steps_per_second": 55.046, "step": 1952000 }, { "epoch": 5.5, - "eval_loss": 2.2728497982025146, - "eval_runtime": 345.0042, - "eval_samples_per_second": 869.537, - "eval_steps_per_second": 54.347, + "eval_loss": 2.264164447784424, + "eval_runtime": 341.931, + "eval_samples_per_second": 877.37, + "eval_steps_per_second": 54.836, "step": 1960000 }, { "epoch": 5.52, "learning_rate": 7.38e-08, - "loss": 2.4295, + "loss": 2.4004, "step": 1968000 }, { "epoch": 5.52, - "eval_loss": 2.275022506713867, - "eval_runtime": 344.3706, - "eval_samples_per_second": 871.137, - "eval_steps_per_second": 54.447, + "eval_loss": 2.269155979156494, + "eval_runtime": 342.3742, + "eval_samples_per_second": 876.234, + "eval_steps_per_second": 54.765, "step": 1968000 }, { "epoch": 5.55, - "eval_loss": 2.270230531692505, - "eval_runtime": 344.3401, - "eval_samples_per_second": 871.214, - "eval_steps_per_second": 54.452, + "eval_loss": 2.2696123123168945, + "eval_runtime": 345.6816, + "eval_samples_per_second": 867.851, + "eval_steps_per_second": 54.241, "step": 1976000 }, { "epoch": 5.57, "learning_rate": 7.106666666666667e-08, - "loss": 2.428, + "loss": 2.4065, "step": 1984000 }, { "epoch": 5.57, - "eval_loss": 2.2729129791259766, - "eval_runtime": 346.4482, - "eval_samples_per_second": 865.913, - "eval_steps_per_second": 54.121, + "eval_loss": 2.2579238414764404, + "eval_runtime": 341.8896, + "eval_samples_per_second": 877.476, + "eval_steps_per_second": 54.842, "step": 1984000 }, { "epoch": 5.59, - "eval_loss": 2.2706665992736816, - "eval_runtime": 344.2826, - "eval_samples_per_second": 871.36, - "eval_steps_per_second": 54.461, + "eval_loss": 2.266026020050049, + "eval_runtime": 344.4173, + "eval_samples_per_second": 871.036, + "eval_steps_per_second": 54.44, "step": 1992000 }, { "epoch": 5.61, "learning_rate": 6.833333333333332e-08, - "loss": 2.4336, + "loss": 2.4025, "step": 2000000 }, { "epoch": 5.61, - "eval_loss": 2.277449607849121, - "eval_runtime": 346.3633, - "eval_samples_per_second": 866.125, - "eval_steps_per_second": 54.134, + "eval_loss": 2.2654054164886475, + "eval_runtime": 342.2708, + "eval_samples_per_second": 876.499, + "eval_steps_per_second": 54.781, "step": 2000000 }, { "epoch": 5.64, - "eval_loss": 2.273486375808716, - "eval_runtime": 345.4177, - "eval_samples_per_second": 868.496, - "eval_steps_per_second": 54.282, + "eval_loss": 2.2706494331359863, + "eval_runtime": 341.5445, + "eval_samples_per_second": 878.363, + "eval_steps_per_second": 54.898, "step": 2008000 }, { "epoch": 5.66, "learning_rate": 6.56e-08, - "loss": 2.4332, + "loss": 2.3993, "step": 2016000 }, { "epoch": 5.66, - "eval_loss": 2.2634286880493164, - "eval_runtime": 344.8659, - "eval_samples_per_second": 869.886, - "eval_steps_per_second": 54.369, + "eval_loss": 2.270448684692383, + "eval_runtime": 340.9974, + "eval_samples_per_second": 879.772, + "eval_steps_per_second": 54.986, "step": 2016000 }, { "epoch": 5.68, - "eval_loss": 2.2678945064544678, - "eval_runtime": 345.3228, - "eval_samples_per_second": 868.735, - "eval_steps_per_second": 54.297, + "eval_loss": 2.2663590908050537, + "eval_runtime": 340.7056, + "eval_samples_per_second": 880.526, + "eval_steps_per_second": 55.033, "step": 2024000 }, { "epoch": 5.7, "learning_rate": 6.286666666666666e-08, - "loss": 2.4342, + "loss": 2.4034, "step": 2032000 }, { "epoch": 5.7, - "eval_loss": 2.2753427028656006, - "eval_runtime": 345.298, - "eval_samples_per_second": 868.797, - "eval_steps_per_second": 54.301, + "eval_loss": 2.2659454345703125, + "eval_runtime": 341.9489, + "eval_samples_per_second": 877.324, + "eval_steps_per_second": 54.833, "step": 2032000 }, { "epoch": 5.73, - "eval_loss": 2.271911382675171, - "eval_runtime": 346.7418, - "eval_samples_per_second": 865.18, - "eval_steps_per_second": 54.075, + "eval_loss": 2.268005609512329, + "eval_runtime": 340.8655, + "eval_samples_per_second": 880.113, + "eval_steps_per_second": 55.007, "step": 2040000 }, { "epoch": 5.75, "learning_rate": 6.013333333333333e-08, - "loss": 2.4279, + "loss": 2.4004, "step": 2048000 }, { "epoch": 5.75, - "eval_loss": 2.271139621734619, - "eval_runtime": 345.3244, - "eval_samples_per_second": 868.731, - "eval_steps_per_second": 54.297, + "eval_loss": 2.2611002922058105, + "eval_runtime": 340.9511, + "eval_samples_per_second": 879.891, + "eval_steps_per_second": 54.993, "step": 2048000 }, { "epoch": 5.77, - "eval_loss": 2.277822256088257, - "eval_runtime": 346.0842, - "eval_samples_per_second": 866.824, - "eval_steps_per_second": 54.178, + "eval_loss": 2.264587879180908, + "eval_runtime": 342.5116, + "eval_samples_per_second": 875.883, + "eval_steps_per_second": 54.743, "step": 2056000 }, { "epoch": 5.79, "learning_rate": 5.7400000000000004e-08, - "loss": 2.4281, + "loss": 2.4025, "step": 2064000 }, { "epoch": 5.79, - "eval_loss": 2.2693228721618652, - "eval_runtime": 345.3065, - "eval_samples_per_second": 868.776, - "eval_steps_per_second": 54.3, + "eval_loss": 2.268247604370117, + "eval_runtime": 343.4269, + "eval_samples_per_second": 873.548, + "eval_steps_per_second": 54.597, "step": 2064000 }, { "epoch": 5.82, - "eval_loss": 2.271515369415283, - "eval_runtime": 346.2536, - "eval_samples_per_second": 866.4, - "eval_steps_per_second": 54.151, + "eval_loss": 2.264587640762329, + "eval_runtime": 341.3392, + "eval_samples_per_second": 878.891, + "eval_steps_per_second": 54.931, "step": 2072000 }, { "epoch": 5.84, "learning_rate": 5.4666666666666666e-08, - "loss": 2.4246, + "loss": 2.4063, "step": 2080000 }, { "epoch": 5.84, - "eval_loss": 2.2674171924591064, - "eval_runtime": 345.4527, - "eval_samples_per_second": 868.408, - "eval_steps_per_second": 54.277, + "eval_loss": 2.2597994804382324, + "eval_runtime": 343.1178, + "eval_samples_per_second": 874.335, + "eval_steps_per_second": 54.646, "step": 2080000 }, { "epoch": 5.86, - "eval_loss": 2.2699599266052246, - "eval_runtime": 345.7915, - "eval_samples_per_second": 867.557, - "eval_steps_per_second": 54.223, + "eval_loss": 2.267334461212158, + "eval_runtime": 344.4059, + "eval_samples_per_second": 871.065, + "eval_steps_per_second": 54.442, "step": 2088000 }, { "epoch": 5.88, "learning_rate": 5.1933333333333335e-08, - "loss": 2.4235, + "loss": 2.4071, "step": 2096000 }, { "epoch": 5.88, - "eval_loss": 2.270324230194092, - "eval_runtime": 347.3105, - "eval_samples_per_second": 863.763, - "eval_steps_per_second": 53.986, + "eval_loss": 2.264587879180908, + "eval_runtime": 342.5952, + "eval_samples_per_second": 875.669, + "eval_steps_per_second": 54.729, "step": 2096000 }, { "epoch": 5.91, - "eval_loss": 2.272321939468384, - "eval_runtime": 347.0148, - "eval_samples_per_second": 864.499, - "eval_steps_per_second": 54.032, + "eval_loss": 2.2672042846679688, + "eval_runtime": 342.3657, + "eval_samples_per_second": 876.256, + "eval_steps_per_second": 54.766, "step": 2104000 }, { "epoch": 5.93, "learning_rate": 4.92e-08, - "loss": 2.4388, + "loss": 2.401, "step": 2112000 }, { "epoch": 5.93, - "eval_loss": 2.268273115158081, - "eval_runtime": 346.0854, - "eval_samples_per_second": 866.821, - "eval_steps_per_second": 54.177, + "eval_loss": 2.2647833824157715, + "eval_runtime": 343.2309, + "eval_samples_per_second": 874.047, + "eval_steps_per_second": 54.628, "step": 2112000 }, { "epoch": 5.95, - "eval_loss": 2.2712411880493164, - "eval_runtime": 346.9622, - "eval_samples_per_second": 864.63, - "eval_steps_per_second": 54.04, + "eval_loss": 2.2654144763946533, + "eval_runtime": 344.1951, + "eval_samples_per_second": 871.599, + "eval_steps_per_second": 54.475, "step": 2120000 }, { "epoch": 5.97, "learning_rate": 4.6466666666666666e-08, - "loss": 2.431, + "loss": 2.402, "step": 2128000 }, { "epoch": 5.97, - "eval_loss": 2.27392578125, - "eval_runtime": 346.2653, - "eval_samples_per_second": 866.37, - "eval_steps_per_second": 54.149, + "eval_loss": 2.2664010524749756, + "eval_runtime": 342.7081, + "eval_samples_per_second": 875.381, + "eval_steps_per_second": 54.711, "step": 2128000 }, { "epoch": 6.0, - "eval_loss": 2.2757456302642822, - "eval_runtime": 346.0244, - "eval_samples_per_second": 866.973, - "eval_steps_per_second": 54.187, + "eval_loss": 2.2682883739471436, + "eval_runtime": 342.1336, + "eval_samples_per_second": 876.851, + "eval_steps_per_second": 54.803, "step": 2136000 }, { "epoch": 6.02, "learning_rate": 4.3733333333333335e-08, - "loss": 2.4329, + "loss": 2.4004, "step": 2144000 }, { "epoch": 6.02, - "eval_loss": 2.2785327434539795, - "eval_runtime": 346.851, - "eval_samples_per_second": 864.907, - "eval_steps_per_second": 54.058, + "eval_loss": 2.261821985244751, + "eval_runtime": 343.7815, + "eval_samples_per_second": 872.647, + "eval_steps_per_second": 54.54, "step": 2144000 }, { "epoch": 6.04, - "eval_loss": 2.2720842361450195, - "eval_runtime": 346.7603, - "eval_samples_per_second": 865.134, - "eval_steps_per_second": 54.072, + "eval_loss": 2.2668938636779785, + "eval_runtime": 344.1074, + "eval_samples_per_second": 871.821, + "eval_steps_per_second": 54.489, "step": 2152000 }, { "epoch": 6.06, "learning_rate": 4.1e-08, - "loss": 2.4266, + "loss": 2.4001, "step": 2160000 }, { "epoch": 6.06, - "eval_loss": 2.274451971054077, - "eval_runtime": 346.0729, - "eval_samples_per_second": 866.852, - "eval_steps_per_second": 54.179, + "eval_loss": 2.2630324363708496, + "eval_runtime": 341.9786, + "eval_samples_per_second": 877.248, + "eval_steps_per_second": 54.828, "step": 2160000 }, { "epoch": 6.09, - "eval_loss": 2.2738053798675537, - "eval_runtime": 346.4358, - "eval_samples_per_second": 865.944, - "eval_steps_per_second": 54.123, + "eval_loss": 2.2631518840789795, + "eval_runtime": 341.9226, + "eval_samples_per_second": 877.391, + "eval_steps_per_second": 54.837, "step": 2168000 }, { "epoch": 6.11, "learning_rate": 3.8266666666666665e-08, - "loss": 2.4255, + "loss": 2.4046, "step": 2176000 }, { "epoch": 6.11, - "eval_loss": 2.273524284362793, - "eval_runtime": 346.5246, - "eval_samples_per_second": 865.722, - "eval_steps_per_second": 54.109, + "eval_loss": 2.26960825920105, + "eval_runtime": 344.2789, + "eval_samples_per_second": 871.387, + "eval_steps_per_second": 54.462, "step": 2176000 }, { "epoch": 6.13, - "eval_loss": 2.2667336463928223, - "eval_runtime": 347.6074, - "eval_samples_per_second": 863.025, - "eval_steps_per_second": 53.94, + "eval_loss": 2.2641026973724365, + "eval_runtime": 343.3436, + "eval_samples_per_second": 873.76, + "eval_steps_per_second": 54.61, "step": 2184000 }, { "epoch": 6.15, "learning_rate": 3.5533333333333334e-08, - "loss": 2.4263, + "loss": 2.405, "step": 2192000 }, { "epoch": 6.15, - "eval_loss": 2.2765865325927734, - "eval_runtime": 347.5449, - "eval_samples_per_second": 863.181, - "eval_steps_per_second": 53.95, + "eval_loss": 2.262655735015869, + "eval_runtime": 344.8039, + "eval_samples_per_second": 870.06, + "eval_steps_per_second": 54.379, "step": 2192000 }, { "epoch": 6.18, - "eval_loss": 2.2754104137420654, - "eval_runtime": 349.1889, - "eval_samples_per_second": 859.117, - "eval_steps_per_second": 53.696, + "eval_loss": 2.268143653869629, + "eval_runtime": 343.934, + "eval_samples_per_second": 872.26, + "eval_steps_per_second": 54.516, "step": 2200000 }, { "epoch": 6.2, "learning_rate": 3.28e-08, - "loss": 2.4388, + "loss": 2.4063, "step": 2208000 }, { "epoch": 6.2, - "eval_loss": 2.269387722015381, - "eval_runtime": 347.4241, - "eval_samples_per_second": 863.481, - "eval_steps_per_second": 53.969, + "eval_loss": 2.2603704929351807, + "eval_runtime": 342.6448, + "eval_samples_per_second": 875.542, + "eval_steps_per_second": 54.721, "step": 2208000 }, { "epoch": 6.22, - "eval_loss": 2.267467737197876, - "eval_runtime": 348.3026, - "eval_samples_per_second": 861.303, - "eval_steps_per_second": 53.832, + "eval_loss": 2.271454095840454, + "eval_runtime": 343.324, + "eval_samples_per_second": 873.81, + "eval_steps_per_second": 54.613, "step": 2216000 }, { "epoch": 6.24, "learning_rate": 3.0066666666666665e-08, - "loss": 2.4293, + "loss": 2.3991, "step": 2224000 }, { "epoch": 6.24, - "eval_loss": 2.26993465423584, - "eval_runtime": 347.7333, - "eval_samples_per_second": 862.713, - "eval_steps_per_second": 53.921, + "eval_loss": 2.268319606781006, + "eval_runtime": 342.6834, + "eval_samples_per_second": 875.444, + "eval_steps_per_second": 54.715, "step": 2224000 }, { "epoch": 6.27, - "eval_loss": 2.271167039871216, - "eval_runtime": 348.7054, - "eval_samples_per_second": 860.308, - "eval_steps_per_second": 53.77, + "eval_loss": 2.265730857849121, + "eval_runtime": 346.275, + "eval_samples_per_second": 866.363, + "eval_steps_per_second": 54.148, "step": 2232000 }, { "epoch": 6.29, "learning_rate": 2.7333333333333333e-08, - "loss": 2.428, + "loss": 2.405, "step": 2240000 }, { "epoch": 6.29, - "eval_loss": 2.270735740661621, - "eval_runtime": 348.9551, - "eval_samples_per_second": 859.692, - "eval_steps_per_second": 53.732, + "eval_loss": 2.2645092010498047, + "eval_runtime": 343.3622, + "eval_samples_per_second": 873.713, + "eval_steps_per_second": 54.607, "step": 2240000 }, { "epoch": 6.31, - "eval_loss": 2.273216962814331, - "eval_runtime": 349.8006, - "eval_samples_per_second": 857.614, - "eval_steps_per_second": 53.602, + "eval_loss": 2.2676303386688232, + "eval_runtime": 343.161, + "eval_samples_per_second": 874.225, + "eval_steps_per_second": 54.639, "step": 2248000 }, { "epoch": 6.33, "learning_rate": 2.46e-08, - "loss": 2.4247, + "loss": 2.3941, "step": 2256000 }, { "epoch": 6.33, - "eval_loss": 2.275233745574951, - "eval_runtime": 347.1846, - "eval_samples_per_second": 864.076, - "eval_steps_per_second": 54.006, + "eval_loss": 2.270566463470459, + "eval_runtime": 344.7989, + "eval_samples_per_second": 870.072, + "eval_steps_per_second": 54.38, "step": 2256000 }, { "epoch": 6.36, - "eval_loss": 2.2703025341033936, - "eval_runtime": 347.2363, - "eval_samples_per_second": 863.948, - "eval_steps_per_second": 53.998, + "eval_loss": 2.259324312210083, + "eval_runtime": 344.3396, + "eval_samples_per_second": 871.233, + "eval_steps_per_second": 54.452, "step": 2264000 }, { "epoch": 6.38, "learning_rate": 2.1866666666666667e-08, - "loss": 2.4272, + "loss": 2.4041, "step": 2272000 }, { "epoch": 6.38, - "eval_loss": 2.268977403640747, - "eval_runtime": 347.4129, - "eval_samples_per_second": 863.509, - "eval_steps_per_second": 53.97, + "eval_loss": 2.267908811569214, + "eval_runtime": 344.2377, + "eval_samples_per_second": 871.491, + "eval_steps_per_second": 54.468, "step": 2272000 }, { "epoch": 6.4, - "eval_loss": 2.2775228023529053, - "eval_runtime": 348.4338, - "eval_samples_per_second": 860.979, - "eval_steps_per_second": 53.812, + "eval_loss": 2.2643110752105713, + "eval_runtime": 343.3047, + "eval_samples_per_second": 873.859, + "eval_steps_per_second": 54.616, "step": 2280000 }, { "epoch": 6.42, "learning_rate": 1.9133333333333333e-08, - "loss": 2.4297, + "loss": 2.4001, "step": 2288000 }, { "epoch": 6.42, - "eval_loss": 2.2680258750915527, - "eval_runtime": 349.0209, - "eval_samples_per_second": 859.53, - "eval_steps_per_second": 53.722, + "eval_loss": 2.2728431224823, + "eval_runtime": 343.644, + "eval_samples_per_second": 872.996, + "eval_steps_per_second": 54.562, "step": 2288000 }, { - "epoch": 6.45, - "eval_loss": 2.271179676055908, - "eval_runtime": 349.3722, - "eval_samples_per_second": 858.666, - "eval_steps_per_second": 53.668, + "epoch": 6.44, + "eval_loss": 2.263103485107422, + "eval_runtime": 343.0897, + "eval_samples_per_second": 874.407, + "eval_steps_per_second": 54.65, "step": 2296000 }, { "epoch": 6.47, "learning_rate": 1.64e-08, - "loss": 2.4268, + "loss": 2.3983, "step": 2304000 }, { "epoch": 6.47, - "eval_loss": 2.2815394401550293, - "eval_runtime": 347.9244, - "eval_samples_per_second": 862.239, - "eval_steps_per_second": 53.891, + "eval_loss": 2.263552188873291, + "eval_runtime": 344.7078, + "eval_samples_per_second": 870.302, + "eval_steps_per_second": 54.394, "step": 2304000 }, { "epoch": 6.49, - "eval_loss": 2.269704818725586, - "eval_runtime": 348.7638, - "eval_samples_per_second": 860.164, - "eval_steps_per_second": 53.761, + "eval_loss": 2.262969732284546, + "eval_runtime": 343.3199, + "eval_samples_per_second": 873.821, + "eval_steps_per_second": 54.614, "step": 2312000 }, { "epoch": 6.51, "learning_rate": 1.3666666666666667e-08, - "loss": 2.4248, + "loss": 2.4003, "step": 2320000 }, { "epoch": 6.51, - "eval_loss": 2.2793667316436768, - "eval_runtime": 349.717, - "eval_samples_per_second": 857.819, - "eval_steps_per_second": 53.615, + "eval_loss": 2.2662770748138428, + "eval_runtime": 344.4313, + "eval_samples_per_second": 871.001, + "eval_steps_per_second": 54.438, "step": 2320000 }, { "epoch": 6.53, - "eval_loss": 2.2721614837646484, - "eval_runtime": 349.5657, - "eval_samples_per_second": 858.191, - "eval_steps_per_second": 53.638, + "eval_loss": 2.264718770980835, + "eval_runtime": 344.3318, + "eval_samples_per_second": 871.253, + "eval_steps_per_second": 54.453, "step": 2328000 }, { "epoch": 6.56, "learning_rate": 1.0933333333333334e-08, - "loss": 2.4285, + "loss": 2.3981, "step": 2336000 }, { "epoch": 6.56, - "eval_loss": 2.2685911655426025, - "eval_runtime": 348.4666, - "eval_samples_per_second": 860.897, - "eval_steps_per_second": 53.807, + "eval_loss": 2.2669222354888916, + "eval_runtime": 344.4268, + "eval_samples_per_second": 871.012, + "eval_steps_per_second": 54.438, "step": 2336000 }, { "epoch": 6.58, - "eval_loss": 2.274138927459717, - "eval_runtime": 348.1529, - "eval_samples_per_second": 861.673, - "eval_steps_per_second": 53.856, + "eval_loss": 2.266000509262085, + "eval_runtime": 344.4815, + "eval_samples_per_second": 870.874, + "eval_steps_per_second": 54.43, "step": 2344000 }, { "epoch": 6.6, "learning_rate": 8.2e-09, - "loss": 2.4318, + "loss": 2.3951, "step": 2352000 }, { "epoch": 6.6, - "eval_loss": 2.267868995666504, - "eval_runtime": 349.9266, - "eval_samples_per_second": 857.305, - "eval_steps_per_second": 53.583, + "eval_loss": 2.2692267894744873, + "eval_runtime": 344.0579, + "eval_samples_per_second": 871.946, + "eval_steps_per_second": 54.497, "step": 2352000 }, { "epoch": 6.62, - "eval_loss": 2.272257089614868, - "eval_runtime": 352.1942, - "eval_samples_per_second": 851.786, - "eval_steps_per_second": 53.238, + "eval_loss": 2.264406442642212, + "eval_runtime": 344.7783, + "eval_samples_per_second": 870.124, + "eval_steps_per_second": 54.383, "step": 2360000 }, { "epoch": 6.65, "learning_rate": 5.466666666666667e-09, - "loss": 2.4269, + "loss": 2.4013, "step": 2368000 }, { "epoch": 6.65, - "eval_loss": 2.274069309234619, - "eval_runtime": 348.3964, - "eval_samples_per_second": 861.071, - "eval_steps_per_second": 53.818, + "eval_loss": 2.2610132694244385, + "eval_runtime": 344.5393, + "eval_samples_per_second": 870.728, + "eval_steps_per_second": 54.42, "step": 2368000 }, { "epoch": 6.67, - "eval_loss": 2.2739241123199463, - "eval_runtime": 348.8166, - "eval_samples_per_second": 860.034, - "eval_steps_per_second": 53.753, + "eval_loss": 2.26550555229187, + "eval_runtime": 344.5292, + "eval_samples_per_second": 870.754, + "eval_steps_per_second": 54.422, "step": 2376000 }, { "epoch": 6.69, "learning_rate": 2.7333333333333334e-09, - "loss": 2.4275, + "loss": 2.4, "step": 2384000 }, { "epoch": 6.69, - "eval_loss": 2.27441143989563, - "eval_runtime": 348.6995, - "eval_samples_per_second": 860.322, - "eval_steps_per_second": 53.771, + "eval_loss": 2.25915789604187, + "eval_runtime": 344.8958, + "eval_samples_per_second": 869.828, + "eval_steps_per_second": 54.364, "step": 2384000 }, { "epoch": 6.71, - "eval_loss": 2.2764933109283447, - "eval_runtime": 350.1503, - "eval_samples_per_second": 856.758, - "eval_steps_per_second": 53.548, + "eval_loss": 2.266591787338257, + "eval_runtime": 344.5939, + "eval_samples_per_second": 870.59, + "eval_steps_per_second": 54.412, "step": 2392000 }, { "epoch": 6.74, "learning_rate": 0.0, - "loss": 2.4259, + "loss": 2.3975, "step": 2400000 }, { "epoch": 6.74, - "eval_loss": 2.278808116912842, - "eval_runtime": 349.1911, - "eval_samples_per_second": 859.111, - "eval_steps_per_second": 53.696, + "eval_loss": 2.2684991359710693, + "eval_runtime": 344.5329, + "eval_samples_per_second": 870.744, + "eval_steps_per_second": 54.422, "step": 2400000 }, { "epoch": 6.74, "step": 2400000, - "total_flos": 7.565202754943813e+17, - "train_loss": 2.4400800537109375, - "train_runtime": 255136.6894, - "train_samples_per_second": 150.508, - "train_steps_per_second": 9.407 + "total_flos": 8.367702695823237e+17, + "train_loss": 2.4076748518880207, + "train_runtime": 247856.7094, + "train_samples_per_second": 154.928, + "train_steps_per_second": 9.683 } ], "logging_steps": 16000, "max_steps": 2400000, "num_train_epochs": 7, "save_steps": 32000, - "total_flos": 7.565202754943813e+17, + "total_flos": 8.367702695823237e+17, "trial_name": null, "trial_params": null }