diff --git "a/checkpoint-100000/trainer_state.json" "b/checkpoint-100000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-100000/trainer_state.json" @@ -0,0 +1,6716 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.3572704537334763, + "global_step": 100000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2.9999999999999997e-06, + "loss": 1.0412, + "step": 100 + }, + { + "epoch": 0.0, + "learning_rate": 5.999999999999999e-06, + "loss": 0.835, + "step": 200 + }, + { + "epoch": 0.01, + "learning_rate": 8.999999999999999e-06, + "loss": 0.7822, + "step": 300 + }, + { + "epoch": 0.01, + "learning_rate": 1.1999999999999999e-05, + "loss": 0.7718, + "step": 400 + }, + { + "epoch": 0.01, + "learning_rate": 1.4999999999999999e-05, + "loss": 0.7707, + "step": 500 + }, + { + "epoch": 0.01, + "learning_rate": 1.7999999999999997e-05, + "loss": 0.7697, + "step": 600 + }, + { + "epoch": 0.02, + "learning_rate": 2.1e-05, + "loss": 0.769, + "step": 700 + }, + { + "epoch": 0.02, + "learning_rate": 2.3999999999999997e-05, + "loss": 0.7682, + "step": 800 + }, + { + "epoch": 0.02, + "learning_rate": 2.6999999999999996e-05, + "loss": 0.7674, + "step": 900 + }, + { + "epoch": 0.02, + "learning_rate": 2.9999999999999997e-05, + "loss": 0.767, + "step": 1000 + }, + { + "epoch": 0.02, + "eval_runtime": 45.7675, + "eval_samples_per_second": 235.975, + "eval_steps_per_second": 7.385, + "step": 1000 + }, + { + "epoch": 0.03, + "learning_rate": 3.2999999999999996e-05, + "loss": 0.7665, + "step": 1100 + }, + { + "epoch": 0.03, + "learning_rate": 3.5999999999999994e-05, + "loss": 0.7662, + "step": 1200 + }, + { + "epoch": 0.03, + "learning_rate": 3.9e-05, + "loss": 0.7661, + "step": 1300 + }, + { + "epoch": 0.03, + "learning_rate": 4.2e-05, + "loss": 0.766, + "step": 1400 + }, + { + "epoch": 0.04, + "learning_rate": 4.4999999999999996e-05, + "loss": 0.7659, + "step": 1500 + }, + { + "epoch": 0.04, + "learning_rate": 4.7999999999999994e-05, + "loss": 0.7656, + "step": 1600 + }, + { + "epoch": 0.04, + "learning_rate": 5.1e-05, + "loss": 0.7655, + "step": 1700 + }, + { + "epoch": 0.04, + "learning_rate": 5.399999999999999e-05, + "loss": 0.7655, + "step": 1800 + }, + { + "epoch": 0.05, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.7653, + "step": 1900 + }, + { + "epoch": 0.05, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.7655, + "step": 2000 + }, + { + "epoch": 0.05, + "eval_runtime": 45.5917, + "eval_samples_per_second": 236.885, + "eval_steps_per_second": 7.414, + "step": 2000 + }, + { + "epoch": 0.05, + "learning_rate": 6.299999999999999e-05, + "loss": 0.7651, + "step": 2100 + }, + { + "epoch": 0.05, + "learning_rate": 6.599999999999999e-05, + "loss": 0.7653, + "step": 2200 + }, + { + "epoch": 0.05, + "learning_rate": 6.9e-05, + "loss": 0.7654, + "step": 2300 + }, + { + "epoch": 0.06, + "learning_rate": 7.199999999999999e-05, + "loss": 0.765, + "step": 2400 + }, + { + "epoch": 0.06, + "learning_rate": 7.5e-05, + "loss": 0.7649, + "step": 2500 + }, + { + "epoch": 0.06, + "learning_rate": 7.8e-05, + "loss": 0.7648, + "step": 2600 + }, + { + "epoch": 0.06, + "learning_rate": 8.1e-05, + "loss": 0.7647, + "step": 2700 + }, + { + "epoch": 0.07, + "learning_rate": 8.4e-05, + "loss": 0.7645, + "step": 2800 + }, + { + "epoch": 0.07, + "learning_rate": 8.699999999999999e-05, + "loss": 0.7645, + "step": 2900 + }, + { + "epoch": 0.07, + "learning_rate": 8.999999999999999e-05, + "loss": 0.7644, + "step": 3000 + }, + { + "epoch": 0.07, + "eval_runtime": 45.7281, + "eval_samples_per_second": 236.179, + "eval_steps_per_second": 7.392, + "step": 3000 + }, + { + "epoch": 0.07, + "learning_rate": 9.3e-05, + "loss": 0.7641, + "step": 3100 + }, + { + "epoch": 0.08, + "learning_rate": 9.599999999999999e-05, + "loss": 0.764, + "step": 3200 + }, + { + "epoch": 0.08, + "learning_rate": 9.9e-05, + "loss": 0.7638, + "step": 3300 + }, + { + "epoch": 0.08, + "learning_rate": 0.000102, + "loss": 0.763, + "step": 3400 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010499999999999999, + "loss": 0.7665, + "step": 3500 + }, + { + "epoch": 0.09, + "learning_rate": 0.00010799999999999998, + "loss": 0.7669, + "step": 3600 + }, + { + "epoch": 0.09, + "learning_rate": 0.00011099999999999999, + "loss": 0.7653, + "step": 3700 + }, + { + "epoch": 0.09, + "learning_rate": 0.00011399999999999999, + "loss": 0.7535, + "step": 3800 + }, + { + "epoch": 0.09, + "learning_rate": 0.000117, + "loss": 0.7218, + "step": 3900 + }, + { + "epoch": 0.1, + "learning_rate": 0.00011999999999999999, + "loss": 0.6956, + "step": 4000 + }, + { + "epoch": 0.1, + "eval_runtime": 45.9121, + "eval_samples_per_second": 235.232, + "eval_steps_per_second": 7.362, + "step": 4000 + }, + { + "epoch": 0.1, + "learning_rate": 0.00012299999999999998, + "loss": 0.6758, + "step": 4100 + }, + { + "epoch": 0.1, + "learning_rate": 0.00012599999999999997, + "loss": 0.6557, + "step": 4200 + }, + { + "epoch": 0.1, + "learning_rate": 0.000129, + "loss": 0.6402, + "step": 4300 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013199999999999998, + "loss": 0.6302, + "step": 4400 + }, + { + "epoch": 0.11, + "learning_rate": 0.000135, + "loss": 0.623, + "step": 4500 + }, + { + "epoch": 0.11, + "learning_rate": 0.000138, + "loss": 0.6169, + "step": 4600 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014099999999999998, + "loss": 0.6121, + "step": 4700 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014399999999999998, + "loss": 0.607, + "step": 4800 + }, + { + "epoch": 0.12, + "learning_rate": 0.000147, + "loss": 0.6039, + "step": 4900 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015, + "loss": 0.6012, + "step": 5000 + }, + { + "epoch": 0.12, + "eval_runtime": 46.0979, + "eval_samples_per_second": 234.284, + "eval_steps_per_second": 7.332, + "step": 5000 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001499996172456075, + "loss": 0.5981, + "step": 5100 + }, + { + "epoch": 0.12, + "learning_rate": 0.00014999846898661572, + "loss": 0.5954, + "step": 5200 + }, + { + "epoch": 0.13, + "learning_rate": 0.00014999655523558183, + "loss": 0.5935, + "step": 5300 + }, + { + "epoch": 0.13, + "learning_rate": 0.00014999387601343436, + "loss": 0.5911, + "step": 5400 + }, + { + "epoch": 0.13, + "learning_rate": 0.00014999043134947282, + "loss": 0.5895, + "step": 5500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00014998622128136748, + "loss": 0.5877, + "step": 5600 + }, + { + "epoch": 0.14, + "learning_rate": 0.000149981245855159, + "loss": 0.5866, + "step": 5700 + }, + { + "epoch": 0.14, + "learning_rate": 0.00014997550512525784, + "loss": 0.5845, + "step": 5800 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001499689991544437, + "loss": 0.5784, + "step": 5900 + }, + { + "epoch": 0.14, + "learning_rate": 0.00014996172801386482, + "loss": 0.5684, + "step": 6000 + }, + { + "epoch": 0.14, + "eval_runtime": 46.0154, + "eval_samples_per_second": 234.704, + "eval_steps_per_second": 7.345, + "step": 6000 + }, + { + "epoch": 0.15, + "learning_rate": 0.00014995369178303722, + "loss": 0.5642, + "step": 6100 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001499448905498439, + "loss": 0.5625, + "step": 6200 + }, + { + "epoch": 0.15, + "learning_rate": 0.00014993532441053364, + "loss": 0.5601, + "step": 6300 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001499249934697203, + "loss": 0.5581, + "step": 6400 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001499138978403813, + "loss": 0.554, + "step": 6500 + }, + { + "epoch": 0.16, + "learning_rate": 0.00014990203764385677, + "loss": 0.5462, + "step": 6600 + }, + { + "epoch": 0.16, + "learning_rate": 0.00014988941300984784, + "loss": 0.5284, + "step": 6700 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001498760240764155, + "loss": 0.5032, + "step": 6800 + }, + { + "epoch": 0.16, + "learning_rate": 0.000149861870989979, + "loss": 0.4751, + "step": 6900 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001498469539053142, + "loss": 0.4574, + "step": 7000 + }, + { + "epoch": 0.17, + "eval_runtime": 45.9402, + "eval_samples_per_second": 235.088, + "eval_steps_per_second": 7.357, + "step": 7000 + }, + { + "epoch": 0.17, + "learning_rate": 0.00014983127298555198, + "loss": 0.4453, + "step": 7100 + }, + { + "epoch": 0.17, + "learning_rate": 0.00014981482840217632, + "loss": 0.437, + "step": 7200 + }, + { + "epoch": 0.17, + "learning_rate": 0.00014979762033502262, + "loss": 0.4306, + "step": 7300 + }, + { + "epoch": 0.18, + "learning_rate": 0.00014977964897227547, + "loss": 0.4254, + "step": 7400 + }, + { + "epoch": 0.18, + "learning_rate": 0.00014976091451046687, + "loss": 0.4204, + "step": 7500 + }, + { + "epoch": 0.18, + "learning_rate": 0.00014974141715447386, + "loss": 0.4178, + "step": 7600 + }, + { + "epoch": 0.18, + "learning_rate": 0.00014972115711751644, + "loss": 0.4135, + "step": 7700 + }, + { + "epoch": 0.19, + "learning_rate": 0.00014970013462115505, + "loss": 0.4099, + "step": 7800 + }, + { + "epoch": 0.19, + "learning_rate": 0.00014967834989528843, + "loss": 0.4077, + "step": 7900 + }, + { + "epoch": 0.19, + "learning_rate": 0.00014965580317815078, + "loss": 0.405, + "step": 8000 + }, + { + "epoch": 0.19, + "eval_runtime": 45.7648, + "eval_samples_per_second": 235.989, + "eval_steps_per_second": 7.386, + "step": 8000 + }, + { + "epoch": 0.19, + "learning_rate": 0.00014963249471630944, + "loss": 0.4017, + "step": 8100 + }, + { + "epoch": 0.2, + "learning_rate": 0.000149608424764662, + "loss": 0.4006, + "step": 8200 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001495835935864336, + "loss": 0.3977, + "step": 8300 + }, + { + "epoch": 0.2, + "learning_rate": 0.00014955800145317397, + "loss": 0.3964, + "step": 8400 + }, + { + "epoch": 0.2, + "learning_rate": 0.00014953164864475466, + "loss": 0.3949, + "step": 8500 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001495045354493657, + "loss": 0.3961, + "step": 8600 + }, + { + "epoch": 0.21, + "learning_rate": 0.00014947666216351272, + "loss": 0.398, + "step": 8700 + }, + { + "epoch": 0.21, + "learning_rate": 0.00014944802909201344, + "loss": 0.3924, + "step": 8800 + }, + { + "epoch": 0.21, + "learning_rate": 0.00014941863654799456, + "loss": 0.3938, + "step": 8900 + }, + { + "epoch": 0.21, + "learning_rate": 0.00014938848485288825, + "loss": 0.3885, + "step": 9000 + }, + { + "epoch": 0.21, + "eval_runtime": 45.9868, + "eval_samples_per_second": 234.85, + "eval_steps_per_second": 7.35, + "step": 9000 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001493575743364286, + "loss": 0.391, + "step": 9100 + }, + { + "epoch": 0.22, + "learning_rate": 0.00014932590533664808, + "loss": 0.3884, + "step": 9200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001492934781998738, + "loss": 0.3856, + "step": 9300 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001492602932807237, + "loss": 0.3843, + "step": 9400 + }, + { + "epoch": 0.23, + "learning_rate": 0.00014922635094210277, + "loss": 0.3848, + "step": 9500 + }, + { + "epoch": 0.23, + "learning_rate": 0.000149191651555199, + "loss": 0.3795, + "step": 9600 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001491561954994793, + "loss": 0.3735, + "step": 9700 + }, + { + "epoch": 0.23, + "learning_rate": 0.00014911998316268537, + "loss": 0.3658, + "step": 9800 + }, + { + "epoch": 0.24, + "learning_rate": 0.00014908301494082963, + "loss": 0.362, + "step": 9900 + }, + { + "epoch": 0.24, + "learning_rate": 0.00014904529123819054, + "loss": 0.3595, + "step": 10000 + }, + { + "epoch": 0.24, + "eval_runtime": 46.3224, + "eval_samples_per_second": 233.148, + "eval_steps_per_second": 7.297, + "step": 10000 + }, + { + "epoch": 0.24, + "learning_rate": 0.00014900681246730852, + "loss": 0.3585, + "step": 10100 + }, + { + "epoch": 0.24, + "learning_rate": 0.00014896757904898125, + "loss": 0.3578, + "step": 10200 + }, + { + "epoch": 0.25, + "learning_rate": 0.00014892759141225904, + "loss": 0.3568, + "step": 10300 + }, + { + "epoch": 0.25, + "learning_rate": 0.00014888684999444035, + "loss": 0.355, + "step": 10400 + }, + { + "epoch": 0.25, + "learning_rate": 0.00014884535524106675, + "loss": 0.3537, + "step": 10500 + }, + { + "epoch": 0.25, + "learning_rate": 0.00014880310760591824, + "loss": 0.3523, + "step": 10600 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001487601075510082, + "loss": 0.3524, + "step": 10700 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001487163555465783, + "loss": 0.3515, + "step": 10800 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001486718520710935, + "loss": 0.3508, + "step": 10900 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014862659761123663, + "loss": 0.3493, + "step": 11000 + }, + { + "epoch": 0.26, + "eval_runtime": 46.1625, + "eval_samples_per_second": 233.956, + "eval_steps_per_second": 7.322, + "step": 11000 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014858059266190327, + "loss": 0.3472, + "step": 11100 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014853383772619612, + "loss": 0.3463, + "step": 11200 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014848633331541967, + "loss": 0.3363, + "step": 11300 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001484380799490746, + "loss": 0.3265, + "step": 11400 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014838907815485194, + "loss": 0.3235, + "step": 11500 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014833932846862748, + "loss": 0.3218, + "step": 11600 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014828883143445582, + "loss": 0.3203, + "step": 11700 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001482375876045644, + "loss": 0.3204, + "step": 11800 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001481855975393476, + "loss": 0.3184, + "step": 11900 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001481328618073604, + "loss": 0.318, + "step": 12000 + }, + { + "epoch": 0.29, + "eval_runtime": 46.1354, + "eval_samples_per_second": 234.094, + "eval_steps_per_second": 7.326, + "step": 12000 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001480793809853123, + "loss": 0.3163, + "step": 12100 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014802515565806107, + "loss": 0.3155, + "step": 12200 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014797018641860612, + "loss": 0.314, + "step": 12300 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001479144738680823, + "loss": 0.3136, + "step": 12400 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014785801861575312, + "loss": 0.3117, + "step": 12500 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014780082127900416, + "loss": 0.3086, + "step": 12600 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014774288248333635, + "loss": 0.3074, + "step": 12700 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014768420286235908, + "loss": 0.3074, + "step": 12800 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014762478305778328, + "loss": 0.3064, + "step": 12900 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001475646237194144, + "loss": 0.3057, + "step": 13000 + }, + { + "epoch": 0.31, + "eval_runtime": 46.1242, + "eval_samples_per_second": 234.15, + "eval_steps_per_second": 7.328, + "step": 13000 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014750372550514533, + "loss": 0.3048, + "step": 13100 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001474420890809492, + "loss": 0.3037, + "step": 13200 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014737971512087202, + "loss": 0.3029, + "step": 13300 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014731660430702552, + "loss": 0.3024, + "step": 13400 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014725275732957937, + "loss": 0.3011, + "step": 13500 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014718817488675387, + "loss": 0.3006, + "step": 13600 + }, + { + "epoch": 0.33, + "learning_rate": 0.00014712285768481235, + "loss": 0.3009, + "step": 13700 + }, + { + "epoch": 0.33, + "learning_rate": 0.00014705680643805323, + "loss": 0.2991, + "step": 13800 + }, + { + "epoch": 0.33, + "learning_rate": 0.00014699002186880232, + "loss": 0.2991, + "step": 13900 + }, + { + "epoch": 0.33, + "learning_rate": 0.00014692250470740503, + "loss": 0.2979, + "step": 14000 + }, + { + "epoch": 0.33, + "eval_runtime": 46.2531, + "eval_samples_per_second": 233.498, + "eval_steps_per_second": 7.308, + "step": 14000 + }, + { + "epoch": 0.34, + "learning_rate": 0.00014685425569221819, + "loss": 0.2975, + "step": 14100 + }, + { + "epoch": 0.34, + "learning_rate": 0.00014678527556960207, + "loss": 0.2955, + "step": 14200 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001467155650939123, + "loss": 0.295, + "step": 14300 + }, + { + "epoch": 0.34, + "learning_rate": 0.00014664512502749141, + "loss": 0.2941, + "step": 14400 + }, + { + "epoch": 0.35, + "learning_rate": 0.00014657395614066075, + "loss": 0.2931, + "step": 14500 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001465020592117118, + "loss": 0.2921, + "step": 14600 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001464294350268979, + "loss": 0.2918, + "step": 14700 + }, + { + "epoch": 0.35, + "learning_rate": 0.00014635608438042546, + "loss": 0.2907, + "step": 14800 + }, + { + "epoch": 0.35, + "learning_rate": 0.00014628200807444543, + "loss": 0.2899, + "step": 14900 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001462072069190444, + "loss": 0.2898, + "step": 15000 + }, + { + "epoch": 0.36, + "eval_runtime": 46.2774, + "eval_samples_per_second": 233.375, + "eval_steps_per_second": 7.304, + "step": 15000 + }, + { + "epoch": 0.36, + "learning_rate": 0.00014613168173223585, + "loss": 0.2885, + "step": 15100 + }, + { + "epoch": 0.36, + "learning_rate": 0.00014605543333995113, + "loss": 0.288, + "step": 15200 + }, + { + "epoch": 0.36, + "learning_rate": 0.00014597846257603038, + "loss": 0.2875, + "step": 15300 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001459007702822136, + "loss": 0.2876, + "step": 15400 + }, + { + "epoch": 0.37, + "learning_rate": 0.00014582235730813128, + "loss": 0.2862, + "step": 15500 + }, + { + "epoch": 0.37, + "learning_rate": 0.00014574322451129507, + "loss": 0.2849, + "step": 15600 + }, + { + "epoch": 0.37, + "learning_rate": 0.00014566337275708863, + "loss": 0.2852, + "step": 15700 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001455828029187579, + "loss": 0.2833, + "step": 15800 + }, + { + "epoch": 0.38, + "learning_rate": 0.00014550151587740178, + "loss": 0.2836, + "step": 15900 + }, + { + "epoch": 0.38, + "learning_rate": 0.00014541951252196225, + "loss": 0.2817, + "step": 16000 + }, + { + "epoch": 0.38, + "eval_runtime": 46.1169, + "eval_samples_per_second": 234.187, + "eval_steps_per_second": 7.329, + "step": 16000 + }, + { + "epoch": 0.38, + "learning_rate": 0.00014533679374921493, + "loss": 0.2824, + "step": 16100 + }, + { + "epoch": 0.39, + "learning_rate": 0.00014525336046375905, + "loss": 0.2817, + "step": 16200 + }, + { + "epoch": 0.39, + "learning_rate": 0.00014516921357800766, + "loss": 0.2812, + "step": 16300 + }, + { + "epoch": 0.39, + "learning_rate": 0.00014508435401217759, + "loss": 0.2812, + "step": 16400 + }, + { + "epoch": 0.39, + "learning_rate": 0.00014499878269427948, + "loss": 0.2795, + "step": 16500 + }, + { + "epoch": 0.4, + "learning_rate": 0.00014491250056010758, + "loss": 0.2788, + "step": 16600 + }, + { + "epoch": 0.4, + "learning_rate": 0.00014482550855322943, + "loss": 0.2775, + "step": 16700 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001447378076249757, + "loss": 0.2773, + "step": 16800 + }, + { + "epoch": 0.4, + "learning_rate": 0.00014464939873442973, + "loss": 0.2769, + "step": 16900 + }, + { + "epoch": 0.4, + "learning_rate": 0.00014456028284841693, + "loss": 0.2765, + "step": 17000 + }, + { + "epoch": 0.4, + "eval_runtime": 46.3516, + "eval_samples_per_second": 233.002, + "eval_steps_per_second": 7.292, + "step": 17000 + }, + { + "epoch": 0.41, + "learning_rate": 0.00014447046094149437, + "loss": 0.2752, + "step": 17100 + }, + { + "epoch": 0.41, + "learning_rate": 0.00014437993399594003, + "loss": 0.2765, + "step": 17200 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001442887030017421, + "loss": 0.2752, + "step": 17300 + }, + { + "epoch": 0.41, + "learning_rate": 0.00014419676895658807, + "loss": 0.2748, + "step": 17400 + }, + { + "epoch": 0.42, + "learning_rate": 0.000144104132865854, + "loss": 0.2739, + "step": 17500 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001440107957425933, + "loss": 0.2729, + "step": 17600 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001439167586075258, + "loss": 0.2722, + "step": 17700 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001438220224890265, + "loss": 0.2725, + "step": 17800 + }, + { + "epoch": 0.43, + "learning_rate": 0.00014372658842311449, + "loss": 0.2726, + "step": 17900 + }, + { + "epoch": 0.43, + "learning_rate": 0.00014363045745344137, + "loss": 0.2715, + "step": 18000 + }, + { + "epoch": 0.43, + "eval_runtime": 46.2247, + "eval_samples_per_second": 233.641, + "eval_steps_per_second": 7.312, + "step": 18000 + }, + { + "epoch": 0.43, + "learning_rate": 0.00014353363063128005, + "loss": 0.2705, + "step": 18100 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001434361090155131, + "loss": 0.2706, + "step": 18200 + }, + { + "epoch": 0.44, + "learning_rate": 0.00014333789367262136, + "loss": 0.2701, + "step": 18300 + }, + { + "epoch": 0.44, + "learning_rate": 0.00014323898567667202, + "loss": 0.2693, + "step": 18400 + }, + { + "epoch": 0.44, + "learning_rate": 0.00014313938610930712, + "loss": 0.2693, + "step": 18500 + }, + { + "epoch": 0.44, + "learning_rate": 0.00014303909605973154, + "loss": 0.2691, + "step": 18600 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001429381166247012, + "loss": 0.2681, + "step": 18700 + }, + { + "epoch": 0.45, + "learning_rate": 0.00014283644890851103, + "loss": 0.2672, + "step": 18800 + }, + { + "epoch": 0.45, + "learning_rate": 0.00014273409402298291, + "loss": 0.2671, + "step": 18900 + }, + { + "epoch": 0.45, + "learning_rate": 0.00014263105308745343, + "loss": 0.2676, + "step": 19000 + }, + { + "epoch": 0.45, + "eval_runtime": 46.3331, + "eval_samples_per_second": 233.095, + "eval_steps_per_second": 7.295, + "step": 19000 + }, + { + "epoch": 0.45, + "learning_rate": 0.00014252732722876176, + "loss": 0.2654, + "step": 19100 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001424229175812373, + "loss": 0.2649, + "step": 19200 + }, + { + "epoch": 0.46, + "learning_rate": 0.00014231782528668717, + "loss": 0.2647, + "step": 19300 + }, + { + "epoch": 0.46, + "learning_rate": 0.00014221205149438394, + "loss": 0.2649, + "step": 19400 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001421055973610528, + "loss": 0.264, + "step": 19500 + }, + { + "epoch": 0.47, + "learning_rate": 0.00014199846405085913, + "loss": 0.2647, + "step": 19600 + }, + { + "epoch": 0.47, + "learning_rate": 0.00014189065273539564, + "loss": 0.2635, + "step": 19700 + }, + { + "epoch": 0.47, + "learning_rate": 0.00014178216459366958, + "loss": 0.2623, + "step": 19800 + }, + { + "epoch": 0.47, + "learning_rate": 0.00014167300081208988, + "loss": 0.2627, + "step": 19900 + }, + { + "epoch": 0.48, + "learning_rate": 0.00014156316258445421, + "loss": 0.2932, + "step": 20000 + }, + { + "epoch": 0.48, + "eval_runtime": 46.169, + "eval_samples_per_second": 233.923, + "eval_steps_per_second": 7.321, + "step": 20000 + }, + { + "epoch": 0.48, + "learning_rate": 0.00014145265111193583, + "loss": 0.2645, + "step": 20100 + }, + { + "epoch": 0.48, + "learning_rate": 0.00014134146760307043, + "loss": 0.2625, + "step": 20200 + }, + { + "epoch": 0.48, + "learning_rate": 0.00014122961327374313, + "loss": 0.2615, + "step": 20300 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001411170893471749, + "loss": 0.2605, + "step": 20400 + }, + { + "epoch": 0.49, + "learning_rate": 0.00014100389705390938, + "loss": 0.26, + "step": 20500 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001408900376317994, + "loss": 0.2583, + "step": 20600 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001407755123259933, + "loss": 0.258, + "step": 20700 + }, + { + "epoch": 0.5, + "learning_rate": 0.00014066032238892152, + "loss": 0.2569, + "step": 20800 + }, + { + "epoch": 0.5, + "learning_rate": 0.00014054446908028272, + "loss": 0.2568, + "step": 20900 + }, + { + "epoch": 0.5, + "learning_rate": 0.00014042795366703018, + "loss": 0.2563, + "step": 21000 + }, + { + "epoch": 0.5, + "eval_runtime": 46.2726, + "eval_samples_per_second": 233.4, + "eval_steps_per_second": 7.305, + "step": 21000 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001403107774233577, + "loss": 0.256, + "step": 21100 + }, + { + "epoch": 0.5, + "learning_rate": 0.00014019294163068597, + "loss": 0.2548, + "step": 21200 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014007444757764835, + "loss": 0.2543, + "step": 21300 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001399552965600768, + "loss": 0.2537, + "step": 21400 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001398354898809877, + "loss": 0.2531, + "step": 21500 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001397150288505678, + "loss": 0.2531, + "step": 21600 + }, + { + "epoch": 0.52, + "learning_rate": 0.00013959391478615959, + "loss": 0.2526, + "step": 21700 + }, + { + "epoch": 0.52, + "learning_rate": 0.00013947214901224706, + "loss": 0.2522, + "step": 21800 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001393497328604412, + "loss": 0.2515, + "step": 21900 + }, + { + "epoch": 0.52, + "learning_rate": 0.00013922666766946545, + "loss": 0.2513, + "step": 22000 + }, + { + "epoch": 0.52, + "eval_runtime": 46.224, + "eval_samples_per_second": 233.645, + "eval_steps_per_second": 7.312, + "step": 22000 + }, + { + "epoch": 0.53, + "learning_rate": 0.00013910295478514106, + "loss": 0.2504, + "step": 22100 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001389785955603722, + "loss": 0.2503, + "step": 22200 + }, + { + "epoch": 0.53, + "learning_rate": 0.00013885359135513154, + "loss": 0.2501, + "step": 22300 + }, + { + "epoch": 0.53, + "learning_rate": 0.000138727943536445, + "loss": 0.2488, + "step": 22400 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013860165347837698, + "loss": 0.2492, + "step": 22500 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013847472256201535, + "loss": 0.2483, + "step": 22600 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013834715217545625, + "loss": 0.248, + "step": 22700 + }, + { + "epoch": 0.54, + "learning_rate": 0.000138218943713789, + "loss": 0.2479, + "step": 22800 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001380900985790808, + "loss": 0.2485, + "step": 22900 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013796061818036138, + "loss": 0.2467, + "step": 23000 + }, + { + "epoch": 0.55, + "eval_runtime": 46.1546, + "eval_samples_per_second": 233.996, + "eval_steps_per_second": 7.323, + "step": 23000 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013783050393360768, + "loss": 0.2468, + "step": 23100 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001376997572617282, + "loss": 0.2463, + "step": 23200 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013756837959454766, + "loss": 0.2456, + "step": 23300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001374363723687911, + "loss": 0.2459, + "step": 23400 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013730373702806846, + "loss": 0.2447, + "step": 23500 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013717047502285855, + "loss": 0.245, + "step": 23600 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001370365878104933, + "loss": 0.2446, + "step": 23700 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013690207685514185, + "loss": 0.2442, + "step": 23800 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001367669436277944, + "loss": 0.2439, + "step": 23900 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001366311896062463, + "loss": 0.2438, + "step": 24000 + }, + { + "epoch": 0.57, + "eval_runtime": 46.5558, + "eval_samples_per_second": 231.98, + "eval_steps_per_second": 7.26, + "step": 24000 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013649481627508181, + "loss": 0.2436, + "step": 24100 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001363578251256578, + "loss": 0.2429, + "step": 24200 + }, + { + "epoch": 0.58, + "learning_rate": 0.00013622021765608754, + "loss": 0.2424, + "step": 24300 + }, + { + "epoch": 0.58, + "learning_rate": 0.00013608199537122425, + "loss": 0.242, + "step": 24400 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001359431597826447, + "loss": 0.2422, + "step": 24500 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001358037124086327, + "loss": 0.2418, + "step": 24600 + }, + { + "epoch": 0.59, + "learning_rate": 0.00013566365477416233, + "loss": 0.2407, + "step": 24700 + }, + { + "epoch": 0.59, + "learning_rate": 0.00013552298841088144, + "loss": 0.2416, + "step": 24800 + }, + { + "epoch": 0.59, + "learning_rate": 0.00013538171485709486, + "loss": 0.2411, + "step": 24900 + }, + { + "epoch": 0.6, + "learning_rate": 0.00013523983565774753, + "loss": 0.2401, + "step": 25000 + }, + { + "epoch": 0.6, + "eval_runtime": 46.0773, + "eval_samples_per_second": 234.389, + "eval_steps_per_second": 7.336, + "step": 25000 + }, + { + "epoch": 0.6, + "learning_rate": 0.00013509735236440766, + "loss": 0.2401, + "step": 25100 + }, + { + "epoch": 0.6, + "learning_rate": 0.00013495426653524972, + "loss": 0.2402, + "step": 25200 + }, + { + "epoch": 0.6, + "learning_rate": 0.00013481057973503742, + "loss": 0.24, + "step": 25300 + }, + { + "epoch": 0.6, + "learning_rate": 0.00013466629353510651, + "loss": 0.239, + "step": 25400 + }, + { + "epoch": 0.61, + "learning_rate": 0.00013452140951334787, + "loss": 0.239, + "step": 25500 + }, + { + "epoch": 0.61, + "learning_rate": 0.00013437592925418985, + "loss": 0.2388, + "step": 25600 + }, + { + "epoch": 0.61, + "learning_rate": 0.00013422985434858133, + "loss": 0.238, + "step": 25700 + }, + { + "epoch": 0.61, + "learning_rate": 0.00013408318639397405, + "loss": 0.2387, + "step": 25800 + }, + { + "epoch": 0.62, + "learning_rate": 0.00013393592699430525, + "loss": 0.2372, + "step": 25900 + }, + { + "epoch": 0.62, + "learning_rate": 0.00013378807775998012, + "loss": 0.2377, + "step": 26000 + }, + { + "epoch": 0.62, + "eval_runtime": 46.2501, + "eval_samples_per_second": 233.513, + "eval_steps_per_second": 7.308, + "step": 26000 + }, + { + "epoch": 0.62, + "learning_rate": 0.00013363964030785422, + "loss": 0.2373, + "step": 26100 + }, + { + "epoch": 0.62, + "learning_rate": 0.00013349061626121578, + "loss": 0.238, + "step": 26200 + }, + { + "epoch": 0.63, + "learning_rate": 0.00013334100724976783, + "loss": 0.2367, + "step": 26300 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001331908149096106, + "loss": 0.2367, + "step": 26400 + }, + { + "epoch": 0.63, + "learning_rate": 0.00013304004088322342, + "loss": 0.2356, + "step": 26500 + }, + { + "epoch": 0.63, + "learning_rate": 0.00013288868681944692, + "loss": 0.2365, + "step": 26600 + }, + { + "epoch": 0.64, + "learning_rate": 0.00013273675437346487, + "loss": 0.236, + "step": 26700 + }, + { + "epoch": 0.64, + "learning_rate": 0.00013258424520678618, + "loss": 0.2356, + "step": 26800 + }, + { + "epoch": 0.64, + "learning_rate": 0.00013243116098722663, + "loss": 0.2363, + "step": 26900 + }, + { + "epoch": 0.64, + "learning_rate": 0.00013227750338889077, + "loss": 0.2345, + "step": 27000 + }, + { + "epoch": 0.64, + "eval_runtime": 46.2738, + "eval_samples_per_second": 233.394, + "eval_steps_per_second": 7.304, + "step": 27000 + }, + { + "epoch": 0.65, + "learning_rate": 0.00013212327409215343, + "loss": 0.2351, + "step": 27100 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001319684747836415, + "loss": 0.2351, + "step": 27200 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001318131071562154, + "loss": 0.2342, + "step": 27300 + }, + { + "epoch": 0.65, + "learning_rate": 0.00013165717290895067, + "loss": 0.2338, + "step": 27400 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001315006737471192, + "loss": 0.234, + "step": 27500 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001313436113821708, + "loss": 0.233, + "step": 27600 + }, + { + "epoch": 0.66, + "learning_rate": 0.00013118598753171425, + "loss": 0.2331, + "step": 27700 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001310278039194988, + "loss": 0.2329, + "step": 27800 + }, + { + "epoch": 0.66, + "learning_rate": 0.00013086906227539506, + "loss": 0.2332, + "step": 27900 + }, + { + "epoch": 0.67, + "learning_rate": 0.00013070976433537623, + "loss": 0.2338, + "step": 28000 + }, + { + "epoch": 0.67, + "eval_runtime": 46.2625, + "eval_samples_per_second": 233.45, + "eval_steps_per_second": 7.306, + "step": 28000 + }, + { + "epoch": 0.67, + "learning_rate": 0.00013054991184149905, + "loss": 0.2325, + "step": 28100 + }, + { + "epoch": 0.67, + "learning_rate": 0.00013038950654188476, + "loss": 0.2312, + "step": 28200 + }, + { + "epoch": 0.67, + "learning_rate": 0.00013022855019070005, + "loss": 0.2323, + "step": 28300 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001300670445481378, + "loss": 0.2319, + "step": 28400 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001299049913803978, + "loss": 0.2324, + "step": 28500 + }, + { + "epoch": 0.68, + "learning_rate": 0.00012974239245966754, + "loss": 0.2313, + "step": 28600 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001295792495641028, + "loss": 0.2318, + "step": 28700 + }, + { + "epoch": 0.69, + "learning_rate": 0.00012941556447780813, + "loss": 0.2309, + "step": 28800 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001292513389908174, + "loss": 0.231, + "step": 28900 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001290865748990742, + "loss": 0.2298, + "step": 29000 + }, + { + "epoch": 0.69, + "eval_runtime": 46.1555, + "eval_samples_per_second": 233.992, + "eval_steps_per_second": 7.323, + "step": 29000 + }, + { + "epoch": 0.69, + "learning_rate": 0.00012892127400441228, + "loss": 0.2302, + "step": 29100 + }, + { + "epoch": 0.7, + "learning_rate": 0.00012875543811453576, + "loss": 0.2305, + "step": 29200 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001285890690429993, + "loss": 0.2293, + "step": 29300 + }, + { + "epoch": 0.7, + "learning_rate": 0.00012842216860918846, + "loss": 0.2298, + "step": 29400 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001282547386382996, + "loss": 0.2296, + "step": 29500 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001280867809613201, + "loss": 0.2291, + "step": 29600 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001279182974150082, + "loss": 0.2279, + "step": 29700 + }, + { + "epoch": 0.71, + "learning_rate": 0.00012774928984187297, + "loss": 0.2278, + "step": 29800 + }, + { + "epoch": 0.71, + "learning_rate": 0.00012757976009015413, + "loss": 0.228, + "step": 29900 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001274097100138019, + "loss": 0.2282, + "step": 30000 + }, + { + "epoch": 0.71, + "eval_runtime": 46.6895, + "eval_samples_per_second": 231.315, + "eval_steps_per_second": 7.239, + "step": 30000 + }, + { + "epoch": 0.72, + "learning_rate": 0.00012723914147245663, + "loss": 0.2276, + "step": 30100 + }, + { + "epoch": 0.72, + "learning_rate": 0.00012706805633142863, + "loss": 0.2276, + "step": 30200 + }, + { + "epoch": 0.72, + "learning_rate": 0.00012689645646167755, + "loss": 0.2281, + "step": 30300 + }, + { + "epoch": 0.72, + "learning_rate": 0.00012672434373979207, + "loss": 0.2265, + "step": 30400 + }, + { + "epoch": 0.73, + "learning_rate": 0.00012655172004796936, + "loss": 0.2286, + "step": 30500 + }, + { + "epoch": 0.73, + "learning_rate": 0.00012637858727399448, + "loss": 0.227, + "step": 30600 + }, + { + "epoch": 0.73, + "learning_rate": 0.00012620494731121966, + "loss": 0.2267, + "step": 30700 + }, + { + "epoch": 0.73, + "learning_rate": 0.00012603080205854372, + "loss": 0.2266, + "step": 30800 + }, + { + "epoch": 0.74, + "learning_rate": 0.00012585615342039126, + "loss": 0.2258, + "step": 30900 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001256810033066918, + "loss": 0.226, + "step": 31000 + }, + { + "epoch": 0.74, + "eval_runtime": 47.0689, + "eval_samples_per_second": 229.451, + "eval_steps_per_second": 7.181, + "step": 31000 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001255053536328589, + "loss": 0.2257, + "step": 31100 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001253292063197693, + "loss": 0.2256, + "step": 31200 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001251525632937418, + "loss": 0.2257, + "step": 31300 + }, + { + "epoch": 0.75, + "learning_rate": 0.00012497542648651615, + "loss": 0.2248, + "step": 31400 + }, + { + "epoch": 0.75, + "learning_rate": 0.00012479779783523216, + "loss": 0.225, + "step": 31500 + }, + { + "epoch": 0.75, + "learning_rate": 0.00012461967928240828, + "loss": 0.2246, + "step": 31600 + }, + { + "epoch": 0.76, + "learning_rate": 0.00012444107277592047, + "loss": 0.2247, + "step": 31700 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001242619802689809, + "loss": 0.2246, + "step": 31800 + }, + { + "epoch": 0.76, + "learning_rate": 0.00012408240372011647, + "loss": 0.2238, + "step": 31900 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001239023450931476, + "loss": 0.2243, + "step": 32000 + }, + { + "epoch": 0.76, + "eval_runtime": 47.1954, + "eval_samples_per_second": 228.836, + "eval_steps_per_second": 7.162, + "step": 32000 + }, + { + "epoch": 0.76, + "learning_rate": 0.00012372180635716656, + "loss": 0.2235, + "step": 32100 + }, + { + "epoch": 0.77, + "learning_rate": 0.00012354078948651604, + "loss": 0.2239, + "step": 32200 + }, + { + "epoch": 0.77, + "learning_rate": 0.00012335929646076758, + "loss": 0.2231, + "step": 32300 + }, + { + "epoch": 0.77, + "learning_rate": 0.00012317732926469976, + "loss": 0.2225, + "step": 32400 + }, + { + "epoch": 0.77, + "learning_rate": 0.00012299488988827675, + "loss": 0.2233, + "step": 32500 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001228119803266263, + "loss": 0.223, + "step": 32600 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001226286025800181, + "loss": 0.2229, + "step": 32700 + }, + { + "epoch": 0.78, + "learning_rate": 0.00012244475865384177, + "loss": 0.222, + "step": 32800 + }, + { + "epoch": 0.78, + "learning_rate": 0.00012226045055858505, + "loss": 0.2217, + "step": 32900 + }, + { + "epoch": 0.79, + "learning_rate": 0.00012207568030981174, + "loss": 0.2222, + "step": 33000 + }, + { + "epoch": 0.79, + "eval_runtime": 47.0101, + "eval_samples_per_second": 229.738, + "eval_steps_per_second": 7.19, + "step": 33000 + }, + { + "epoch": 0.79, + "learning_rate": 0.00012189044992813972, + "loss": 0.2213, + "step": 33100 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001217047614392187, + "loss": 0.2206, + "step": 33200 + }, + { + "epoch": 0.79, + "learning_rate": 0.00012151861687370828, + "loss": 0.2221, + "step": 33300 + }, + { + "epoch": 0.8, + "learning_rate": 0.00012133201826725558, + "loss": 0.2209, + "step": 33400 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001211449676604731, + "loss": 0.2211, + "step": 33500 + }, + { + "epoch": 0.8, + "learning_rate": 0.00012095746709891632, + "loss": 0.2205, + "step": 33600 + }, + { + "epoch": 0.8, + "learning_rate": 0.00012076951863306127, + "loss": 0.2203, + "step": 33700 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001205811243182823, + "loss": 0.22, + "step": 33800 + }, + { + "epoch": 0.81, + "learning_rate": 0.00012039228621482949, + "loss": 0.2192, + "step": 33900 + }, + { + "epoch": 0.81, + "learning_rate": 0.00012020300638780604, + "loss": 0.219, + "step": 34000 + }, + { + "epoch": 0.81, + "eval_runtime": 47.0946, + "eval_samples_per_second": 229.325, + "eval_steps_per_second": 7.177, + "step": 34000 + }, + { + "epoch": 0.81, + "learning_rate": 0.00012001328690714582, + "loss": 0.2194, + "step": 34100 + }, + { + "epoch": 0.81, + "learning_rate": 0.00011982312984759068, + "loss": 0.2194, + "step": 34200 + }, + { + "epoch": 0.82, + "learning_rate": 0.00011963253728866778, + "loss": 0.2189, + "step": 34300 + }, + { + "epoch": 0.82, + "learning_rate": 0.00011944151131466675, + "loss": 0.219, + "step": 34400 + }, + { + "epoch": 0.82, + "learning_rate": 0.00011925005401461709, + "loss": 0.2184, + "step": 34500 + }, + { + "epoch": 0.82, + "learning_rate": 0.00011905816748226513, + "loss": 0.2182, + "step": 34600 + }, + { + "epoch": 0.83, + "learning_rate": 0.00011886585381605125, + "loss": 0.2188, + "step": 34700 + }, + { + "epoch": 0.83, + "learning_rate": 0.00011867311511908693, + "loss": 0.2179, + "step": 34800 + }, + { + "epoch": 0.83, + "learning_rate": 0.00011847995349913162, + "loss": 0.218, + "step": 34900 + }, + { + "epoch": 0.83, + "learning_rate": 0.00011828637106856989, + "loss": 0.2173, + "step": 35000 + }, + { + "epoch": 0.83, + "eval_runtime": 46.7598, + "eval_samples_per_second": 230.968, + "eval_steps_per_second": 7.228, + "step": 35000 + }, + { + "epoch": 0.84, + "learning_rate": 0.00011809236994438816, + "loss": 0.2171, + "step": 35100 + }, + { + "epoch": 0.84, + "learning_rate": 0.00011789795224815164, + "loss": 0.2175, + "step": 35200 + }, + { + "epoch": 0.84, + "learning_rate": 0.00011770312010598116, + "loss": 0.2167, + "step": 35300 + }, + { + "epoch": 0.84, + "learning_rate": 0.00011750787564852973, + "loss": 0.2167, + "step": 35400 + }, + { + "epoch": 0.85, + "learning_rate": 0.00011731222101095955, + "loss": 0.2171, + "step": 35500 + }, + { + "epoch": 0.85, + "learning_rate": 0.00011711615833291833, + "loss": 0.2161, + "step": 35600 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001169196897585161, + "loss": 0.2168, + "step": 35700 + }, + { + "epoch": 0.85, + "learning_rate": 0.00011672281743630175, + "loss": 0.2162, + "step": 35800 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001165255435192394, + "loss": 0.2152, + "step": 35900 + }, + { + "epoch": 0.86, + "learning_rate": 0.00011632787016468506, + "loss": 0.216, + "step": 36000 + }, + { + "epoch": 0.86, + "eval_runtime": 47.0992, + "eval_samples_per_second": 229.303, + "eval_steps_per_second": 7.176, + "step": 36000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001161297995343628, + "loss": 0.2157, + "step": 36100 + }, + { + "epoch": 0.86, + "learning_rate": 0.00011593133379434138, + "loss": 0.215, + "step": 36200 + }, + { + "epoch": 0.86, + "learning_rate": 0.00011573247511501028, + "loss": 0.2154, + "step": 36300 + }, + { + "epoch": 0.87, + "learning_rate": 0.00011553322567105619, + "loss": 0.2155, + "step": 36400 + }, + { + "epoch": 0.87, + "learning_rate": 0.00011533358764143905, + "loss": 0.2149, + "step": 36500 + }, + { + "epoch": 0.87, + "learning_rate": 0.00011513356320936841, + "loss": 0.2144, + "step": 36600 + }, + { + "epoch": 0.87, + "learning_rate": 0.00011493315456227943, + "loss": 0.2147, + "step": 36700 + }, + { + "epoch": 0.88, + "learning_rate": 0.00011473236389180894, + "loss": 0.2145, + "step": 36800 + }, + { + "epoch": 0.88, + "learning_rate": 0.00011453119339377154, + "loss": 0.2146, + "step": 36900 + }, + { + "epoch": 0.88, + "learning_rate": 0.00011432964526813558, + "loss": 0.2145, + "step": 37000 + }, + { + "epoch": 0.88, + "eval_runtime": 46.8321, + "eval_samples_per_second": 230.611, + "eval_steps_per_second": 7.217, + "step": 37000 + }, + { + "epoch": 0.88, + "learning_rate": 0.00011412772171899904, + "loss": 0.2132, + "step": 37100 + }, + { + "epoch": 0.89, + "learning_rate": 0.00011392542495456556, + "loss": 0.2133, + "step": 37200 + }, + { + "epoch": 0.89, + "learning_rate": 0.00011372275718712006, + "loss": 0.2125, + "step": 37300 + }, + { + "epoch": 0.89, + "learning_rate": 0.00011351972063300484, + "loss": 0.2135, + "step": 37400 + }, + { + "epoch": 0.89, + "learning_rate": 0.00011331631751259515, + "loss": 0.213, + "step": 37500 + }, + { + "epoch": 0.9, + "learning_rate": 0.00011311255005027487, + "loss": 0.2132, + "step": 37600 + }, + { + "epoch": 0.9, + "learning_rate": 0.00011290842047441232, + "loss": 0.2125, + "step": 37700 + }, + { + "epoch": 0.9, + "learning_rate": 0.00011270393101733585, + "loss": 0.2122, + "step": 37800 + }, + { + "epoch": 0.9, + "learning_rate": 0.00011249908391530946, + "loss": 0.2113, + "step": 37900 + }, + { + "epoch": 0.91, + "learning_rate": 0.00011229388140850814, + "loss": 0.2119, + "step": 38000 + }, + { + "epoch": 0.91, + "eval_runtime": 46.8036, + "eval_samples_per_second": 230.751, + "eval_steps_per_second": 7.222, + "step": 38000 + }, + { + "epoch": 0.91, + "learning_rate": 0.00011208832574099368, + "loss": 0.2113, + "step": 38100 + }, + { + "epoch": 0.91, + "learning_rate": 0.00011188241916068993, + "loss": 0.2111, + "step": 38200 + }, + { + "epoch": 0.91, + "learning_rate": 0.00011167616391935826, + "loss": 0.2111, + "step": 38300 + }, + { + "epoch": 0.91, + "learning_rate": 0.00011146956227257293, + "loss": 0.2119, + "step": 38400 + }, + { + "epoch": 0.92, + "learning_rate": 0.00011126261647969645, + "loss": 0.2115, + "step": 38500 + }, + { + "epoch": 0.92, + "learning_rate": 0.00011105532880385487, + "loss": 0.2104, + "step": 38600 + }, + { + "epoch": 0.92, + "learning_rate": 0.00011084770151191299, + "loss": 0.2107, + "step": 38700 + }, + { + "epoch": 0.92, + "learning_rate": 0.00011063973687444962, + "loss": 0.2097, + "step": 38800 + }, + { + "epoch": 0.93, + "learning_rate": 0.00011043143716573272, + "loss": 0.2107, + "step": 38900 + }, + { + "epoch": 0.93, + "learning_rate": 0.00011022280466369448, + "loss": 0.2113, + "step": 39000 + }, + { + "epoch": 0.93, + "eval_runtime": 47.0898, + "eval_samples_per_second": 229.349, + "eval_steps_per_second": 7.178, + "step": 39000 + }, + { + "epoch": 0.93, + "learning_rate": 0.00011001384164990662, + "loss": 0.2099, + "step": 39100 + }, + { + "epoch": 0.93, + "learning_rate": 0.00010980455040955506, + "loss": 0.21, + "step": 39200 + }, + { + "epoch": 0.94, + "learning_rate": 0.00010959493323141538, + "loss": 0.2091, + "step": 39300 + }, + { + "epoch": 0.94, + "learning_rate": 0.00010938499240782739, + "loss": 0.2098, + "step": 39400 + }, + { + "epoch": 0.94, + "learning_rate": 0.00010917473023467032, + "loss": 0.2096, + "step": 39500 + }, + { + "epoch": 0.94, + "learning_rate": 0.00010896414901133761, + "loss": 0.2085, + "step": 39600 + }, + { + "epoch": 0.95, + "learning_rate": 0.00010875325104071177, + "loss": 0.2093, + "step": 39700 + }, + { + "epoch": 0.95, + "learning_rate": 0.00010854203862913927, + "loss": 0.2084, + "step": 39800 + }, + { + "epoch": 0.95, + "learning_rate": 0.00010833051408640509, + "loss": 0.2083, + "step": 39900 + }, + { + "epoch": 0.95, + "learning_rate": 0.00010811867972570786, + "loss": 0.2084, + "step": 40000 + }, + { + "epoch": 0.95, + "eval_runtime": 46.8854, + "eval_samples_per_second": 230.349, + "eval_steps_per_second": 7.209, + "step": 40000 + }, + { + "epoch": 0.96, + "learning_rate": 0.00010790653786363416, + "loss": 0.2082, + "step": 40100 + }, + { + "epoch": 0.96, + "learning_rate": 0.00010769409082013337, + "loss": 0.2081, + "step": 40200 + }, + { + "epoch": 0.96, + "learning_rate": 0.00010748134091849238, + "loss": 0.2077, + "step": 40300 + }, + { + "epoch": 0.96, + "learning_rate": 0.00010726829048531, + "loss": 0.2078, + "step": 40400 + }, + { + "epoch": 0.96, + "learning_rate": 0.00010705494185047165, + "loss": 0.2077, + "step": 40500 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001068412973471238, + "loss": 0.2073, + "step": 40600 + }, + { + "epoch": 0.97, + "learning_rate": 0.00010662735931164853, + "loss": 0.2076, + "step": 40700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001064131300836379, + "loss": 0.2069, + "step": 40800 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001061986120058684, + "loss": 0.2067, + "step": 40900 + }, + { + "epoch": 0.98, + "learning_rate": 0.00010598380742427543, + "loss": 0.206, + "step": 41000 + }, + { + "epoch": 0.98, + "eval_runtime": 46.6481, + "eval_samples_per_second": 231.521, + "eval_steps_per_second": 7.246, + "step": 41000 + }, + { + "epoch": 0.98, + "learning_rate": 0.00010576871868792746, + "loss": 0.206, + "step": 41100 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001055533481490004, + "loss": 0.2058, + "step": 41200 + }, + { + "epoch": 0.98, + "learning_rate": 0.000105337698162752, + "loss": 0.206, + "step": 41300 + }, + { + "epoch": 0.99, + "learning_rate": 0.00010512177108749594, + "loss": 0.2057, + "step": 41400 + }, + { + "epoch": 0.99, + "learning_rate": 0.00010490556928457616, + "loss": 0.2039, + "step": 41500 + }, + { + "epoch": 0.99, + "learning_rate": 0.00010468909511834088, + "loss": 0.205, + "step": 41600 + }, + { + "epoch": 0.99, + "learning_rate": 0.00010447235095611692, + "loss": 0.2045, + "step": 41700 + }, + { + "epoch": 1.0, + "learning_rate": 0.00010425533916818376, + "loss": 0.2047, + "step": 41800 + }, + { + "epoch": 1.0, + "learning_rate": 0.00010403806212774747, + "loss": 0.205, + "step": 41900 + }, + { + "epoch": 1.0, + "learning_rate": 0.000103820522210915, + "loss": 0.2042, + "step": 42000 + }, + { + "epoch": 1.0, + "eval_runtime": 46.7967, + "eval_samples_per_second": 230.786, + "eval_steps_per_second": 7.223, + "step": 42000 + }, + { + "epoch": 1.0, + "learning_rate": 0.00010360272179666802, + "loss": 0.204, + "step": 42100 + }, + { + "epoch": 1.01, + "learning_rate": 0.00010338466326683697, + "loss": 0.2037, + "step": 42200 + }, + { + "epoch": 1.01, + "learning_rate": 0.00010316634900607497, + "loss": 0.2033, + "step": 42300 + }, + { + "epoch": 1.01, + "learning_rate": 0.00010294778140183182, + "loss": 0.2035, + "step": 42400 + }, + { + "epoch": 1.01, + "learning_rate": 0.00010272896284432785, + "loss": 0.2037, + "step": 42500 + }, + { + "epoch": 1.01, + "learning_rate": 0.00010250989572652766, + "loss": 0.2028, + "step": 42600 + }, + { + "epoch": 1.02, + "learning_rate": 0.00010229058244411427, + "loss": 0.2019, + "step": 42700 + }, + { + "epoch": 1.02, + "learning_rate": 0.00010207102539546251, + "loss": 0.2032, + "step": 42800 + }, + { + "epoch": 1.02, + "learning_rate": 0.00010185122698161311, + "loss": 0.2026, + "step": 42900 + }, + { + "epoch": 1.02, + "learning_rate": 0.00010163118960624632, + "loss": 0.2024, + "step": 43000 + }, + { + "epoch": 1.02, + "eval_runtime": 46.9319, + "eval_samples_per_second": 230.121, + "eval_steps_per_second": 7.202, + "step": 43000 + }, + { + "epoch": 1.03, + "learning_rate": 0.00010141091567565561, + "loss": 0.2028, + "step": 43100 + }, + { + "epoch": 1.03, + "learning_rate": 0.00010119040759872142, + "loss": 0.2018, + "step": 43200 + }, + { + "epoch": 1.03, + "learning_rate": 0.00010096966778688472, + "loss": 0.2016, + "step": 43300 + }, + { + "epoch": 1.03, + "learning_rate": 0.00010074869865412074, + "loss": 0.2024, + "step": 43400 + }, + { + "epoch": 1.04, + "learning_rate": 0.00010052750261691254, + "loss": 0.2017, + "step": 43500 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001003060820942245, + "loss": 0.2015, + "step": 43600 + }, + { + "epoch": 1.04, + "learning_rate": 0.00010008443950747599, + "loss": 0.2014, + "step": 43700 + }, + { + "epoch": 1.04, + "learning_rate": 9.986257728051483e-05, + "loss": 0.2014, + "step": 43800 + }, + { + "epoch": 1.05, + "learning_rate": 9.964049783959082e-05, + "loss": 0.2012, + "step": 43900 + }, + { + "epoch": 1.05, + "learning_rate": 9.94182036133291e-05, + "loss": 0.201, + "step": 44000 + }, + { + "epoch": 1.05, + "eval_runtime": 47.2136, + "eval_samples_per_second": 228.748, + "eval_steps_per_second": 7.159, + "step": 44000 + }, + { + "epoch": 1.05, + "learning_rate": 9.919569703270376e-05, + "loss": 0.1998, + "step": 44100 + }, + { + "epoch": 1.05, + "learning_rate": 9.89729805310111e-05, + "loss": 0.2004, + "step": 44200 + }, + { + "epoch": 1.06, + "learning_rate": 9.875005654384307e-05, + "loss": 0.2009, + "step": 44300 + }, + { + "epoch": 1.06, + "learning_rate": 9.852692750906071e-05, + "loss": 0.1999, + "step": 44400 + }, + { + "epoch": 1.06, + "learning_rate": 9.830359586676737e-05, + "loss": 0.1997, + "step": 44500 + }, + { + "epoch": 1.06, + "learning_rate": 9.808006405928215e-05, + "loss": 0.2006, + "step": 44600 + }, + { + "epoch": 1.06, + "learning_rate": 9.785633453111306e-05, + "loss": 0.1999, + "step": 44700 + }, + { + "epoch": 1.07, + "learning_rate": 9.763240972893037e-05, + "loss": 0.1992, + "step": 44800 + }, + { + "epoch": 1.07, + "learning_rate": 9.740829210153984e-05, + "loss": 0.1991, + "step": 44900 + }, + { + "epoch": 1.07, + "learning_rate": 9.718398409985593e-05, + "loss": 0.199, + "step": 45000 + }, + { + "epoch": 1.07, + "eval_runtime": 46.9221, + "eval_samples_per_second": 230.169, + "eval_steps_per_second": 7.203, + "step": 45000 + }, + { + "epoch": 1.07, + "learning_rate": 9.695948817687504e-05, + "loss": 0.1987, + "step": 45100 + }, + { + "epoch": 1.08, + "learning_rate": 9.673480678764858e-05, + "loss": 0.1982, + "step": 45200 + }, + { + "epoch": 1.08, + "learning_rate": 9.650994238925626e-05, + "loss": 0.1989, + "step": 45300 + }, + { + "epoch": 1.08, + "learning_rate": 9.628489744077911e-05, + "loss": 0.1985, + "step": 45400 + }, + { + "epoch": 1.08, + "learning_rate": 9.60596744032726e-05, + "loss": 0.1981, + "step": 45500 + }, + { + "epoch": 1.09, + "learning_rate": 9.583427573973982e-05, + "loss": 0.1976, + "step": 45600 + }, + { + "epoch": 1.09, + "learning_rate": 9.560870391510441e-05, + "loss": 0.1981, + "step": 45700 + }, + { + "epoch": 1.09, + "learning_rate": 9.538296139618371e-05, + "loss": 0.1978, + "step": 45800 + }, + { + "epoch": 1.09, + "learning_rate": 9.515705065166178e-05, + "loss": 0.1977, + "step": 45900 + }, + { + "epoch": 1.1, + "learning_rate": 9.493097415206228e-05, + "loss": 0.1974, + "step": 46000 + }, + { + "epoch": 1.1, + "eval_runtime": 47.1161, + "eval_samples_per_second": 229.221, + "eval_steps_per_second": 7.174, + "step": 46000 + }, + { + "epoch": 1.1, + "learning_rate": 9.47047343697216e-05, + "loss": 0.1978, + "step": 46100 + }, + { + "epoch": 1.1, + "learning_rate": 9.447833377876176e-05, + "loss": 0.1974, + "step": 46200 + }, + { + "epoch": 1.1, + "learning_rate": 9.425177485506336e-05, + "loss": 0.1971, + "step": 46300 + }, + { + "epoch": 1.11, + "learning_rate": 9.402506007623848e-05, + "loss": 0.1968, + "step": 46400 + }, + { + "epoch": 1.11, + "learning_rate": 9.379819192160362e-05, + "loss": 0.1969, + "step": 46500 + }, + { + "epoch": 1.11, + "learning_rate": 9.357117287215258e-05, + "loss": 0.1966, + "step": 46600 + }, + { + "epoch": 1.11, + "learning_rate": 9.334400541052928e-05, + "loss": 0.1971, + "step": 46700 + }, + { + "epoch": 1.11, + "learning_rate": 9.311669202100073e-05, + "loss": 0.1962, + "step": 46800 + }, + { + "epoch": 1.12, + "learning_rate": 9.288923518942968e-05, + "loss": 0.1959, + "step": 46900 + }, + { + "epoch": 1.12, + "learning_rate": 9.26616374032477e-05, + "loss": 0.1964, + "step": 47000 + }, + { + "epoch": 1.12, + "eval_runtime": 46.7963, + "eval_samples_per_second": 230.788, + "eval_steps_per_second": 7.223, + "step": 47000 + }, + { + "epoch": 1.12, + "learning_rate": 9.243390115142761e-05, + "loss": 0.196, + "step": 47100 + }, + { + "epoch": 1.12, + "learning_rate": 9.220602892445661e-05, + "loss": 0.1955, + "step": 47200 + }, + { + "epoch": 1.13, + "learning_rate": 9.197802321430889e-05, + "loss": 0.1958, + "step": 47300 + }, + { + "epoch": 1.13, + "learning_rate": 9.174988651441833e-05, + "loss": 0.1951, + "step": 47400 + }, + { + "epoch": 1.13, + "learning_rate": 9.152162131965137e-05, + "loss": 0.1954, + "step": 47500 + }, + { + "epoch": 1.13, + "learning_rate": 9.129323012627956e-05, + "loss": 0.1948, + "step": 47600 + }, + { + "epoch": 1.14, + "learning_rate": 9.106471543195244e-05, + "loss": 0.1954, + "step": 47700 + }, + { + "epoch": 1.14, + "learning_rate": 9.08360797356701e-05, + "loss": 0.1953, + "step": 47800 + }, + { + "epoch": 1.14, + "learning_rate": 9.060732553775582e-05, + "loss": 0.1949, + "step": 47900 + }, + { + "epoch": 1.14, + "learning_rate": 9.037845533982892e-05, + "loss": 0.1947, + "step": 48000 + }, + { + "epoch": 1.14, + "eval_runtime": 46.9646, + "eval_samples_per_second": 229.96, + "eval_steps_per_second": 7.197, + "step": 48000 + }, + { + "epoch": 1.15, + "learning_rate": 9.014947164477721e-05, + "loss": 0.1946, + "step": 48100 + }, + { + "epoch": 1.15, + "learning_rate": 8.992037695672967e-05, + "loss": 0.1938, + "step": 48200 + }, + { + "epoch": 1.15, + "learning_rate": 8.969117378102912e-05, + "loss": 0.1946, + "step": 48300 + }, + { + "epoch": 1.15, + "learning_rate": 8.946186462420478e-05, + "loss": 0.1942, + "step": 48400 + }, + { + "epoch": 1.16, + "learning_rate": 8.923245199394482e-05, + "loss": 0.1934, + "step": 48500 + }, + { + "epoch": 1.16, + "learning_rate": 8.900293839906903e-05, + "loss": 0.194, + "step": 48600 + }, + { + "epoch": 1.16, + "learning_rate": 8.87733263495013e-05, + "loss": 0.1936, + "step": 48700 + }, + { + "epoch": 1.16, + "learning_rate": 8.85436183562422e-05, + "loss": 0.1933, + "step": 48800 + }, + { + "epoch": 1.16, + "learning_rate": 8.83138169313416e-05, + "loss": 0.1933, + "step": 48900 + }, + { + "epoch": 1.17, + "learning_rate": 8.808392458787103e-05, + "loss": 0.1931, + "step": 49000 + }, + { + "epoch": 1.17, + "eval_runtime": 46.9712, + "eval_samples_per_second": 229.928, + "eval_steps_per_second": 7.196, + "step": 49000 + }, + { + "epoch": 1.17, + "learning_rate": 8.78539438398963e-05, + "loss": 0.1922, + "step": 49100 + }, + { + "epoch": 1.17, + "learning_rate": 8.762387720245008e-05, + "loss": 0.1922, + "step": 49200 + }, + { + "epoch": 1.17, + "learning_rate": 8.73937271915042e-05, + "loss": 0.1926, + "step": 49300 + }, + { + "epoch": 1.18, + "learning_rate": 8.716349632394235e-05, + "loss": 0.1924, + "step": 49400 + }, + { + "epoch": 1.18, + "learning_rate": 8.69331871175324e-05, + "loss": 0.1927, + "step": 49500 + }, + { + "epoch": 1.18, + "learning_rate": 8.67028020908989e-05, + "loss": 0.1924, + "step": 49600 + }, + { + "epoch": 1.18, + "learning_rate": 8.647234376349565e-05, + "loss": 0.1921, + "step": 49700 + }, + { + "epoch": 1.19, + "learning_rate": 8.624181465557794e-05, + "loss": 0.1914, + "step": 49800 + }, + { + "epoch": 1.19, + "learning_rate": 8.601121728817519e-05, + "loss": 0.1917, + "step": 49900 + }, + { + "epoch": 1.19, + "learning_rate": 8.578055418306327e-05, + "loss": 0.1918, + "step": 50000 + }, + { + "epoch": 1.19, + "eval_runtime": 47.0452, + "eval_samples_per_second": 229.566, + "eval_steps_per_second": 7.185, + "step": 50000 + }, + { + "epoch": 1.19, + "learning_rate": 8.55498278627369e-05, + "loss": 0.1915, + "step": 50100 + }, + { + "epoch": 1.2, + "learning_rate": 8.531904085038221e-05, + "loss": 0.1912, + "step": 50200 + }, + { + "epoch": 1.2, + "learning_rate": 8.508819566984897e-05, + "loss": 0.1907, + "step": 50300 + }, + { + "epoch": 1.2, + "learning_rate": 8.485729484562307e-05, + "loss": 0.1912, + "step": 50400 + }, + { + "epoch": 1.2, + "learning_rate": 8.462634090279895e-05, + "loss": 0.1907, + "step": 50500 + }, + { + "epoch": 1.21, + "learning_rate": 8.439533636705194e-05, + "loss": 0.1912, + "step": 50600 + }, + { + "epoch": 1.21, + "learning_rate": 8.416428376461061e-05, + "loss": 0.19, + "step": 50700 + }, + { + "epoch": 1.21, + "learning_rate": 8.393318562222916e-05, + "loss": 0.1904, + "step": 50800 + }, + { + "epoch": 1.21, + "learning_rate": 8.370204446715997e-05, + "loss": 0.1902, + "step": 50900 + }, + { + "epoch": 1.21, + "learning_rate": 8.347086282712556e-05, + "loss": 0.191, + "step": 51000 + }, + { + "epoch": 1.21, + "eval_runtime": 46.8635, + "eval_samples_per_second": 230.456, + "eval_steps_per_second": 7.212, + "step": 51000 + }, + { + "epoch": 1.22, + "learning_rate": 8.323964323029136e-05, + "loss": 0.1896, + "step": 51100 + }, + { + "epoch": 1.22, + "learning_rate": 8.300838820523784e-05, + "loss": 0.1903, + "step": 51200 + }, + { + "epoch": 1.22, + "learning_rate": 8.277710028093289e-05, + "loss": 0.1895, + "step": 51300 + }, + { + "epoch": 1.22, + "learning_rate": 8.254578198670421e-05, + "loss": 0.1897, + "step": 51400 + }, + { + "epoch": 1.23, + "learning_rate": 8.231443585221157e-05, + "loss": 0.1895, + "step": 51500 + }, + { + "epoch": 1.23, + "learning_rate": 8.208306440741926e-05, + "loss": 0.1898, + "step": 51600 + }, + { + "epoch": 1.23, + "learning_rate": 8.185167018256834e-05, + "loss": 0.1899, + "step": 51700 + }, + { + "epoch": 1.23, + "learning_rate": 8.162025570814896e-05, + "loss": 0.1894, + "step": 51800 + }, + { + "epoch": 1.24, + "learning_rate": 8.138882351487275e-05, + "loss": 0.1895, + "step": 51900 + }, + { + "epoch": 1.24, + "learning_rate": 8.115737613364511e-05, + "loss": 0.1895, + "step": 52000 + }, + { + "epoch": 1.24, + "eval_runtime": 47.2771, + "eval_samples_per_second": 228.441, + "eval_steps_per_second": 7.149, + "step": 52000 + }, + { + "epoch": 1.24, + "learning_rate": 8.092591609553747e-05, + "loss": 0.1894, + "step": 52100 + }, + { + "epoch": 1.24, + "learning_rate": 8.069444593175975e-05, + "loss": 0.1897, + "step": 52200 + }, + { + "epoch": 1.25, + "learning_rate": 8.046296817363259e-05, + "loss": 0.1887, + "step": 52300 + }, + { + "epoch": 1.25, + "learning_rate": 8.023148535255965e-05, + "loss": 0.1886, + "step": 52400 + }, + { + "epoch": 1.25, + "learning_rate": 7.999999999999999e-05, + "loss": 0.1885, + "step": 52500 + }, + { + "epoch": 1.25, + "learning_rate": 7.976851464744033e-05, + "loss": 0.1888, + "step": 52600 + }, + { + "epoch": 1.26, + "learning_rate": 7.953703182636741e-05, + "loss": 0.1882, + "step": 52700 + }, + { + "epoch": 1.26, + "learning_rate": 7.930555406824026e-05, + "loss": 0.1879, + "step": 52800 + }, + { + "epoch": 1.26, + "learning_rate": 7.907408390446254e-05, + "loss": 0.1887, + "step": 52900 + }, + { + "epoch": 1.26, + "learning_rate": 7.884262386635489e-05, + "loss": 0.1876, + "step": 53000 + }, + { + "epoch": 1.26, + "eval_runtime": 47.1816, + "eval_samples_per_second": 228.903, + "eval_steps_per_second": 7.164, + "step": 53000 + }, + { + "epoch": 1.26, + "learning_rate": 7.861117648512725e-05, + "loss": 0.1875, + "step": 53100 + }, + { + "epoch": 1.27, + "learning_rate": 7.837974429185103e-05, + "loss": 0.1875, + "step": 53200 + }, + { + "epoch": 1.27, + "learning_rate": 7.814832981743164e-05, + "loss": 0.1873, + "step": 53300 + }, + { + "epoch": 1.27, + "learning_rate": 7.791693559258072e-05, + "loss": 0.1871, + "step": 53400 + }, + { + "epoch": 1.27, + "learning_rate": 7.768556414778842e-05, + "loss": 0.1876, + "step": 53500 + }, + { + "epoch": 1.28, + "learning_rate": 7.74542180132958e-05, + "loss": 0.1871, + "step": 53600 + }, + { + "epoch": 1.28, + "learning_rate": 7.72228997190671e-05, + "loss": 0.1877, + "step": 53700 + }, + { + "epoch": 1.28, + "learning_rate": 7.699161179476217e-05, + "loss": 0.1864, + "step": 53800 + }, + { + "epoch": 1.28, + "learning_rate": 7.676035676970863e-05, + "loss": 0.1864, + "step": 53900 + }, + { + "epoch": 1.29, + "learning_rate": 7.652913717287443e-05, + "loss": 0.1862, + "step": 54000 + }, + { + "epoch": 1.29, + "eval_runtime": 46.9744, + "eval_samples_per_second": 229.912, + "eval_steps_per_second": 7.195, + "step": 54000 + }, + { + "epoch": 1.29, + "learning_rate": 7.629795553284005e-05, + "loss": 0.1868, + "step": 54100 + }, + { + "epoch": 1.29, + "learning_rate": 7.606681437777081e-05, + "loss": 0.1867, + "step": 54200 + }, + { + "epoch": 1.29, + "learning_rate": 7.583571623538939e-05, + "loss": 0.1858, + "step": 54300 + }, + { + "epoch": 1.3, + "learning_rate": 7.560466363294806e-05, + "loss": 0.1865, + "step": 54400 + }, + { + "epoch": 1.3, + "learning_rate": 7.537365909720104e-05, + "loss": 0.1859, + "step": 54500 + }, + { + "epoch": 1.3, + "learning_rate": 7.514270515437691e-05, + "loss": 0.1862, + "step": 54600 + }, + { + "epoch": 1.3, + "learning_rate": 7.491180433015101e-05, + "loss": 0.1852, + "step": 54700 + }, + { + "epoch": 1.31, + "learning_rate": 7.468095914961777e-05, + "loss": 0.1864, + "step": 54800 + }, + { + "epoch": 1.31, + "learning_rate": 7.445017213726307e-05, + "loss": 0.1856, + "step": 54900 + }, + { + "epoch": 1.31, + "learning_rate": 7.421944581693674e-05, + "loss": 0.1852, + "step": 55000 + }, + { + "epoch": 1.31, + "eval_runtime": 75.8452, + "eval_samples_per_second": 142.395, + "eval_steps_per_second": 4.456, + "step": 55000 + }, + { + "epoch": 1.0, + "learning_rate": 7.39887827118248e-05, + "loss": 0.1855, + "step": 55100 + }, + { + "epoch": 1.0, + "learning_rate": 7.375818534442207e-05, + "loss": 0.1852, + "step": 55200 + }, + { + "epoch": 1.01, + "learning_rate": 7.352765623650435e-05, + "loss": 0.1858, + "step": 55300 + }, + { + "epoch": 1.01, + "learning_rate": 7.329719790910108e-05, + "loss": 0.1842, + "step": 55400 + }, + { + "epoch": 1.01, + "learning_rate": 7.30668128824676e-05, + "loss": 0.185, + "step": 55500 + }, + { + "epoch": 1.01, + "learning_rate": 7.283650367605764e-05, + "loss": 0.1851, + "step": 55600 + }, + { + "epoch": 1.02, + "learning_rate": 7.260627280849581e-05, + "loss": 0.1842, + "step": 55700 + }, + { + "epoch": 1.02, + "learning_rate": 7.23761227975499e-05, + "loss": 0.1847, + "step": 55800 + }, + { + "epoch": 1.02, + "learning_rate": 7.21460561601037e-05, + "loss": 0.1849, + "step": 55900 + }, + { + "epoch": 1.02, + "learning_rate": 7.191607541212897e-05, + "loss": 0.1848, + "step": 56000 + }, + { + "epoch": 1.02, + "eval_runtime": 47.6864, + "eval_samples_per_second": 226.48, + "eval_steps_per_second": 7.088, + "step": 56000 + }, + { + "epoch": 1.03, + "learning_rate": 7.168618306865838e-05, + "loss": 0.1848, + "step": 56100 + }, + { + "epoch": 1.03, + "learning_rate": 7.145638164375779e-05, + "loss": 0.1842, + "step": 56200 + }, + { + "epoch": 1.03, + "learning_rate": 7.122667365049869e-05, + "loss": 0.1846, + "step": 56300 + }, + { + "epoch": 1.03, + "learning_rate": 7.099706160093098e-05, + "loss": 0.1834, + "step": 56400 + }, + { + "epoch": 1.04, + "learning_rate": 7.076754800605516e-05, + "loss": 0.1837, + "step": 56500 + }, + { + "epoch": 1.04, + "learning_rate": 7.053813537579523e-05, + "loss": 0.1835, + "step": 56600 + }, + { + "epoch": 1.04, + "learning_rate": 7.030882621897088e-05, + "loss": 0.1833, + "step": 56700 + }, + { + "epoch": 1.04, + "learning_rate": 7.00796230432703e-05, + "loss": 0.1837, + "step": 56800 + }, + { + "epoch": 1.05, + "learning_rate": 6.985052835522279e-05, + "loss": 0.1833, + "step": 56900 + }, + { + "epoch": 1.05, + "learning_rate": 6.962154466017105e-05, + "loss": 0.1827, + "step": 57000 + }, + { + "epoch": 1.05, + "eval_runtime": 47.1326, + "eval_samples_per_second": 229.141, + "eval_steps_per_second": 7.171, + "step": 57000 + }, + { + "epoch": 1.05, + "learning_rate": 6.939267446224418e-05, + "loss": 0.1819, + "step": 57100 + }, + { + "epoch": 1.05, + "learning_rate": 6.91639202643299e-05, + "loss": 0.1834, + "step": 57200 + }, + { + "epoch": 1.05, + "learning_rate": 6.893528456804756e-05, + "loss": 0.1836, + "step": 57300 + }, + { + "epoch": 1.06, + "learning_rate": 6.870676987372044e-05, + "loss": 0.1832, + "step": 57400 + }, + { + "epoch": 1.06, + "learning_rate": 6.847837868034861e-05, + "loss": 0.1833, + "step": 57500 + }, + { + "epoch": 1.06, + "learning_rate": 6.825011348558167e-05, + "loss": 0.1826, + "step": 57600 + }, + { + "epoch": 1.06, + "learning_rate": 6.802197678569109e-05, + "loss": 0.1826, + "step": 57700 + }, + { + "epoch": 1.07, + "learning_rate": 6.779397107554339e-05, + "loss": 0.1821, + "step": 57800 + }, + { + "epoch": 1.07, + "learning_rate": 6.756609884857239e-05, + "loss": 0.1826, + "step": 57900 + }, + { + "epoch": 1.07, + "learning_rate": 6.733836259675233e-05, + "loss": 0.1822, + "step": 58000 + }, + { + "epoch": 1.07, + "eval_runtime": 47.2136, + "eval_samples_per_second": 228.748, + "eval_steps_per_second": 7.159, + "step": 58000 + }, + { + "epoch": 1.07, + "learning_rate": 6.71107648105703e-05, + "loss": 0.1814, + "step": 58100 + }, + { + "epoch": 1.08, + "learning_rate": 6.688330797899925e-05, + "loss": 0.1825, + "step": 58200 + }, + { + "epoch": 1.08, + "learning_rate": 6.665599458947072e-05, + "loss": 0.182, + "step": 58300 + }, + { + "epoch": 1.08, + "learning_rate": 6.642882712784742e-05, + "loss": 0.1821, + "step": 58400 + }, + { + "epoch": 1.08, + "learning_rate": 6.620180807839639e-05, + "loss": 0.1819, + "step": 58500 + }, + { + "epoch": 1.09, + "learning_rate": 6.597493992376152e-05, + "loss": 0.1824, + "step": 58600 + }, + { + "epoch": 1.09, + "learning_rate": 6.574822514493664e-05, + "loss": 0.1821, + "step": 58700 + }, + { + "epoch": 1.09, + "learning_rate": 6.552166622123824e-05, + "loss": 0.1817, + "step": 58800 + }, + { + "epoch": 1.09, + "learning_rate": 6.52952656302784e-05, + "loss": 0.181, + "step": 58900 + }, + { + "epoch": 1.1, + "learning_rate": 6.506902584793773e-05, + "loss": 0.1814, + "step": 59000 + }, + { + "epoch": 1.1, + "eval_runtime": 47.2104, + "eval_samples_per_second": 228.763, + "eval_steps_per_second": 7.159, + "step": 59000 + }, + { + "epoch": 1.1, + "learning_rate": 6.484294934833822e-05, + "loss": 0.182, + "step": 59100 + }, + { + "epoch": 1.1, + "learning_rate": 6.461703860381628e-05, + "loss": 0.1811, + "step": 59200 + }, + { + "epoch": 1.1, + "learning_rate": 6.439129608489559e-05, + "loss": 0.1801, + "step": 59300 + }, + { + "epoch": 1.1, + "learning_rate": 6.41657242602602e-05, + "loss": 0.1811, + "step": 59400 + }, + { + "epoch": 1.11, + "learning_rate": 6.39403255967274e-05, + "loss": 0.1811, + "step": 59500 + }, + { + "epoch": 1.11, + "learning_rate": 6.371510255922088e-05, + "loss": 0.1811, + "step": 59600 + }, + { + "epoch": 1.11, + "learning_rate": 6.349005761074372e-05, + "loss": 0.1808, + "step": 59700 + }, + { + "epoch": 1.11, + "learning_rate": 6.326519321235139e-05, + "loss": 0.1803, + "step": 59800 + }, + { + "epoch": 1.12, + "learning_rate": 6.304051182312496e-05, + "loss": 0.1809, + "step": 59900 + }, + { + "epoch": 1.12, + "learning_rate": 6.281601590014407e-05, + "loss": 0.1807, + "step": 60000 + }, + { + "epoch": 1.12, + "eval_runtime": 47.3036, + "eval_samples_per_second": 228.313, + "eval_steps_per_second": 7.145, + "step": 60000 + }, + { + "epoch": 1.12, + "learning_rate": 6.259170789846017e-05, + "loss": 0.1806, + "step": 60100 + }, + { + "epoch": 1.12, + "learning_rate": 6.236759027106965e-05, + "loss": 0.1803, + "step": 60200 + }, + { + "epoch": 1.13, + "learning_rate": 6.214366546888694e-05, + "loss": 0.1805, + "step": 60300 + }, + { + "epoch": 1.13, + "learning_rate": 6.191993594071785e-05, + "loss": 0.1798, + "step": 60400 + }, + { + "epoch": 1.13, + "learning_rate": 6.169640413323262e-05, + "loss": 0.1791, + "step": 60500 + }, + { + "epoch": 1.13, + "learning_rate": 6.147307249093929e-05, + "loss": 0.1793, + "step": 60600 + }, + { + "epoch": 1.14, + "learning_rate": 6.124994345615693e-05, + "loss": 0.18, + "step": 60700 + }, + { + "epoch": 1.14, + "learning_rate": 6.102701946898891e-05, + "loss": 0.1795, + "step": 60800 + }, + { + "epoch": 1.14, + "learning_rate": 6.0804302967296225e-05, + "loss": 0.1791, + "step": 60900 + }, + { + "epoch": 1.14, + "learning_rate": 6.058179638667089e-05, + "loss": 0.1798, + "step": 61000 + }, + { + "epoch": 1.14, + "eval_runtime": 47.2093, + "eval_samples_per_second": 228.768, + "eval_steps_per_second": 7.16, + "step": 61000 + }, + { + "epoch": 1.15, + "learning_rate": 6.035950216040917e-05, + "loss": 0.1793, + "step": 61100 + }, + { + "epoch": 1.15, + "learning_rate": 6.0137422719485145e-05, + "loss": 0.1797, + "step": 61200 + }, + { + "epoch": 1.15, + "learning_rate": 5.991556049252401e-05, + "loss": 0.1789, + "step": 61300 + }, + { + "epoch": 1.15, + "learning_rate": 5.969391790577551e-05, + "loss": 0.1793, + "step": 61400 + }, + { + "epoch": 1.15, + "learning_rate": 5.947249738308747e-05, + "loss": 0.179, + "step": 61500 + }, + { + "epoch": 1.16, + "learning_rate": 5.925130134587924e-05, + "loss": 0.1785, + "step": 61600 + }, + { + "epoch": 1.16, + "learning_rate": 5.903033221311528e-05, + "loss": 0.1787, + "step": 61700 + }, + { + "epoch": 1.16, + "learning_rate": 5.880959240127858e-05, + "loss": 0.179, + "step": 61800 + }, + { + "epoch": 1.16, + "learning_rate": 5.858908432434438e-05, + "loss": 0.1784, + "step": 61900 + }, + { + "epoch": 1.17, + "learning_rate": 5.8368810393753684e-05, + "loss": 0.1789, + "step": 62000 + }, + { + "epoch": 1.17, + "eval_runtime": 47.3247, + "eval_samples_per_second": 228.211, + "eval_steps_per_second": 7.142, + "step": 62000 + }, + { + "epoch": 1.17, + "learning_rate": 5.814877301838688e-05, + "loss": 0.1783, + "step": 62100 + }, + { + "epoch": 1.17, + "learning_rate": 5.7928974604537494e-05, + "loss": 0.1783, + "step": 62200 + }, + { + "epoch": 1.17, + "learning_rate": 5.770941755588573e-05, + "loss": 0.1785, + "step": 62300 + }, + { + "epoch": 1.18, + "learning_rate": 5.749010427347233e-05, + "loss": 0.1784, + "step": 62400 + }, + { + "epoch": 1.18, + "learning_rate": 5.7271037155672156e-05, + "loss": 0.1777, + "step": 62500 + }, + { + "epoch": 1.18, + "learning_rate": 5.7052218598168154e-05, + "loss": 0.1786, + "step": 62600 + }, + { + "epoch": 1.18, + "learning_rate": 5.6833650993925016e-05, + "loss": 0.1782, + "step": 62700 + }, + { + "epoch": 1.19, + "learning_rate": 5.661533673316303e-05, + "loss": 0.1776, + "step": 62800 + }, + { + "epoch": 1.19, + "learning_rate": 5.639727820333198e-05, + "loss": 0.178, + "step": 62900 + }, + { + "epoch": 1.19, + "learning_rate": 5.617947778908498e-05, + "loss": 0.1782, + "step": 63000 + }, + { + "epoch": 1.19, + "eval_runtime": 47.2795, + "eval_samples_per_second": 228.429, + "eval_steps_per_second": 7.149, + "step": 63000 + }, + { + "epoch": 1.19, + "learning_rate": 5.596193787225254e-05, + "loss": 0.1771, + "step": 63100 + }, + { + "epoch": 1.2, + "learning_rate": 5.574466083181624e-05, + "loss": 0.1777, + "step": 63200 + }, + { + "epoch": 1.2, + "learning_rate": 5.552764904388305e-05, + "loss": 0.1773, + "step": 63300 + }, + { + "epoch": 1.2, + "learning_rate": 5.5310904881659116e-05, + "loss": 0.177, + "step": 63400 + }, + { + "epoch": 1.2, + "learning_rate": 5.5094430715423835e-05, + "loss": 0.1766, + "step": 63500 + }, + { + "epoch": 1.2, + "learning_rate": 5.487822891250406e-05, + "loss": 0.1771, + "step": 63600 + }, + { + "epoch": 1.21, + "learning_rate": 5.4662301837247985e-05, + "loss": 0.177, + "step": 63700 + }, + { + "epoch": 1.21, + "learning_rate": 5.4446651850999604e-05, + "loss": 0.1765, + "step": 63800 + }, + { + "epoch": 1.21, + "learning_rate": 5.4231281312072544e-05, + "loss": 0.1774, + "step": 63900 + }, + { + "epoch": 1.21, + "learning_rate": 5.401619257572453e-05, + "loss": 0.1766, + "step": 64000 + }, + { + "epoch": 1.21, + "eval_runtime": 47.4102, + "eval_samples_per_second": 227.799, + "eval_steps_per_second": 7.129, + "step": 64000 + }, + { + "epoch": 1.22, + "learning_rate": 5.3801387994131576e-05, + "loss": 0.1769, + "step": 64100 + }, + { + "epoch": 1.22, + "learning_rate": 5.358686991636209e-05, + "loss": 0.1768, + "step": 64200 + }, + { + "epoch": 1.22, + "learning_rate": 5.3372640688351476e-05, + "loss": 0.1767, + "step": 64300 + }, + { + "epoch": 1.22, + "learning_rate": 5.315870265287618e-05, + "loss": 0.1762, + "step": 64400 + }, + { + "epoch": 1.23, + "learning_rate": 5.294505814952835e-05, + "loss": 0.1771, + "step": 64500 + }, + { + "epoch": 1.23, + "learning_rate": 5.2731709514689995e-05, + "loss": 0.1759, + "step": 64600 + }, + { + "epoch": 1.23, + "learning_rate": 5.25186590815076e-05, + "loss": 0.1759, + "step": 64700 + }, + { + "epoch": 1.23, + "learning_rate": 5.2305909179866635e-05, + "loss": 0.1765, + "step": 64800 + }, + { + "epoch": 1.24, + "learning_rate": 5.209346213636584e-05, + "loss": 0.1763, + "step": 64900 + }, + { + "epoch": 1.24, + "learning_rate": 5.188132027429215e-05, + "loss": 0.1757, + "step": 65000 + }, + { + "epoch": 1.24, + "eval_runtime": 47.3383, + "eval_samples_per_second": 228.145, + "eval_steps_per_second": 7.14, + "step": 65000 + }, + { + "epoch": 1.24, + "learning_rate": 5.166948591359489e-05, + "loss": 0.1757, + "step": 65100 + }, + { + "epoch": 1.24, + "learning_rate": 5.145796137086076e-05, + "loss": 0.176, + "step": 65200 + }, + { + "epoch": 1.25, + "learning_rate": 5.124674895928823e-05, + "loss": 0.1759, + "step": 65300 + }, + { + "epoch": 1.25, + "learning_rate": 5.103585098866237e-05, + "loss": 0.1758, + "step": 65400 + }, + { + "epoch": 1.25, + "learning_rate": 5.082526976532968e-05, + "loss": 0.1754, + "step": 65500 + }, + { + "epoch": 1.25, + "learning_rate": 5.061500759217261e-05, + "loss": 0.1751, + "step": 65600 + }, + { + "epoch": 1.25, + "learning_rate": 5.04050667685846e-05, + "loss": 0.1759, + "step": 65700 + }, + { + "epoch": 1.26, + "learning_rate": 5.01954495904449e-05, + "loss": 0.1761, + "step": 65800 + }, + { + "epoch": 1.26, + "learning_rate": 4.998615835009339e-05, + "loss": 0.1757, + "step": 65900 + }, + { + "epoch": 1.26, + "learning_rate": 4.97771953363055e-05, + "loss": 0.1751, + "step": 66000 + }, + { + "epoch": 1.26, + "eval_runtime": 47.6107, + "eval_samples_per_second": 226.84, + "eval_steps_per_second": 7.099, + "step": 66000 + }, + { + "epoch": 1.26, + "learning_rate": 4.956856283426728e-05, + "loss": 0.1747, + "step": 66100 + }, + { + "epoch": 1.27, + "learning_rate": 4.936026312555037e-05, + "loss": 0.1746, + "step": 66200 + }, + { + "epoch": 1.27, + "learning_rate": 4.915229848808698e-05, + "loss": 0.1747, + "step": 66300 + }, + { + "epoch": 1.27, + "learning_rate": 4.8944671196145136e-05, + "loss": 0.1744, + "step": 66400 + }, + { + "epoch": 1.27, + "learning_rate": 4.8737383520303546e-05, + "loss": 0.1748, + "step": 66500 + }, + { + "epoch": 1.28, + "learning_rate": 4.853043772742709e-05, + "loss": 0.1748, + "step": 66600 + }, + { + "epoch": 1.28, + "learning_rate": 4.832383608064172e-05, + "loss": 0.1746, + "step": 66700 + }, + { + "epoch": 1.28, + "learning_rate": 4.811758083931005e-05, + "loss": 0.1754, + "step": 66800 + }, + { + "epoch": 1.28, + "learning_rate": 4.791167425900632e-05, + "loss": 0.1744, + "step": 66900 + }, + { + "epoch": 1.29, + "learning_rate": 4.770611859149185e-05, + "loss": 0.1742, + "step": 67000 + }, + { + "epoch": 1.29, + "eval_runtime": 47.4501, + "eval_samples_per_second": 227.608, + "eval_steps_per_second": 7.123, + "step": 67000 + }, + { + "epoch": 1.29, + "learning_rate": 4.7500916084690564e-05, + "loss": 0.174, + "step": 67100 + }, + { + "epoch": 1.29, + "learning_rate": 4.729606898266411e-05, + "loss": 0.1742, + "step": 67200 + }, + { + "epoch": 1.29, + "learning_rate": 4.709157952558768e-05, + "loss": 0.1743, + "step": 67300 + }, + { + "epoch": 1.3, + "learning_rate": 4.688744994972514e-05, + "loss": 0.175, + "step": 67400 + }, + { + "epoch": 1.3, + "learning_rate": 4.668368248740485e-05, + "loss": 0.1748, + "step": 67500 + }, + { + "epoch": 1.3, + "learning_rate": 4.6480279366995116e-05, + "loss": 0.1734, + "step": 67600 + }, + { + "epoch": 1.3, + "learning_rate": 4.6277242812879914e-05, + "loss": 0.1736, + "step": 67700 + }, + { + "epoch": 1.3, + "learning_rate": 4.607457504543447e-05, + "loss": 0.1739, + "step": 67800 + }, + { + "epoch": 1.31, + "learning_rate": 4.5872278281000955e-05, + "loss": 0.1739, + "step": 67900 + }, + { + "epoch": 1.31, + "learning_rate": 4.567035473186444e-05, + "loss": 0.1743, + "step": 68000 + }, + { + "epoch": 1.31, + "eval_runtime": 47.6447, + "eval_samples_per_second": 226.678, + "eval_steps_per_second": 7.094, + "step": 68000 + }, + { + "epoch": 1.31, + "learning_rate": 4.546880660622845e-05, + "loss": 0.1737, + "step": 68100 + }, + { + "epoch": 1.31, + "learning_rate": 4.5267636108191036e-05, + "loss": 0.174, + "step": 68200 + }, + { + "epoch": 1.32, + "learning_rate": 4.5066845437720555e-05, + "loss": 0.1735, + "step": 68300 + }, + { + "epoch": 1.32, + "learning_rate": 4.4866436790631564e-05, + "loss": 0.1733, + "step": 68400 + }, + { + "epoch": 1.32, + "learning_rate": 4.4666412358560955e-05, + "loss": 0.1733, + "step": 68500 + }, + { + "epoch": 1.32, + "learning_rate": 4.4466774328943796e-05, + "loss": 0.1729, + "step": 68600 + }, + { + "epoch": 1.33, + "learning_rate": 4.426752488498972e-05, + "loss": 0.1735, + "step": 68700 + }, + { + "epoch": 1.33, + "learning_rate": 4.406866620565862e-05, + "loss": 0.173, + "step": 68800 + }, + { + "epoch": 1.33, + "learning_rate": 4.3870200465637164e-05, + "loss": 0.1732, + "step": 68900 + }, + { + "epoch": 1.33, + "learning_rate": 4.3672129835314955e-05, + "loss": 0.1727, + "step": 69000 + }, + { + "epoch": 1.33, + "eval_runtime": 47.5417, + "eval_samples_per_second": 227.169, + "eval_steps_per_second": 7.11, + "step": 69000 + }, + { + "epoch": 1.34, + "learning_rate": 4.347445648076057e-05, + "loss": 0.1738, + "step": 69100 + }, + { + "epoch": 1.34, + "learning_rate": 4.327718256369826e-05, + "loss": 0.1725, + "step": 69200 + }, + { + "epoch": 1.34, + "learning_rate": 4.3080310241483885e-05, + "loss": 0.1731, + "step": 69300 + }, + { + "epoch": 1.34, + "learning_rate": 4.2883841667081675e-05, + "loss": 0.1731, + "step": 69400 + }, + { + "epoch": 1.35, + "learning_rate": 4.268777898904044e-05, + "loss": 0.1726, + "step": 69500 + }, + { + "epoch": 1.35, + "learning_rate": 4.2492124351470214e-05, + "loss": 0.1723, + "step": 69600 + }, + { + "epoch": 1.35, + "learning_rate": 4.2296879894018835e-05, + "loss": 0.1727, + "step": 69700 + }, + { + "epoch": 1.35, + "learning_rate": 4.210204775184834e-05, + "loss": 0.1723, + "step": 69800 + }, + { + "epoch": 1.35, + "learning_rate": 4.190763005561186e-05, + "loss": 0.172, + "step": 69900 + }, + { + "epoch": 1.36, + "learning_rate": 4.171362893143013e-05, + "loss": 0.1724, + "step": 70000 + }, + { + "epoch": 1.36, + "eval_runtime": 47.6533, + "eval_samples_per_second": 226.637, + "eval_steps_per_second": 7.093, + "step": 70000 + }, + { + "epoch": 1.36, + "learning_rate": 4.1520046500868384e-05, + "loss": 0.1724, + "step": 70100 + }, + { + "epoch": 1.36, + "learning_rate": 4.1326884880913074e-05, + "loss": 0.1721, + "step": 70200 + }, + { + "epoch": 1.36, + "learning_rate": 4.1134146183948724e-05, + "loss": 0.1723, + "step": 70300 + }, + { + "epoch": 1.37, + "learning_rate": 4.0941832517734885e-05, + "loss": 0.1717, + "step": 70400 + }, + { + "epoch": 1.37, + "learning_rate": 4.0749945985382915e-05, + "loss": 0.1717, + "step": 70500 + }, + { + "epoch": 1.37, + "learning_rate": 4.0558488685333235e-05, + "loss": 0.1713, + "step": 70600 + }, + { + "epoch": 1.37, + "learning_rate": 4.036746271133223e-05, + "loss": 0.1724, + "step": 70700 + }, + { + "epoch": 1.38, + "learning_rate": 4.0176870152409324e-05, + "loss": 0.1708, + "step": 70800 + }, + { + "epoch": 1.38, + "learning_rate": 3.998671309285417e-05, + "loss": 0.1717, + "step": 70900 + }, + { + "epoch": 1.38, + "learning_rate": 3.979699361219395e-05, + "loss": 0.1706, + "step": 71000 + }, + { + "epoch": 1.38, + "eval_runtime": 47.7292, + "eval_samples_per_second": 226.277, + "eval_steps_per_second": 7.082, + "step": 71000 + }, + { + "epoch": 1.38, + "learning_rate": 3.960771378517049e-05, + "loss": 0.171, + "step": 71100 + }, + { + "epoch": 1.39, + "learning_rate": 3.941887568171766e-05, + "loss": 0.1708, + "step": 71200 + }, + { + "epoch": 1.39, + "learning_rate": 3.923048136693873e-05, + "loss": 0.1717, + "step": 71300 + }, + { + "epoch": 1.39, + "learning_rate": 3.904253290108369e-05, + "loss": 0.1716, + "step": 71400 + }, + { + "epoch": 1.39, + "learning_rate": 3.885503233952689e-05, + "loss": 0.1706, + "step": 71500 + }, + { + "epoch": 1.4, + "learning_rate": 3.86679817327444e-05, + "loss": 0.171, + "step": 71600 + }, + { + "epoch": 1.4, + "learning_rate": 3.848138312629171e-05, + "loss": 0.1702, + "step": 71700 + }, + { + "epoch": 1.4, + "learning_rate": 3.8295238560781317e-05, + "loss": 0.1706, + "step": 71800 + }, + { + "epoch": 1.4, + "learning_rate": 3.810955007186029e-05, + "loss": 0.1708, + "step": 71900 + }, + { + "epoch": 1.4, + "learning_rate": 3.792431969018824e-05, + "loss": 0.1709, + "step": 72000 + }, + { + "epoch": 1.4, + "eval_runtime": 47.6465, + "eval_samples_per_second": 226.669, + "eval_steps_per_second": 7.094, + "step": 72000 + }, + { + "epoch": 1.41, + "learning_rate": 3.7739549441414945e-05, + "loss": 0.1703, + "step": 72100 + }, + { + "epoch": 1.41, + "learning_rate": 3.755524134615825e-05, + "loss": 0.171, + "step": 72200 + }, + { + "epoch": 1.41, + "learning_rate": 3.7371397419981925e-05, + "loss": 0.1706, + "step": 72300 + }, + { + "epoch": 1.41, + "learning_rate": 3.7188019673373706e-05, + "loss": 0.1707, + "step": 72400 + }, + { + "epoch": 1.42, + "learning_rate": 3.700511011172325e-05, + "loss": 0.1706, + "step": 72500 + }, + { + "epoch": 1.42, + "learning_rate": 3.682267073530023e-05, + "loss": 0.1703, + "step": 72600 + }, + { + "epoch": 1.42, + "learning_rate": 3.664070353923245e-05, + "loss": 0.1698, + "step": 72700 + }, + { + "epoch": 1.42, + "learning_rate": 3.645921051348396e-05, + "loss": 0.1705, + "step": 72800 + }, + { + "epoch": 1.43, + "learning_rate": 3.627819364283345e-05, + "loss": 0.1708, + "step": 72900 + }, + { + "epoch": 1.43, + "learning_rate": 3.6097654906852405e-05, + "loss": 0.1706, + "step": 73000 + }, + { + "epoch": 1.43, + "eval_runtime": 47.2921, + "eval_samples_per_second": 228.368, + "eval_steps_per_second": 7.147, + "step": 73000 + }, + { + "epoch": 1.43, + "learning_rate": 3.591759627988353e-05, + "loss": 0.17, + "step": 73100 + }, + { + "epoch": 1.43, + "learning_rate": 3.573801973101913e-05, + "loss": 0.1702, + "step": 73200 + }, + { + "epoch": 1.44, + "learning_rate": 3.5558927224079534e-05, + "loss": 0.1702, + "step": 73300 + }, + { + "epoch": 1.44, + "learning_rate": 3.5380320717591716e-05, + "loss": 0.17, + "step": 73400 + }, + { + "epoch": 1.44, + "learning_rate": 3.5202202164767836e-05, + "loss": 0.17, + "step": 73500 + }, + { + "epoch": 1.44, + "learning_rate": 3.5024573513483864e-05, + "loss": 0.1706, + "step": 73600 + }, + { + "epoch": 1.45, + "learning_rate": 3.484743670625822e-05, + "loss": 0.1701, + "step": 73700 + }, + { + "epoch": 1.45, + "learning_rate": 3.467079368023068e-05, + "loss": 0.1691, + "step": 73800 + }, + { + "epoch": 1.45, + "learning_rate": 3.449464636714107e-05, + "loss": 0.1698, + "step": 73900 + }, + { + "epoch": 1.45, + "learning_rate": 3.431899669330819e-05, + "loss": 0.1703, + "step": 74000 + }, + { + "epoch": 1.45, + "eval_runtime": 47.7148, + "eval_samples_per_second": 226.345, + "eval_steps_per_second": 7.084, + "step": 74000 + }, + { + "epoch": 1.45, + "learning_rate": 3.4143846579608744e-05, + "loss": 0.1688, + "step": 74100 + }, + { + "epoch": 1.46, + "learning_rate": 3.396919794145629e-05, + "loss": 0.169, + "step": 74200 + }, + { + "epoch": 1.46, + "learning_rate": 3.3795052688780345e-05, + "loss": 0.1691, + "step": 74300 + }, + { + "epoch": 1.46, + "learning_rate": 3.362141272600552e-05, + "loss": 0.1695, + "step": 74400 + }, + { + "epoch": 1.46, + "learning_rate": 3.3448279952030615e-05, + "loss": 0.1692, + "step": 74500 + }, + { + "epoch": 1.47, + "learning_rate": 3.327565626020793e-05, + "loss": 0.1697, + "step": 74600 + }, + { + "epoch": 1.47, + "learning_rate": 3.3103543538322455e-05, + "loss": 0.1694, + "step": 74700 + }, + { + "epoch": 1.47, + "learning_rate": 3.293194366857137e-05, + "loss": 0.1686, + "step": 74800 + }, + { + "epoch": 1.47, + "learning_rate": 3.276085852754336e-05, + "loss": 0.1686, + "step": 74900 + }, + { + "epoch": 1.48, + "learning_rate": 3.259028998619814e-05, + "loss": 0.1688, + "step": 75000 + }, + { + "epoch": 1.48, + "eval_runtime": 47.4078, + "eval_samples_per_second": 227.811, + "eval_steps_per_second": 7.13, + "step": 75000 + }, + { + "epoch": 1.48, + "learning_rate": 3.2420239909845894e-05, + "loss": 0.1688, + "step": 75100 + }, + { + "epoch": 1.48, + "learning_rate": 3.2250710158127045e-05, + "loss": 0.1692, + "step": 75200 + }, + { + "epoch": 1.48, + "learning_rate": 3.2081702584991786e-05, + "loss": 0.1692, + "step": 75300 + }, + { + "epoch": 1.49, + "learning_rate": 3.191321903867988e-05, + "loss": 0.1689, + "step": 75400 + }, + { + "epoch": 1.49, + "learning_rate": 3.174526136170039e-05, + "loss": 0.1691, + "step": 75500 + }, + { + "epoch": 1.49, + "learning_rate": 3.157783139081155e-05, + "loss": 0.1686, + "step": 75600 + }, + { + "epoch": 1.49, + "learning_rate": 3.141093095700072e-05, + "loss": 0.1687, + "step": 75700 + }, + { + "epoch": 1.5, + "learning_rate": 3.1244561885464244e-05, + "loss": 0.1683, + "step": 75800 + }, + { + "epoch": 1.5, + "learning_rate": 3.107872599558769e-05, + "loss": 0.1687, + "step": 75900 + }, + { + "epoch": 1.5, + "learning_rate": 3.0913425100925795e-05, + "loss": 0.1685, + "step": 76000 + }, + { + "epoch": 1.5, + "eval_runtime": 47.5367, + "eval_samples_per_second": 227.193, + "eval_steps_per_second": 7.11, + "step": 76000 + }, + { + "epoch": 1.5, + "learning_rate": 3.0748661009182616e-05, + "loss": 0.1684, + "step": 76100 + }, + { + "epoch": 1.5, + "learning_rate": 3.0584435522191896e-05, + "loss": 0.1684, + "step": 76200 + }, + { + "epoch": 1.51, + "learning_rate": 3.0420750435897183e-05, + "loss": 0.1684, + "step": 76300 + }, + { + "epoch": 1.51, + "learning_rate": 3.025760754033246e-05, + "loss": 0.1679, + "step": 76400 + }, + { + "epoch": 1.51, + "learning_rate": 3.0095008619602206e-05, + "loss": 0.1676, + "step": 76500 + }, + { + "epoch": 1.51, + "learning_rate": 2.993295545186223e-05, + "loss": 0.1685, + "step": 76600 + }, + { + "epoch": 1.52, + "learning_rate": 2.977144980929996e-05, + "loss": 0.1681, + "step": 76700 + }, + { + "epoch": 1.52, + "learning_rate": 2.961049345811523e-05, + "loss": 0.1685, + "step": 76800 + }, + { + "epoch": 1.52, + "learning_rate": 2.945008815850097e-05, + "loss": 0.1679, + "step": 76900 + }, + { + "epoch": 1.52, + "learning_rate": 2.929023566462377e-05, + "loss": 0.1682, + "step": 77000 + }, + { + "epoch": 1.52, + "eval_runtime": 47.5934, + "eval_samples_per_second": 226.922, + "eval_steps_per_second": 7.102, + "step": 77000 + }, + { + "epoch": 1.53, + "learning_rate": 2.9130937724604947e-05, + "loss": 0.1678, + "step": 77100 + }, + { + "epoch": 1.53, + "learning_rate": 2.8972196080501208e-05, + "loss": 0.1678, + "step": 77200 + }, + { + "epoch": 1.53, + "learning_rate": 2.8814012468285748e-05, + "loss": 0.1682, + "step": 77300 + }, + { + "epoch": 1.53, + "learning_rate": 2.865638861782922e-05, + "loss": 0.1678, + "step": 77400 + }, + { + "epoch": 1.54, + "learning_rate": 2.849932625288079e-05, + "loss": 0.1681, + "step": 77500 + }, + { + "epoch": 1.54, + "learning_rate": 2.8342827091049336e-05, + "loss": 0.1678, + "step": 77600 + }, + { + "epoch": 1.54, + "learning_rate": 2.8186892843784587e-05, + "loss": 0.1677, + "step": 77700 + }, + { + "epoch": 1.54, + "learning_rate": 2.803152521635851e-05, + "loss": 0.1679, + "step": 77800 + }, + { + "epoch": 1.55, + "learning_rate": 2.7876725907846578e-05, + "loss": 0.1676, + "step": 77900 + }, + { + "epoch": 1.55, + "learning_rate": 2.7722496611109243e-05, + "loss": 0.167, + "step": 78000 + }, + { + "epoch": 1.55, + "eval_runtime": 47.8007, + "eval_samples_per_second": 225.938, + "eval_steps_per_second": 7.071, + "step": 78000 + }, + { + "epoch": 1.55, + "learning_rate": 2.7568839012773365e-05, + "loss": 0.1673, + "step": 78100 + }, + { + "epoch": 1.55, + "learning_rate": 2.7415754793213826e-05, + "loss": 0.1676, + "step": 78200 + }, + { + "epoch": 1.55, + "learning_rate": 2.7263245626535116e-05, + "loss": 0.1673, + "step": 78300 + }, + { + "epoch": 1.56, + "learning_rate": 2.7111313180553077e-05, + "loss": 0.1673, + "step": 78400 + }, + { + "epoch": 1.56, + "learning_rate": 2.6959959116776587e-05, + "loss": 0.1663, + "step": 78500 + }, + { + "epoch": 1.56, + "learning_rate": 2.6809185090389406e-05, + "loss": 0.1674, + "step": 78600 + }, + { + "epoch": 1.56, + "learning_rate": 2.6658992750232167e-05, + "loss": 0.1666, + "step": 78700 + }, + { + "epoch": 1.57, + "learning_rate": 2.6509383738784218e-05, + "loss": 0.1671, + "step": 78800 + }, + { + "epoch": 1.57, + "learning_rate": 2.6360359692145757e-05, + "loss": 0.1669, + "step": 78900 + }, + { + "epoch": 1.57, + "learning_rate": 2.6211922240019883e-05, + "loss": 0.1671, + "step": 79000 + }, + { + "epoch": 1.57, + "eval_runtime": 47.5708, + "eval_samples_per_second": 227.03, + "eval_steps_per_second": 7.105, + "step": 79000 + }, + { + "epoch": 1.57, + "learning_rate": 2.6064073005694758e-05, + "loss": 0.1669, + "step": 79100 + }, + { + "epoch": 1.58, + "learning_rate": 2.591681360602595e-05, + "loss": 0.1665, + "step": 79200 + }, + { + "epoch": 1.58, + "learning_rate": 2.577014565141866e-05, + "loss": 0.1666, + "step": 79300 + }, + { + "epoch": 1.58, + "learning_rate": 2.562407074581014e-05, + "loss": 0.1663, + "step": 79400 + }, + { + "epoch": 1.58, + "learning_rate": 2.5478590486652137e-05, + "loss": 0.1667, + "step": 79500 + }, + { + "epoch": 1.59, + "learning_rate": 2.533370646489347e-05, + "loss": 0.1665, + "step": 79600 + }, + { + "epoch": 1.59, + "learning_rate": 2.5189420264962586e-05, + "loss": 0.1658, + "step": 79700 + }, + { + "epoch": 1.59, + "learning_rate": 2.504573346475026e-05, + "loss": 0.1665, + "step": 79800 + }, + { + "epoch": 1.59, + "learning_rate": 2.4902647635592324e-05, + "loss": 0.1666, + "step": 79900 + }, + { + "epoch": 1.6, + "learning_rate": 2.476016434225246e-05, + "loss": 0.1662, + "step": 80000 + }, + { + "epoch": 1.6, + "eval_runtime": 47.6449, + "eval_samples_per_second": 226.677, + "eval_steps_per_second": 7.094, + "step": 80000 + }, + { + "epoch": 1.6, + "learning_rate": 2.461828514290513e-05, + "loss": 0.1663, + "step": 80100 + }, + { + "epoch": 1.6, + "learning_rate": 2.447701158911855e-05, + "loss": 0.1664, + "step": 80200 + }, + { + "epoch": 1.6, + "learning_rate": 2.4336345225837658e-05, + "loss": 0.1664, + "step": 80300 + }, + { + "epoch": 1.6, + "learning_rate": 2.4196287591367296e-05, + "loss": 0.1656, + "step": 80400 + }, + { + "epoch": 1.61, + "learning_rate": 2.405684021735527e-05, + "loss": 0.166, + "step": 80500 + }, + { + "epoch": 1.61, + "learning_rate": 2.3918004628775736e-05, + "loss": 0.166, + "step": 80600 + }, + { + "epoch": 1.61, + "learning_rate": 2.3779782343912463e-05, + "loss": 0.1656, + "step": 80700 + }, + { + "epoch": 1.61, + "learning_rate": 2.364217487434221e-05, + "loss": 0.1665, + "step": 80800 + }, + { + "epoch": 1.62, + "learning_rate": 2.3505183724918196e-05, + "loss": 0.1653, + "step": 80900 + }, + { + "epoch": 1.62, + "learning_rate": 2.3368810393753687e-05, + "loss": 0.166, + "step": 81000 + }, + { + "epoch": 1.62, + "eval_runtime": 47.4693, + "eval_samples_per_second": 227.516, + "eval_steps_per_second": 7.12, + "step": 81000 + }, + { + "epoch": 1.62, + "learning_rate": 2.32330563722056e-05, + "loss": 0.1653, + "step": 81100 + }, + { + "epoch": 1.62, + "learning_rate": 2.309792314485815e-05, + "loss": 0.1659, + "step": 81200 + }, + { + "epoch": 1.63, + "learning_rate": 2.2963412189506695e-05, + "loss": 0.1652, + "step": 81300 + }, + { + "epoch": 1.63, + "learning_rate": 2.282952497714145e-05, + "loss": 0.1658, + "step": 81400 + }, + { + "epoch": 1.63, + "learning_rate": 2.2696262971931538e-05, + "loss": 0.1649, + "step": 81500 + }, + { + "epoch": 1.63, + "learning_rate": 2.2563627631208887e-05, + "loss": 0.1657, + "step": 81600 + }, + { + "epoch": 1.64, + "learning_rate": 2.2431620405452336e-05, + "loss": 0.1656, + "step": 81700 + }, + { + "epoch": 1.64, + "learning_rate": 2.230024273827179e-05, + "loss": 0.1655, + "step": 81800 + }, + { + "epoch": 1.64, + "learning_rate": 2.216949606639231e-05, + "loss": 0.1659, + "step": 81900 + }, + { + "epoch": 1.64, + "learning_rate": 2.2039381819638596e-05, + "loss": 0.1649, + "step": 82000 + }, + { + "epoch": 1.64, + "eval_runtime": 47.7941, + "eval_samples_per_second": 225.969, + "eval_steps_per_second": 7.072, + "step": 82000 + }, + { + "epoch": 1.65, + "learning_rate": 2.1909901420919184e-05, + "loss": 0.1655, + "step": 82100 + }, + { + "epoch": 1.65, + "learning_rate": 2.1781056286210997e-05, + "loss": 0.1653, + "step": 82200 + }, + { + "epoch": 1.65, + "learning_rate": 2.1652847824543744e-05, + "loss": 0.1651, + "step": 82300 + }, + { + "epoch": 1.65, + "learning_rate": 2.1525277437984636e-05, + "loss": 0.165, + "step": 82400 + }, + { + "epoch": 1.65, + "learning_rate": 2.1398346521623e-05, + "loss": 0.1652, + "step": 82500 + }, + { + "epoch": 1.66, + "learning_rate": 2.1272056463554978e-05, + "loss": 0.1645, + "step": 82600 + }, + { + "epoch": 1.66, + "learning_rate": 2.114640864486845e-05, + "loss": 0.165, + "step": 82700 + }, + { + "epoch": 1.66, + "learning_rate": 2.1021404439627775e-05, + "loss": 0.165, + "step": 82800 + }, + { + "epoch": 1.66, + "learning_rate": 2.089704521485896e-05, + "loss": 0.1652, + "step": 82900 + }, + { + "epoch": 1.67, + "learning_rate": 2.0773332330534513e-05, + "loss": 0.1655, + "step": 83000 + }, + { + "epoch": 1.67, + "eval_runtime": 47.9606, + "eval_samples_per_second": 225.185, + "eval_steps_per_second": 7.047, + "step": 83000 + }, + { + "epoch": 1.67, + "learning_rate": 2.0650267139558772e-05, + "loss": 0.1651, + "step": 83100 + }, + { + "epoch": 1.67, + "learning_rate": 2.052785098775293e-05, + "loss": 0.1637, + "step": 83200 + }, + { + "epoch": 1.67, + "learning_rate": 2.04060852138404e-05, + "loss": 0.1651, + "step": 83300 + }, + { + "epoch": 1.68, + "learning_rate": 2.028497114943219e-05, + "loss": 0.1646, + "step": 83400 + }, + { + "epoch": 1.68, + "learning_rate": 2.0164510119012263e-05, + "loss": 0.1652, + "step": 83500 + }, + { + "epoch": 1.68, + "learning_rate": 2.0044703439923217e-05, + "loss": 0.1649, + "step": 83600 + }, + { + "epoch": 1.68, + "learning_rate": 1.9925552422351654e-05, + "loss": 0.1652, + "step": 83700 + }, + { + "epoch": 1.69, + "learning_rate": 1.9807058369314016e-05, + "loss": 0.1644, + "step": 83800 + }, + { + "epoch": 1.69, + "learning_rate": 1.968922257664231e-05, + "loss": 0.1647, + "step": 83900 + }, + { + "epoch": 1.69, + "learning_rate": 1.9572046332969825e-05, + "loss": 0.1638, + "step": 84000 + }, + { + "epoch": 1.69, + "eval_runtime": 47.6904, + "eval_samples_per_second": 226.46, + "eval_steps_per_second": 7.087, + "step": 84000 + }, + { + "epoch": 1.69, + "learning_rate": 1.945553091971727e-05, + "loss": 0.1646, + "step": 84100 + }, + { + "epoch": 1.7, + "learning_rate": 1.933967761107847e-05, + "loss": 0.165, + "step": 84200 + }, + { + "epoch": 1.7, + "learning_rate": 1.9224487674006694e-05, + "loss": 0.164, + "step": 84300 + }, + { + "epoch": 1.7, + "learning_rate": 1.9109962368200602e-05, + "loss": 0.1646, + "step": 84400 + }, + { + "epoch": 1.7, + "learning_rate": 1.8996102946090586e-05, + "loss": 0.1647, + "step": 84500 + }, + { + "epoch": 1.71, + "learning_rate": 1.888291065282509e-05, + "loss": 0.1642, + "step": 84600 + }, + { + "epoch": 1.71, + "learning_rate": 1.8770386726256865e-05, + "loss": 0.1634, + "step": 84700 + }, + { + "epoch": 1.71, + "learning_rate": 1.8658532396929565e-05, + "loss": 0.1638, + "step": 84800 + }, + { + "epoch": 1.71, + "learning_rate": 1.8547348888064178e-05, + "loss": 0.1642, + "step": 84900 + }, + { + "epoch": 1.71, + "learning_rate": 1.8436837415545772e-05, + "loss": 0.1646, + "step": 85000 + }, + { + "epoch": 1.71, + "eval_runtime": 47.7791, + "eval_samples_per_second": 226.04, + "eval_steps_per_second": 7.074, + "step": 85000 + }, + { + "epoch": 2.0, + "learning_rate": 1.8326999187910095e-05, + "loss": 0.1625, + "step": 85100 + }, + { + "epoch": 2.0, + "learning_rate": 1.8217835406330415e-05, + "loss": 0.1622, + "step": 85200 + }, + { + "epoch": 2.01, + "learning_rate": 1.810934726460436e-05, + "loss": 0.1628, + "step": 85300 + }, + { + "epoch": 2.01, + "learning_rate": 1.800153594914084e-05, + "loss": 0.1613, + "step": 85400 + }, + { + "epoch": 2.01, + "learning_rate": 1.7894402638947176e-05, + "loss": 0.1622, + "step": 85500 + }, + { + "epoch": 2.01, + "learning_rate": 1.778794850561604e-05, + "loss": 0.1622, + "step": 85600 + }, + { + "epoch": 2.02, + "learning_rate": 1.7682174713312805e-05, + "loss": 0.1615, + "step": 85700 + }, + { + "epoch": 2.02, + "learning_rate": 1.75770824187627e-05, + "loss": 0.1621, + "step": 85800 + }, + { + "epoch": 2.02, + "learning_rate": 1.747267277123821e-05, + "loss": 0.1623, + "step": 85900 + }, + { + "epoch": 2.02, + "learning_rate": 1.7368946912546556e-05, + "loss": 0.1622, + "step": 86000 + }, + { + "epoch": 2.02, + "eval_runtime": 45.2924, + "eval_samples_per_second": 238.45, + "eval_steps_per_second": 7.463, + "step": 86000 + }, + { + "epoch": 2.03, + "learning_rate": 1.726590597701708e-05, + "loss": 0.1623, + "step": 86100 + }, + { + "epoch": 2.03, + "learning_rate": 1.7163551091488952e-05, + "loss": 0.1619, + "step": 86200 + }, + { + "epoch": 2.03, + "learning_rate": 1.7061883375298788e-05, + "loss": 0.1622, + "step": 86300 + }, + { + "epoch": 2.03, + "learning_rate": 1.6960903940268456e-05, + "loss": 0.1613, + "step": 86400 + }, + { + "epoch": 2.04, + "learning_rate": 1.6860613890692876e-05, + "loss": 0.1615, + "step": 86500 + }, + { + "epoch": 2.04, + "learning_rate": 1.6761014323327962e-05, + "loss": 0.1613, + "step": 86600 + }, + { + "epoch": 2.04, + "learning_rate": 1.6662106327378645e-05, + "loss": 0.1612, + "step": 86700 + }, + { + "epoch": 2.04, + "learning_rate": 1.6563890984486884e-05, + "loss": 0.1617, + "step": 86800 + }, + { + "epoch": 2.05, + "learning_rate": 1.6466369368719955e-05, + "loss": 0.1614, + "step": 86900 + }, + { + "epoch": 2.05, + "learning_rate": 1.6369542546558626e-05, + "loss": 0.1608, + "step": 87000 + }, + { + "epoch": 2.05, + "eval_runtime": 45.3963, + "eval_samples_per_second": 237.905, + "eval_steps_per_second": 7.446, + "step": 87000 + }, + { + "epoch": 2.05, + "learning_rate": 1.6273411576885517e-05, + "loss": 0.1601, + "step": 87100 + }, + { + "epoch": 2.05, + "learning_rate": 1.617797751097349e-05, + "loss": 0.1617, + "step": 87200 + }, + { + "epoch": 2.05, + "learning_rate": 1.608324139247421e-05, + "loss": 0.1618, + "step": 87300 + }, + { + "epoch": 2.06, + "learning_rate": 1.5989204257406693e-05, + "loss": 0.1616, + "step": 87400 + }, + { + "epoch": 2.06, + "learning_rate": 1.5895867134145974e-05, + "loss": 0.1618, + "step": 87500 + }, + { + "epoch": 2.06, + "learning_rate": 1.5803231043411912e-05, + "loss": 0.1611, + "step": 87600 + }, + { + "epoch": 2.06, + "learning_rate": 1.5711296998257902e-05, + "loss": 0.1611, + "step": 87700 + }, + { + "epoch": 2.07, + "learning_rate": 1.562006600405996e-05, + "loss": 0.1608, + "step": 87800 + }, + { + "epoch": 2.07, + "learning_rate": 1.5529539058505624e-05, + "loss": 0.1612, + "step": 87900 + }, + { + "epoch": 2.07, + "learning_rate": 1.543971715158307e-05, + "loss": 0.1611, + "step": 88000 + }, + { + "epoch": 2.07, + "eval_runtime": 45.4783, + "eval_samples_per_second": 237.476, + "eval_steps_per_second": 7.432, + "step": 88000 + }, + { + "epoch": 2.07, + "learning_rate": 1.535060126557028e-05, + "loss": 0.1601, + "step": 88100 + }, + { + "epoch": 2.08, + "learning_rate": 1.5262192375024284e-05, + "loss": 0.1615, + "step": 88200 + }, + { + "epoch": 2.08, + "learning_rate": 1.5174491446770566e-05, + "loss": 0.161, + "step": 88300 + }, + { + "epoch": 2.08, + "learning_rate": 1.508749943989242e-05, + "loss": 0.1612, + "step": 88400 + }, + { + "epoch": 2.08, + "learning_rate": 1.500121730572051e-05, + "loss": 0.161, + "step": 88500 + }, + { + "epoch": 2.09, + "learning_rate": 1.4915645987822406e-05, + "loss": 0.1613, + "step": 88600 + }, + { + "epoch": 2.09, + "learning_rate": 1.4830786421992347e-05, + "loss": 0.1611, + "step": 88700 + }, + { + "epoch": 2.09, + "learning_rate": 1.4746639536240942e-05, + "loss": 0.161, + "step": 88800 + }, + { + "epoch": 2.09, + "learning_rate": 1.4663206250785055e-05, + "loss": 0.1605, + "step": 88900 + }, + { + "epoch": 2.1, + "learning_rate": 1.4580487478037748e-05, + "loss": 0.1609, + "step": 89000 + }, + { + "epoch": 2.1, + "eval_runtime": 45.5774, + "eval_samples_per_second": 236.96, + "eval_steps_per_second": 7.416, + "step": 89000 + }, + { + "epoch": 2.1, + "learning_rate": 1.4498484122598232e-05, + "loss": 0.1615, + "step": 89100 + }, + { + "epoch": 2.1, + "learning_rate": 1.4417197081242083e-05, + "loss": 0.1605, + "step": 89200 + }, + { + "epoch": 2.1, + "learning_rate": 1.433662724291136e-05, + "loss": 0.1596, + "step": 89300 + }, + { + "epoch": 2.1, + "learning_rate": 1.4256775488704904e-05, + "loss": 0.1608, + "step": 89400 + }, + { + "epoch": 2.11, + "learning_rate": 1.4177642691868717e-05, + "loss": 0.1608, + "step": 89500 + }, + { + "epoch": 2.11, + "learning_rate": 1.4099229717786368e-05, + "loss": 0.1609, + "step": 89600 + }, + { + "epoch": 2.11, + "learning_rate": 1.4021537423969588e-05, + "loss": 0.1607, + "step": 89700 + }, + { + "epoch": 2.11, + "learning_rate": 1.3944566660048863e-05, + "loss": 0.1603, + "step": 89800 + }, + { + "epoch": 2.12, + "learning_rate": 1.3868318267764128e-05, + "loss": 0.1608, + "step": 89900 + }, + { + "epoch": 2.12, + "learning_rate": 1.3792793080955574e-05, + "loss": 0.1607, + "step": 90000 + }, + { + "epoch": 2.12, + "eval_runtime": 45.4935, + "eval_samples_per_second": 237.397, + "eval_steps_per_second": 7.43, + "step": 90000 + }, + { + "epoch": 2.12, + "learning_rate": 1.3717991925554562e-05, + "loss": 0.1608, + "step": 90100 + }, + { + "epoch": 2.12, + "learning_rate": 1.3643915619574529e-05, + "loss": 0.1604, + "step": 90200 + }, + { + "epoch": 2.13, + "learning_rate": 1.35705649731021e-05, + "loss": 0.1607, + "step": 90300 + }, + { + "epoch": 2.13, + "learning_rate": 1.3497940788288195e-05, + "loss": 0.1602, + "step": 90400 + }, + { + "epoch": 2.13, + "learning_rate": 1.3426043859339253e-05, + "loss": 0.1595, + "step": 90500 + }, + { + "epoch": 2.13, + "learning_rate": 1.3354874972508582e-05, + "loss": 0.1598, + "step": 90600 + }, + { + "epoch": 2.14, + "learning_rate": 1.3284434906087695e-05, + "loss": 0.1602, + "step": 90700 + }, + { + "epoch": 2.14, + "learning_rate": 1.3214724430397915e-05, + "loss": 0.1599, + "step": 90800 + }, + { + "epoch": 2.14, + "learning_rate": 1.314574430778182e-05, + "loss": 0.1598, + "step": 90900 + }, + { + "epoch": 2.14, + "learning_rate": 1.3077495292594966e-05, + "loss": 0.1604, + "step": 91000 + }, + { + "epoch": 2.14, + "eval_runtime": 45.6099, + "eval_samples_per_second": 236.791, + "eval_steps_per_second": 7.411, + "step": 91000 + }, + { + "epoch": 2.15, + "learning_rate": 1.3009978131197669e-05, + "loss": 0.16, + "step": 91100 + }, + { + "epoch": 2.15, + "learning_rate": 1.2943193561946762e-05, + "loss": 0.1604, + "step": 91200 + }, + { + "epoch": 2.15, + "learning_rate": 1.2877142315187628e-05, + "loss": 0.1597, + "step": 91300 + }, + { + "epoch": 2.15, + "learning_rate": 1.28118251132461e-05, + "loss": 0.1601, + "step": 91400 + }, + { + "epoch": 2.15, + "learning_rate": 1.274724267042063e-05, + "loss": 0.1599, + "step": 91500 + }, + { + "epoch": 2.16, + "learning_rate": 1.2683395692974472e-05, + "loss": 0.1595, + "step": 91600 + }, + { + "epoch": 2.16, + "learning_rate": 1.2620284879127947e-05, + "loss": 0.1598, + "step": 91700 + }, + { + "epoch": 2.16, + "learning_rate": 1.2557910919050803e-05, + "loss": 0.1602, + "step": 91800 + }, + { + "epoch": 2.16, + "learning_rate": 1.2496274494854666e-05, + "loss": 0.1596, + "step": 91900 + }, + { + "epoch": 2.17, + "learning_rate": 1.24353762805856e-05, + "loss": 0.1601, + "step": 92000 + }, + { + "epoch": 2.17, + "eval_runtime": 45.5979, + "eval_samples_per_second": 236.853, + "eval_steps_per_second": 7.413, + "step": 92000 + }, + { + "epoch": 2.17, + "learning_rate": 1.2375216942216713e-05, + "loss": 0.1596, + "step": 92100 + }, + { + "epoch": 2.17, + "learning_rate": 1.2315797137640906e-05, + "loss": 0.1598, + "step": 92200 + }, + { + "epoch": 2.17, + "learning_rate": 1.225711751666363e-05, + "loss": 0.16, + "step": 92300 + }, + { + "epoch": 2.18, + "learning_rate": 1.2199178720995825e-05, + "loss": 0.1598, + "step": 92400 + }, + { + "epoch": 2.18, + "learning_rate": 1.2141981384246874e-05, + "loss": 0.1593, + "step": 92500 + }, + { + "epoch": 2.18, + "learning_rate": 1.2085526131917685e-05, + "loss": 0.1602, + "step": 92600 + }, + { + "epoch": 2.18, + "learning_rate": 1.2029813581393866e-05, + "loss": 0.1598, + "step": 92700 + }, + { + "epoch": 2.19, + "learning_rate": 1.197484434193893e-05, + "loss": 0.1593, + "step": 92800 + }, + { + "epoch": 2.19, + "learning_rate": 1.192061901468768e-05, + "loss": 0.1597, + "step": 92900 + }, + { + "epoch": 2.19, + "learning_rate": 1.1867138192639601e-05, + "loss": 0.16, + "step": 93000 + }, + { + "epoch": 2.19, + "eval_runtime": 45.5858, + "eval_samples_per_second": 236.916, + "eval_steps_per_second": 7.415, + "step": 93000 + }, + { + "epoch": 2.19, + "learning_rate": 1.1814402460652382e-05, + "loss": 0.159, + "step": 93100 + }, + { + "epoch": 2.2, + "learning_rate": 1.176241239543558e-05, + "loss": 0.1597, + "step": 93200 + }, + { + "epoch": 2.2, + "learning_rate": 1.171116856554418e-05, + "loss": 0.1594, + "step": 93300 + }, + { + "epoch": 2.2, + "learning_rate": 1.1660671531372517e-05, + "loss": 0.1591, + "step": 93400 + }, + { + "epoch": 2.2, + "learning_rate": 1.1610921845148052e-05, + "loss": 0.1587, + "step": 93500 + }, + { + "epoch": 2.2, + "learning_rate": 1.156192005092539e-05, + "loss": 0.1593, + "step": 93600 + }, + { + "epoch": 2.21, + "learning_rate": 1.1513666684580308e-05, + "loss": 0.1593, + "step": 93700 + }, + { + "epoch": 2.21, + "learning_rate": 1.1466162273803876e-05, + "loss": 0.1587, + "step": 93800 + }, + { + "epoch": 2.21, + "learning_rate": 1.1419407338096732e-05, + "loss": 0.1598, + "step": 93900 + }, + { + "epoch": 2.21, + "learning_rate": 1.1373402388763346e-05, + "loss": 0.159, + "step": 94000 + }, + { + "epoch": 2.21, + "eval_runtime": 45.6844, + "eval_samples_per_second": 236.404, + "eval_steps_per_second": 7.399, + "step": 94000 + }, + { + "epoch": 2.22, + "learning_rate": 1.1328147928906494e-05, + "loss": 0.1594, + "step": 94100 + }, + { + "epoch": 2.22, + "learning_rate": 1.1283644453421678e-05, + "loss": 0.1593, + "step": 94200 + }, + { + "epoch": 2.22, + "learning_rate": 1.1239892448991798e-05, + "loss": 0.1593, + "step": 94300 + }, + { + "epoch": 2.22, + "learning_rate": 1.1196892394081743e-05, + "loss": 0.1588, + "step": 94400 + }, + { + "epoch": 2.23, + "learning_rate": 1.1154644758933235e-05, + "loss": 0.1598, + "step": 94500 + }, + { + "epoch": 2.23, + "learning_rate": 1.1113150005559644e-05, + "loss": 0.1587, + "step": 94600 + }, + { + "epoch": 2.23, + "learning_rate": 1.1072408587740942e-05, + "loss": 0.1589, + "step": 94700 + }, + { + "epoch": 2.23, + "learning_rate": 1.1032420951018755e-05, + "loss": 0.1594, + "step": 94800 + }, + { + "epoch": 2.24, + "learning_rate": 1.0993187532691458e-05, + "loss": 0.1593, + "step": 94900 + }, + { + "epoch": 2.24, + "learning_rate": 1.0954708761809438e-05, + "loss": 0.1588, + "step": 95000 + }, + { + "epoch": 2.24, + "eval_runtime": 45.7912, + "eval_samples_per_second": 235.853, + "eval_steps_per_second": 7.381, + "step": 95000 + }, + { + "epoch": 2.24, + "learning_rate": 1.091698505917036e-05, + "loss": 0.1587, + "step": 95100 + }, + { + "epoch": 2.24, + "learning_rate": 1.0880016837314599e-05, + "loss": 0.1592, + "step": 95200 + }, + { + "epoch": 2.25, + "learning_rate": 1.084380450052071e-05, + "loss": 0.159, + "step": 95300 + }, + { + "epoch": 2.25, + "learning_rate": 1.0808348444801e-05, + "loss": 0.1592, + "step": 95400 + }, + { + "epoch": 2.25, + "learning_rate": 1.0773649057897206e-05, + "loss": 0.1588, + "step": 95500 + }, + { + "epoch": 2.25, + "learning_rate": 1.073970671927628e-05, + "loss": 0.1585, + "step": 95600 + }, + { + "epoch": 2.25, + "learning_rate": 1.0706521800126198e-05, + "loss": 0.1593, + "step": 95700 + }, + { + "epoch": 2.26, + "learning_rate": 1.0674094663351906e-05, + "loss": 0.1595, + "step": 95800 + }, + { + "epoch": 2.26, + "learning_rate": 1.0642425663571383e-05, + "loss": 0.1593, + "step": 95900 + }, + { + "epoch": 2.26, + "learning_rate": 1.0611515147111736e-05, + "loss": 0.1587, + "step": 96000 + }, + { + "epoch": 2.26, + "eval_runtime": 45.5789, + "eval_samples_per_second": 236.952, + "eval_steps_per_second": 7.416, + "step": 96000 + }, + { + "epoch": 2.26, + "learning_rate": 1.0581363452005424e-05, + "loss": 0.1583, + "step": 96100 + }, + { + "epoch": 2.27, + "learning_rate": 1.0551970907986557e-05, + "loss": 0.1584, + "step": 96200 + }, + { + "epoch": 2.27, + "learning_rate": 1.0523337836487271e-05, + "loss": 0.1585, + "step": 96300 + }, + { + "epoch": 2.27, + "learning_rate": 1.0495464550634267e-05, + "loss": 0.1583, + "step": 96400 + }, + { + "epoch": 2.27, + "learning_rate": 1.046835135524533e-05, + "loss": 0.1587, + "step": 96500 + }, + { + "epoch": 2.28, + "learning_rate": 1.044199854682601e-05, + "loss": 0.1588, + "step": 96600 + }, + { + "epoch": 2.28, + "learning_rate": 1.0416406413566414e-05, + "loss": 0.1586, + "step": 96700 + }, + { + "epoch": 2.28, + "learning_rate": 1.0391575235337991e-05, + "loss": 0.1596, + "step": 96800 + }, + { + "epoch": 2.28, + "learning_rate": 1.0367505283690547e-05, + "loss": 0.1585, + "step": 96900 + }, + { + "epoch": 2.29, + "learning_rate": 1.0344196821849202e-05, + "loss": 0.1584, + "step": 97000 + }, + { + "epoch": 2.29, + "eval_runtime": 45.8042, + "eval_samples_per_second": 235.786, + "eval_steps_per_second": 7.379, + "step": 97000 + }, + { + "epoch": 2.29, + "learning_rate": 1.032165010471157e-05, + "loss": 0.1582, + "step": 97100 + }, + { + "epoch": 2.29, + "learning_rate": 1.0299865378844936e-05, + "loss": 0.1586, + "step": 97200 + }, + { + "epoch": 2.29, + "learning_rate": 1.0278842882483569e-05, + "loss": 0.1587, + "step": 97300 + }, + { + "epoch": 2.3, + "learning_rate": 1.025858284552612e-05, + "loss": 0.1594, + "step": 97400 + }, + { + "epoch": 2.3, + "learning_rate": 1.023908548953311e-05, + "loss": 0.1593, + "step": 97500 + }, + { + "epoch": 2.3, + "learning_rate": 1.02203510277245e-05, + "loss": 0.158, + "step": 97600 + }, + { + "epoch": 2.3, + "learning_rate": 1.0202379664977364e-05, + "loss": 0.1582, + "step": 97700 + }, + { + "epoch": 2.3, + "learning_rate": 1.018517159782365e-05, + "loss": 0.1586, + "step": 97800 + }, + { + "epoch": 2.31, + "learning_rate": 1.0168727014448004e-05, + "loss": 0.1586, + "step": 97900 + }, + { + "epoch": 2.31, + "learning_rate": 1.0153046094685783e-05, + "loss": 0.1591, + "step": 98000 + }, + { + "epoch": 2.31, + "eval_runtime": 45.7753, + "eval_samples_per_second": 235.935, + "eval_steps_per_second": 7.384, + "step": 98000 + }, + { + "epoch": 2.31, + "learning_rate": 1.0138129010020992e-05, + "loss": 0.1586, + "step": 98100 + }, + { + "epoch": 2.31, + "learning_rate": 1.0123975923584488e-05, + "loss": 0.1588, + "step": 98200 + }, + { + "epoch": 2.32, + "learning_rate": 1.0110586990152152e-05, + "loss": 0.1585, + "step": 98300 + }, + { + "epoch": 2.32, + "learning_rate": 1.0097962356143219e-05, + "loss": 0.1583, + "step": 98400 + }, + { + "epoch": 2.32, + "learning_rate": 1.0086102159618668e-05, + "loss": 0.1584, + "step": 98500 + }, + { + "epoch": 2.32, + "learning_rate": 1.0075006530279694e-05, + "loss": 0.1579, + "step": 98600 + }, + { + "epoch": 2.33, + "learning_rate": 1.0064675589466339e-05, + "loss": 0.1586, + "step": 98700 + }, + { + "epoch": 2.33, + "learning_rate": 1.0055109450156098e-05, + "loss": 0.1583, + "step": 98800 + }, + { + "epoch": 2.33, + "learning_rate": 1.0046308216962759e-05, + "loss": 0.1585, + "step": 98900 + }, + { + "epoch": 2.33, + "learning_rate": 1.0038271986135177e-05, + "loss": 0.1581, + "step": 99000 + }, + { + "epoch": 2.33, + "eval_runtime": 45.6993, + "eval_samples_per_second": 236.327, + "eval_steps_per_second": 7.396, + "step": 99000 + }, + { + "epoch": 2.34, + "learning_rate": 1.0031000845556304e-05, + "loss": 0.1592, + "step": 99100 + }, + { + "epoch": 2.34, + "learning_rate": 1.0024494874742152e-05, + "loss": 0.158, + "step": 99200 + }, + { + "epoch": 2.34, + "learning_rate": 1.0018754144840986e-05, + "loss": 0.1586, + "step": 99300 + }, + { + "epoch": 2.34, + "learning_rate": 1.0013778718632507e-05, + "loss": 0.1585, + "step": 99400 + }, + { + "epoch": 2.35, + "learning_rate": 1.000956865052717e-05, + "loss": 0.1582, + "step": 99500 + }, + { + "epoch": 2.35, + "learning_rate": 1.0006123986565623e-05, + "loss": 0.1579, + "step": 99600 + }, + { + "epoch": 2.35, + "learning_rate": 1.0003444764418138e-05, + "loss": 0.1585, + "step": 99700 + }, + { + "epoch": 2.35, + "learning_rate": 1.000153101338428e-05, + "loss": 0.1581, + "step": 99800 + }, + { + "epoch": 2.35, + "learning_rate": 1.00003827543925e-05, + "loss": 0.1578, + "step": 99900 + }, + { + "epoch": 2.36, + "learning_rate": 1e-05, + "loss": 0.1583, + "step": 100000 + }, + { + "epoch": 2.36, + "eval_runtime": 45.8451, + "eval_samples_per_second": 235.576, + "eval_steps_per_second": 7.373, + "step": 100000 + } + ], + "max_steps": 100000, + "num_train_epochs": 3, + "total_flos": 7.009987307584106e+21, + "trial_name": null, + "trial_params": null +}