diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,4744 +1,3379 @@ { - "best_metric": 6.480716228485107, - "best_model_checkpoint": "/data1/attanasiog/babylm/roberta-tiny-10M/checkpoint-5900", - "epoch": 31.184852374839537, - "global_step": 6050, + "best_metric": 2.7371606826782227, + "best_model_checkpoint": "/data1/attanasiog/babylm/roberta-tiny-10M/checkpoint-4150", + "epoch": 89.57546563904945, + "global_step": 4300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.05, - "learning_rate": 4.000000000000001e-06, - "loss": 10.8666, + "epoch": 0.21, + "learning_rate": 8e-05, + "loss": 10.4287, "step": 10 }, { - "epoch": 0.1, - "learning_rate": 8.000000000000001e-06, - "loss": 10.8334, + "epoch": 0.41, + "learning_rate": 0.00016, + "loss": 9.0477, "step": 20 }, { - "epoch": 0.15, - "learning_rate": 1.2e-05, - "loss": 10.7554, + "epoch": 0.62, + "learning_rate": 0.00024, + "loss": 7.8228, "step": 30 }, { - "epoch": 0.21, - "learning_rate": 1.6000000000000003e-05, - "loss": 10.6494, + "epoch": 0.82, + "learning_rate": 0.00032, + "loss": 7.3343, "step": 40 }, { - "epoch": 0.26, - "learning_rate": 2e-05, - "loss": 10.5287, + "epoch": 1.04, + "learning_rate": 0.0004, + "loss": 7.8031, "step": 50 }, { - "epoch": 0.26, - "eval_accuracy": 0.048115526656560687, - "eval_loss": 10.44776725769043, - "eval_runtime": 168.8841, - "eval_samples_per_second": 142.435, - "eval_steps_per_second": 2.226, + "epoch": 1.04, + "eval_accuracy": 0.06061240850112075, + "eval_loss": 7.355990886688232, + "eval_runtime": 145.9523, + "eval_samples_per_second": 164.814, + "eval_steps_per_second": 5.152, "step": 50 }, { - "epoch": 0.31, - "learning_rate": 2e-05, - "loss": 10.4004, + "epoch": 1.25, + "learning_rate": 0.000399995625676045, + "loss": 7.2898, "step": 60 }, { - "epoch": 0.36, - "learning_rate": 2e-05, - "loss": 10.2909, + "epoch": 1.45, + "learning_rate": 0.0003999825028955268, + "loss": 7.1829, "step": 70 }, { - "epoch": 0.41, - "learning_rate": 2e-05, - "loss": 10.1876, + "epoch": 1.66, + "learning_rate": 0.0003999606322324786, + "loss": 7.0831, "step": 80 }, { - "epoch": 0.46, - "learning_rate": 2e-05, - "loss": 10.092, + "epoch": 1.86, + "learning_rate": 0.0003999300146435939, + "loss": 6.8807, "step": 90 }, { - "epoch": 0.51, - "learning_rate": 2e-05, - "loss": 10.0073, + "epoch": 2.08, + "learning_rate": 0.00039989065146818525, + "loss": 7.1948, "step": 100 }, { - "epoch": 0.51, - "eval_accuracy": 0.04883696521337363, - "eval_loss": 9.954952239990234, - "eval_runtime": 168.8432, - "eval_samples_per_second": 142.469, - "eval_steps_per_second": 2.227, + "epoch": 2.08, + "eval_accuracy": 0.11823707432860285, + "eval_loss": 6.737408638000488, + "eval_runtime": 145.8622, + "eval_samples_per_second": 164.916, + "eval_steps_per_second": 5.156, "step": 100 }, { - "epoch": 0.56, - "learning_rate": 2e-05, - "loss": 9.9271, + "epoch": 2.29, + "learning_rate": 0.0003998425444281255, + "loss": 6.659, "step": 110 }, { - "epoch": 0.62, - "learning_rate": 2e-05, - "loss": 9.8418, + "epoch": 2.49, + "learning_rate": 0.00039978569562777234, + "loss": 6.5924, "step": 120 }, { - "epoch": 0.67, - "learning_rate": 2e-05, - "loss": 9.7731, + "epoch": 2.7, + "learning_rate": 0.0003997201075538765, + "loss": 6.5237, "step": 130 }, { - "epoch": 0.72, - "learning_rate": 2e-05, - "loss": 9.6976, + "epoch": 2.9, + "learning_rate": 0.0003996457830754729, + "loss": 6.4927, "step": 140 }, { - "epoch": 0.77, - "learning_rate": 2e-05, - "loss": 9.6268, + "epoch": 3.12, + "learning_rate": 0.00039956272544375493, + "loss": 6.8927, "step": 150 }, { - "epoch": 0.77, - "eval_accuracy": 0.04877340177584069, - "eval_loss": 9.586450576782227, - "eval_runtime": 168.9917, - "eval_samples_per_second": 142.344, - "eval_steps_per_second": 2.225, + "epoch": 3.12, + "eval_accuracy": 0.1414597356195163, + "eval_loss": 6.502169609069824, + "eval_runtime": 145.9635, + "eval_samples_per_second": 164.801, + "eval_steps_per_second": 5.152, "step": 150 }, { - "epoch": 0.82, - "learning_rate": 2e-05, - "loss": 9.5519, + "epoch": 3.33, + "learning_rate": 0.00039947093829193245, + "loss": 6.4155, "step": 160 }, { - "epoch": 0.87, - "learning_rate": 2e-05, - "loss": 9.4822, + "epoch": 3.53, + "learning_rate": 0.00039937042563507283, + "loss": 6.4041, "step": 170 }, { - "epoch": 0.92, - "learning_rate": 2e-05, - "loss": 9.4239, + "epoch": 3.74, + "learning_rate": 0.00039926119186992537, + "loss": 6.3875, "step": 180 }, { - "epoch": 0.98, - "learning_rate": 2e-05, - "loss": 9.3531, + "epoch": 3.95, + "learning_rate": 0.0003991432417747288, + "loss": 6.3543, "step": 190 }, { - "epoch": 1.03, - "learning_rate": 2e-05, - "loss": 9.9837, + "epoch": 4.16, + "learning_rate": 0.0003990165805090023, + "loss": 6.7339, "step": 200 }, { - "epoch": 1.03, - "eval_accuracy": 0.04709288795319612, - "eval_loss": 9.250182151794434, - "eval_runtime": 169.1628, - "eval_samples_per_second": 142.2, - "eval_steps_per_second": 2.223, + "epoch": 4.16, + "eval_accuracy": 0.1482938589304516, + "eval_loss": 6.400519847869873, + "eval_runtime": 145.8639, + "eval_samples_per_second": 164.914, + "eval_steps_per_second": 5.155, "step": 200 }, { - "epoch": 1.08, - "learning_rate": 2e-05, - "loss": 9.2265, + "epoch": 4.37, + "learning_rate": 0.00039888121361332003, + "loss": 6.3085, "step": 210 }, { - "epoch": 1.13, - "learning_rate": 2e-05, - "loss": 9.1555, + "epoch": 4.58, + "learning_rate": 0.0003987371470090686, + "loss": 6.3213, "step": 220 }, { - "epoch": 1.18, - "learning_rate": 2e-05, - "loss": 9.0872, + "epoch": 4.78, + "learning_rate": 0.00039858438699818784, + "loss": 6.2931, "step": 230 }, { - "epoch": 1.24, - "learning_rate": 2e-05, - "loss": 9.0276, + "epoch": 4.99, + "learning_rate": 0.0003984229402628956, + "loss": 6.2716, "step": 240 }, { - "epoch": 1.29, - "learning_rate": 2e-05, - "loss": 8.9701, + "epoch": 5.21, + "learning_rate": 0.00039825281386539503, + "loss": 6.6609, "step": 250 }, { - "epoch": 1.29, - "eval_accuracy": 0.04664432564609462, - "eval_loss": 8.936966896057129, - "eval_runtime": 168.9806, - "eval_samples_per_second": 142.354, - "eval_steps_per_second": 2.225, + "epoch": 5.21, + "eval_accuracy": 0.1509599365008845, + "eval_loss": 6.3535308837890625, + "eval_runtime": 145.9161, + "eval_samples_per_second": 164.855, + "eval_steps_per_second": 5.154, "step": 250 }, { - "epoch": 1.34, - "learning_rate": 2e-05, - "loss": 8.913, + "epoch": 5.41, + "learning_rate": 0.000398074015247566, + "loss": 6.2501, "step": 260 }, { - "epoch": 1.39, - "learning_rate": 2e-05, - "loss": 8.8506, + "epoch": 5.62, + "learning_rate": 0.0003978865522306392, + "loss": 6.2436, "step": 270 }, { - "epoch": 1.44, - "learning_rate": 2e-05, - "loss": 8.7763, + "epoch": 5.82, + "learning_rate": 0.0003976904330148543, + "loss": 6.2418, "step": 280 }, { - "epoch": 1.49, - "learning_rate": 2e-05, - "loss": 8.7262, + "epoch": 6.04, + "learning_rate": 0.00039748566617910113, + "loss": 6.6426, "step": 290 }, { - "epoch": 1.54, - "learning_rate": 2e-05, - "loss": 8.6689, + "epoch": 6.25, + "learning_rate": 0.0003972722606805445, + "loss": 6.1972, "step": 300 }, { - "epoch": 1.54, - "eval_accuracy": 0.04727021760311789, - "eval_loss": 8.644742965698242, - "eval_runtime": 168.948, - "eval_samples_per_second": 142.381, - "eval_steps_per_second": 2.226, + "epoch": 6.25, + "eval_accuracy": 0.15188271193711186, + "eval_loss": 6.332435607910156, + "eval_runtime": 146.0198, + "eval_samples_per_second": 164.738, + "eval_steps_per_second": 5.15, "step": 300 }, { - "epoch": 1.6, - "learning_rate": 2e-05, - "loss": 8.6119, + "epoch": 6.45, + "learning_rate": 0.00039705022585423216, + "loss": 6.2183, "step": 310 }, { - "epoch": 1.65, - "learning_rate": 2e-05, - "loss": 8.5511, + "epoch": 6.66, + "learning_rate": 0.0003968195714126868, + "loss": 6.1899, "step": 320 }, { - "epoch": 1.7, - "learning_rate": 2e-05, - "loss": 8.503, + "epoch": 6.86, + "learning_rate": 0.00039658030744548075, + "loss": 6.192, "step": 330 }, { - "epoch": 1.75, - "learning_rate": 2e-05, - "loss": 8.4466, + "epoch": 7.08, + "learning_rate": 0.0003963324444187952, + "loss": 6.5971, "step": 340 }, { - "epoch": 1.8, - "learning_rate": 2e-05, - "loss": 8.3893, + "epoch": 7.29, + "learning_rate": 0.0003960759931749619, + "loss": 6.1685, "step": 350 }, { - "epoch": 1.8, - "eval_accuracy": 0.04733514138538159, - "eval_loss": 8.379368782043457, - "eval_runtime": 169.22, - "eval_samples_per_second": 142.152, - "eval_steps_per_second": 2.222, + "epoch": 7.29, + "eval_accuracy": 0.15276707185574287, + "eval_loss": 6.302943706512451, + "eval_runtime": 145.9601, + "eval_samples_per_second": 164.805, + "eval_steps_per_second": 5.152, "step": 350 }, { - "epoch": 1.85, - "learning_rate": 2e-05, - "loss": 8.3375, + "epoch": 7.49, + "learning_rate": 0.00039581096493198893, + "loss": 6.1653, "step": 360 }, { - "epoch": 1.9, - "learning_rate": 2e-05, - "loss": 8.2824, + "epoch": 7.7, + "learning_rate": 0.0003955373712830703, + "loss": 6.1623, "step": 370 }, { - "epoch": 1.96, - "learning_rate": 2e-05, - "loss": 8.2317, + "epoch": 7.9, + "learning_rate": 0.00039525522419607854, + "loss": 6.1604, "step": 380 }, { - "epoch": 2.01, - "learning_rate": 2e-05, - "loss": 8.8178, + "epoch": 8.12, + "learning_rate": 0.0003949645360130412, + "loss": 6.5496, "step": 390 }, { - "epoch": 2.06, - "learning_rate": 2e-05, - "loss": 8.1697, + "epoch": 8.33, + "learning_rate": 0.0003946653194496012, + "loss": 6.1302, "step": 400 }, { - "epoch": 2.06, - "eval_accuracy": 0.050607646781225514, - "eval_loss": 8.134186744689941, - "eval_runtime": 169.087, - "eval_samples_per_second": 142.264, - "eval_steps_per_second": 2.224, + "epoch": 8.33, + "eval_accuracy": 0.152128546451089, + "eval_loss": 6.2827558517456055, + "eval_runtime": 145.9935, + "eval_samples_per_second": 164.768, + "eval_steps_per_second": 5.151, "step": 400 }, { - "epoch": 2.11, - "learning_rate": 2e-05, - "loss": 8.0758, + "epoch": 8.53, + "learning_rate": 0.00039435758759446025, + "loss": 6.1368, "step": 410 }, { - "epoch": 2.16, - "learning_rate": 2e-05, - "loss": 8.048, + "epoch": 8.74, + "learning_rate": 0.00039404135390880664, + "loss": 6.1171, "step": 420 }, { - "epoch": 2.22, - "learning_rate": 2e-05, - "loss": 7.9864, + "epoch": 8.95, + "learning_rate": 0.0003937166322257262, + "loss": 6.1463, "step": 430 }, { - "epoch": 2.27, - "learning_rate": 2e-05, - "loss": 7.9734, + "epoch": 9.16, + "learning_rate": 0.00039338343674959745, + "loss": 6.537, "step": 440 }, { - "epoch": 2.32, - "learning_rate": 2e-05, - "loss": 7.926, + "epoch": 9.37, + "learning_rate": 0.00039304178205546976, + "loss": 6.093, "step": 450 }, { - "epoch": 2.32, - "eval_accuracy": 0.06168861888884143, - "eval_loss": 7.922093868255615, - "eval_runtime": 168.9407, - "eval_samples_per_second": 142.387, - "eval_steps_per_second": 2.226, + "epoch": 9.37, + "eval_accuracy": 0.15364162638834264, + "eval_loss": 6.256844520568848, + "eval_runtime": 146.039, + "eval_samples_per_second": 164.716, + "eval_steps_per_second": 5.149, "step": 450 }, { - "epoch": 2.37, - "learning_rate": 2e-05, - "loss": 7.8712, + "epoch": 9.58, + "learning_rate": 0.00039269168308842634, + "loss": 6.0973, "step": 460 }, { - "epoch": 2.42, - "learning_rate": 2e-05, - "loss": 7.8416, + "epoch": 9.78, + "learning_rate": 0.00039233315516293006, + "loss": 6.1012, "step": 470 }, { - "epoch": 2.47, - "learning_rate": 2e-05, - "loss": 7.8118, + "epoch": 9.99, + "learning_rate": 0.00039196621396215403, + "loss": 6.0809, "step": 480 }, { - "epoch": 2.52, - "learning_rate": 2e-05, - "loss": 7.7706, + "epoch": 10.21, + "learning_rate": 0.000391590875537295, + "loss": 6.4765, "step": 490 }, { - "epoch": 2.58, - "learning_rate": 2e-05, - "loss": 7.7329, + "epoch": 10.41, + "learning_rate": 0.00039120715630687155, + "loss": 6.0543, "step": 500 }, { - "epoch": 2.58, - "eval_accuracy": 0.06273108274751771, - "eval_loss": 7.739767074584961, - "eval_runtime": 169.0756, - "eval_samples_per_second": 142.274, - "eval_steps_per_second": 2.224, + "epoch": 10.41, + "eval_accuracy": 0.15444620739515735, + "eval_loss": 6.24298620223999, + "eval_runtime": 145.9243, + "eval_samples_per_second": 164.846, + "eval_steps_per_second": 5.153, "step": 500 }, { - "epoch": 2.63, - "learning_rate": 2e-05, - "loss": 7.7048, + "epoch": 10.62, + "learning_rate": 0.000390815073056006, + "loss": 6.0777, "step": 510 }, { - "epoch": 2.68, - "learning_rate": 2e-05, - "loss": 7.6478, + "epoch": 10.82, + "learning_rate": 0.00039041464293568983, + "loss": 6.0697, "step": 520 }, { - "epoch": 2.73, - "learning_rate": 2e-05, - "loss": 7.6647, + "epoch": 11.04, + "learning_rate": 0.00039000588346203374, + "loss": 6.4636, "step": 530 }, { - "epoch": 2.78, - "learning_rate": 2e-05, - "loss": 7.6017, + "epoch": 11.25, + "learning_rate": 0.0003895888125155014, + "loss": 6.0487, "step": 540 }, { - "epoch": 2.83, - "learning_rate": 2e-05, - "loss": 7.582, + "epoch": 11.45, + "learning_rate": 0.00038916344834012695, + "loss": 6.0479, "step": 550 }, { - "epoch": 2.83, - "eval_accuracy": 0.06907213036340595, - "eval_loss": 7.5844244956970215, - "eval_runtime": 169.0372, - "eval_samples_per_second": 142.306, - "eval_steps_per_second": 2.224, + "epoch": 11.45, + "eval_accuracy": 0.1541217862327054, + "eval_loss": 6.234572887420654, + "eval_runtime": 145.8799, + "eval_samples_per_second": 164.896, + "eval_steps_per_second": 5.155, "step": 550 }, { - "epoch": 2.88, - "learning_rate": 2e-05, - "loss": 7.5512, + "epoch": 11.66, + "learning_rate": 0.00038872980954271757, + "loss": 6.0617, "step": 560 }, { - "epoch": 2.93, - "learning_rate": 2e-05, - "loss": 7.505, + "epoch": 11.86, + "learning_rate": 0.00038828791509203895, + "loss": 6.0441, "step": 570 }, { - "epoch": 2.99, - "learning_rate": 2e-05, - "loss": 7.523, + "epoch": 12.08, + "learning_rate": 0.00038783778431798597, + "loss": 6.4461, "step": 580 }, { - "epoch": 3.04, - "learning_rate": 2e-05, - "loss": 8.0083, + "epoch": 12.29, + "learning_rate": 0.0003873794369107369, + "loss": 6.0258, "step": 590 }, { - "epoch": 3.09, - "learning_rate": 2e-05, - "loss": 7.4419, + "epoch": 12.49, + "learning_rate": 0.0003869128929198922, + "loss": 6.0372, "step": 600 }, { - "epoch": 3.09, - "eval_accuracy": 0.07293786059545417, - "eval_loss": 7.46204137802124, - "eval_runtime": 169.0193, - "eval_samples_per_second": 142.321, - "eval_steps_per_second": 2.225, + "epoch": 12.49, + "eval_accuracy": 0.1545538581772011, + "eval_loss": 6.223215103149414, + "eval_runtime": 145.9665, + "eval_samples_per_second": 164.798, + "eval_steps_per_second": 5.152, "step": 600 }, { - "epoch": 3.14, - "learning_rate": 2e-05, - "loss": 7.3961, + "epoch": 12.7, + "learning_rate": 0.0003864381727535973, + "loss": 6.0353, "step": 610 }, { - "epoch": 3.2, - "learning_rate": 2e-05, - "loss": 7.3728, + "epoch": 12.9, + "learning_rate": 0.00038595529717765027, + "loss": 6.041, "step": 620 }, { - "epoch": 3.25, - "learning_rate": 2e-05, - "loss": 7.3709, + "epoch": 13.12, + "learning_rate": 0.0003854642873145931, + "loss": 6.4207, "step": 630 }, { - "epoch": 3.3, - "learning_rate": 2e-05, - "loss": 7.3295, + "epoch": 13.33, + "learning_rate": 0.00038496516464278776, + "loss": 6.006, "step": 640 }, { - "epoch": 3.35, - "learning_rate": 2e-05, - "loss": 7.3658, + "epoch": 13.53, + "learning_rate": 0.00038445795099547697, + "loss": 6.0127, "step": 650 }, { - "epoch": 3.35, - "eval_accuracy": 0.07808497071718955, - "eval_loss": 7.373513698577881, - "eval_runtime": 169.1773, - "eval_samples_per_second": 142.188, - "eval_steps_per_second": 2.223, + "epoch": 13.53, + "eval_accuracy": 0.15411265298876436, + "eval_loss": 6.213912010192871, + "eval_runtime": 145.9328, + "eval_samples_per_second": 164.836, + "eval_steps_per_second": 5.153, "step": 650 }, { - "epoch": 3.4, - "learning_rate": 2e-05, - "loss": 7.342, + "epoch": 13.74, + "learning_rate": 0.0003839426685598287, + "loss": 6.0006, "step": 660 }, { - "epoch": 3.45, - "learning_rate": 2e-05, - "loss": 7.3369, + "epoch": 13.95, + "learning_rate": 0.000383419339875966, + "loss": 6.0152, "step": 670 }, { - "epoch": 3.5, - "learning_rate": 2e-05, - "loss": 7.292, + "epoch": 14.16, + "learning_rate": 0.00038288798783598087, + "loss": 6.3908, "step": 680 }, { - "epoch": 3.55, - "learning_rate": 2e-05, - "loss": 7.2974, + "epoch": 14.37, + "learning_rate": 0.0003823486356829329, + "loss": 5.9744, "step": 690 }, { - "epoch": 3.61, - "learning_rate": 2e-05, - "loss": 7.2857, + "epoch": 14.58, + "learning_rate": 0.0003818013070098325, + "loss": 5.968, "step": 700 }, { - "epoch": 3.61, - "eval_accuracy": 0.08012986753100795, - "eval_loss": 7.304928302764893, - "eval_runtime": 169.0533, - "eval_samples_per_second": 142.292, - "eval_steps_per_second": 2.224, + "epoch": 14.58, + "eval_accuracy": 0.15472111663693397, + "eval_loss": 6.20527458190918, + "eval_runtime": 145.9446, + "eval_samples_per_second": 164.823, + "eval_steps_per_second": 5.153, "step": 700 }, { - "epoch": 3.66, - "learning_rate": 2e-05, - "loss": 7.2677, + "epoch": 14.78, + "learning_rate": 0.0003812460257586089, + "loss": 5.9813, "step": 710 }, { - "epoch": 3.71, - "learning_rate": 2e-05, - "loss": 7.2592, + "epoch": 14.99, + "learning_rate": 0.000380682816219063, + "loss": 6.0108, "step": 720 }, { - "epoch": 3.76, - "learning_rate": 2e-05, - "loss": 7.2564, + "epoch": 15.21, + "learning_rate": 0.00038011170302780446, + "loss": 6.3495, "step": 730 }, { - "epoch": 3.81, - "learning_rate": 2e-05, - "loss": 7.2111, + "epoch": 15.41, + "learning_rate": 0.00037953271116717444, + "loss": 5.9708, "step": 740 }, { - "epoch": 3.86, - "learning_rate": 2e-05, - "loss": 7.224, + "epoch": 15.62, + "learning_rate": 0.0003789458659641527, + "loss": 5.9635, "step": 750 }, { - "epoch": 3.86, - "eval_accuracy": 0.08311436007920397, - "eval_loss": 7.2554144859313965, - "eval_runtime": 169.0481, - "eval_samples_per_second": 142.297, - "eval_steps_per_second": 2.224, + "epoch": 15.62, + "eval_accuracy": 0.15486276242328167, + "eval_loss": 6.199557781219482, + "eval_runtime": 145.9791, + "eval_samples_per_second": 164.784, + "eval_steps_per_second": 5.151, "step": 750 }, { - "epoch": 3.91, - "learning_rate": 2e-05, - "loss": 7.2082, + "epoch": 15.82, + "learning_rate": 0.0003783511930892495, + "loss": 5.9756, "step": 760 }, { - "epoch": 3.97, - "learning_rate": 2e-05, - "loss": 7.2165, + "epoch": 16.04, + "learning_rate": 0.00037774871855538275, + "loss": 6.3631, "step": 770 }, { - "epoch": 4.02, - "learning_rate": 2e-05, - "loss": 7.7143, + "epoch": 16.25, + "learning_rate": 0.00037713846871674045, + "loss": 5.9497, "step": 780 }, { - "epoch": 4.07, - "learning_rate": 2e-05, - "loss": 7.1994, + "epoch": 16.45, + "learning_rate": 0.0003765204702676274, + "loss": 5.9433, "step": 790 }, { - "epoch": 4.12, - "learning_rate": 2e-05, - "loss": 7.1851, + "epoch": 16.66, + "learning_rate": 0.0003758947502412978, + "loss": 5.9479, "step": 800 }, { - "epoch": 4.12, - "eval_accuracy": 0.0853165114983084, - "eval_loss": 7.208171844482422, - "eval_runtime": 169.0677, - "eval_samples_per_second": 142.28, - "eval_steps_per_second": 2.224, + "epoch": 16.66, + "eval_accuracy": 0.15478355696794033, + "eval_loss": 6.195274353027344, + "eval_runtime": 145.939, + "eval_samples_per_second": 164.829, + "eval_steps_per_second": 5.153, "step": 800 }, { - "epoch": 4.17, - "learning_rate": 2e-05, - "loss": 7.1016, + "epoch": 16.86, + "learning_rate": 0.0003752613360087727, + "loss": 5.9614, "step": 810 }, { - "epoch": 4.23, - "learning_rate": 2e-05, - "loss": 7.1461, + "epoch": 17.08, + "learning_rate": 0.00037462025527764265, + "loss": 6.326, "step": 820 }, { - "epoch": 4.28, - "learning_rate": 2e-05, - "loss": 7.1433, + "epoch": 17.29, + "learning_rate": 0.00037397153609085553, + "loss": 5.9293, "step": 830 }, { - "epoch": 4.33, - "learning_rate": 2e-05, - "loss": 7.1318, + "epoch": 17.49, + "learning_rate": 0.0003733152068254901, + "loss": 5.9305, "step": 840 }, { - "epoch": 4.38, - "learning_rate": 2e-05, - "loss": 7.1327, + "epoch": 17.7, + "learning_rate": 0.00037265129619151483, + "loss": 5.9371, "step": 850 }, { - "epoch": 4.38, - "eval_accuracy": 0.08777750406730751, - "eval_loss": 7.16784143447876, - "eval_runtime": 169.0166, - "eval_samples_per_second": 142.323, - "eval_steps_per_second": 2.225, + "epoch": 17.7, + "eval_accuracy": 0.15451778319531595, + "eval_loss": 6.1887054443359375, + "eval_runtime": 145.8431, + "eval_samples_per_second": 164.938, + "eval_steps_per_second": 5.156, "step": 850 }, { - "epoch": 4.43, - "learning_rate": 2e-05, - "loss": 7.1428, + "epoch": 17.9, + "learning_rate": 0.00037197983323053143, + "loss": 5.9348, "step": 860 }, { - "epoch": 4.48, - "learning_rate": 2e-05, - "loss": 7.1264, + "epoch": 18.12, + "learning_rate": 0.00037130084731450515, + "loss": 6.2994, "step": 870 }, { - "epoch": 4.53, - "learning_rate": 2e-05, - "loss": 7.1309, + "epoch": 18.33, + "learning_rate": 0.0003706143681444795, + "loss": 5.8969, "step": 880 }, { - "epoch": 4.59, - "learning_rate": 2e-05, - "loss": 7.1299, + "epoch": 18.53, + "learning_rate": 0.0003699204257492774, + "loss": 5.9219, "step": 890 }, { - "epoch": 4.64, - "learning_rate": 2e-05, - "loss": 7.0947, + "epoch": 18.74, + "learning_rate": 0.0003692190504841871, + "loss": 5.9046, "step": 900 }, { - "epoch": 4.64, - "eval_accuracy": 0.09090520258816484, - "eval_loss": 7.132561683654785, - "eval_runtime": 168.9513, - "eval_samples_per_second": 142.378, - "eval_steps_per_second": 2.225, + "epoch": 18.74, + "eval_accuracy": 0.1545486653674884, + "eval_loss": 6.161332130432129, + "eval_runtime": 145.9406, + "eval_samples_per_second": 164.827, + "eval_steps_per_second": 5.153, "step": 900 }, { - "epoch": 4.69, - "learning_rate": 2e-05, - "loss": 7.0953, + "epoch": 18.95, + "learning_rate": 0.00036851027302963493, + "loss": 5.9011, "step": 910 }, { - "epoch": 4.74, - "learning_rate": 2e-05, - "loss": 7.1143, + "epoch": 19.16, + "learning_rate": 0.00036779412438984294, + "loss": 6.2593, "step": 920 }, { - "epoch": 4.79, - "learning_rate": 2e-05, - "loss": 7.0972, + "epoch": 19.37, + "learning_rate": 0.0003670706358914725, + "loss": 5.8755, "step": 930 }, { - "epoch": 4.84, - "learning_rate": 2e-05, - "loss": 7.109, + "epoch": 19.58, + "learning_rate": 0.0003663398391822543, + "loss": 5.8396, "step": 940 }, { - "epoch": 4.89, - "learning_rate": 2e-05, - "loss": 7.0761, + "epoch": 19.78, + "learning_rate": 0.00036560176622960403, + "loss": 5.8368, "step": 950 }, { - "epoch": 4.89, - "eval_accuracy": 0.09188660007613132, - "eval_loss": 7.10692834854126, - "eval_runtime": 168.925, - "eval_samples_per_second": 142.4, - "eval_steps_per_second": 2.226, + "epoch": 19.78, + "eval_accuracy": 0.15570189218059025, + "eval_loss": 6.095159530639648, + "eval_runtime": 145.7599, + "eval_samples_per_second": 165.032, + "eval_steps_per_second": 5.159, "step": 950 }, { - "epoch": 4.94, - "learning_rate": 2e-05, - "loss": 7.0359, + "epoch": 19.99, + "learning_rate": 0.00036485644931922353, + "loss": 5.8184, "step": 960 }, { - "epoch": 5.0, - "learning_rate": 2e-05, - "loss": 7.0479, + "epoch": 20.21, + "learning_rate": 0.0003641039210536889, + "loss": 6.1866, "step": 970 }, { - "epoch": 5.05, - "learning_rate": 2e-05, - "loss": 7.5649, + "epoch": 20.41, + "learning_rate": 0.0003633442143510245, + "loss": 5.7848, "step": 980 }, { - "epoch": 5.1, - "learning_rate": 2e-05, - "loss": 7.0153, + "epoch": 20.62, + "learning_rate": 0.00036257736244326246, + "loss": 5.7807, "step": 990 }, { - "epoch": 5.15, - "learning_rate": 2e-05, - "loss": 7.0551, + "epoch": 20.82, + "learning_rate": 0.0003618033988749895, + "loss": 5.7914, "step": 1000 }, { - "epoch": 5.15, - "eval_accuracy": 0.0943313810914526, - "eval_loss": 7.080649375915527, - "eval_runtime": 169.1004, - "eval_samples_per_second": 142.253, - "eval_steps_per_second": 2.224, + "epoch": 20.82, + "eval_accuracy": 0.15694020859066315, + "eval_loss": 6.032991409301758, + "eval_runtime": 145.9881, + "eval_samples_per_second": 164.774, + "eval_steps_per_second": 5.151, "step": 1000 }, { - "epoch": 5.21, - "learning_rate": 2e-05, - "loss": 7.0119, + "epoch": 21.04, + "learning_rate": 0.0003610223575018795, + "loss": 6.1552, "step": 1010 }, { - "epoch": 5.26, - "learning_rate": 2e-05, - "loss": 7.0506, + "epoch": 21.25, + "learning_rate": 0.00036023427248921215, + "loss": 5.7428, "step": 1020 }, { - "epoch": 5.31, - "learning_rate": 2e-05, - "loss": 7.0266, + "epoch": 21.45, + "learning_rate": 0.0003594391783103792, + "loss": 5.7276, "step": 1030 }, { - "epoch": 5.36, - "learning_rate": 2e-05, - "loss": 7.0121, + "epoch": 21.66, + "learning_rate": 0.00035863710974537563, + "loss": 5.7289, "step": 1040 }, { - "epoch": 5.41, - "learning_rate": 2e-05, - "loss": 7.0389, + "epoch": 21.86, + "learning_rate": 0.00035782810187927875, + "loss": 5.7026, "step": 1050 }, { - "epoch": 5.41, - "eval_accuracy": 0.09524817664480986, - "eval_loss": 7.058758735656738, - "eval_runtime": 169.0231, - "eval_samples_per_second": 142.318, - "eval_steps_per_second": 2.225, + "epoch": 21.86, + "eval_accuracy": 0.16123595961673237, + "eval_loss": 5.942953109741211, + "eval_runtime": 145.9911, + "eval_samples_per_second": 164.77, + "eval_steps_per_second": 5.151, "step": 1050 }, { - "epoch": 5.46, - "learning_rate": 2e-05, - "loss": 7.002, + "epoch": 22.08, + "learning_rate": 0.0003570121901007136, + "loss": 6.0423, "step": 1060 }, { - "epoch": 5.51, - "learning_rate": 2e-05, - "loss": 7.0223, + "epoch": 22.29, + "learning_rate": 0.0003561894101003044, + "loss": 5.6495, "step": 1070 }, { - "epoch": 5.56, - "learning_rate": 2e-05, - "loss": 6.9822, + "epoch": 22.49, + "learning_rate": 0.00035535979786911396, + "loss": 5.6223, "step": 1080 }, { - "epoch": 5.62, - "learning_rate": 2e-05, - "loss": 6.9993, + "epoch": 22.7, + "learning_rate": 0.00035452338969706876, + "loss": 5.5675, "step": 1090 }, { - "epoch": 5.67, - "learning_rate": 2e-05, - "loss": 7.0226, + "epoch": 22.9, + "learning_rate": 0.00035368022217137184, + "loss": 5.491, "step": 1100 }, { - "epoch": 5.67, - "eval_accuracy": 0.09638965813145792, - "eval_loss": 7.037946701049805, - "eval_runtime": 169.0695, - "eval_samples_per_second": 142.279, - "eval_steps_per_second": 2.224, + "epoch": 22.9, + "eval_accuracy": 0.19736824293775215, + "eval_loss": 5.609994888305664, + "eval_runtime": 146.0961, + "eval_samples_per_second": 164.652, + "eval_steps_per_second": 5.147, "step": 1100 }, { - "epoch": 5.72, - "learning_rate": 2e-05, - "loss": 7.0049, + "epoch": 23.12, + "learning_rate": 0.00035283033217490227, + "loss": 5.6961, "step": 1110 }, { - "epoch": 5.77, - "learning_rate": 2e-05, - "loss": 6.9835, + "epoch": 23.33, + "learning_rate": 0.00035197375688460176, + "loss": 5.239, "step": 1120 }, { - "epoch": 5.82, - "learning_rate": 2e-05, - "loss": 6.9924, + "epoch": 23.53, + "learning_rate": 0.0003511105337698484, + "loss": 5.1252, "step": 1130 }, { - "epoch": 5.87, - "learning_rate": 2e-05, - "loss": 6.9933, + "epoch": 23.74, + "learning_rate": 0.0003502407005908177, + "loss": 5.0182, "step": 1140 }, { - "epoch": 5.92, - "learning_rate": 2e-05, - "loss": 6.9992, + "epoch": 23.95, + "learning_rate": 0.0003493642953968308, + "loss": 4.9289, "step": 1150 }, { - "epoch": 5.92, - "eval_accuracy": 0.09752125105444163, - "eval_loss": 7.01423454284668, - "eval_runtime": 169.1114, - "eval_samples_per_second": 142.244, - "eval_steps_per_second": 2.223, + "epoch": 23.95, + "eval_accuracy": 0.27019214299497635, + "eval_loss": 4.960735321044922, + "eval_runtime": 146.1516, + "eval_samples_per_second": 164.589, + "eval_steps_per_second": 5.145, "step": 1150 }, { - "epoch": 5.98, - "learning_rate": 2e-05, - "loss": 6.9667, + "epoch": 24.16, + "learning_rate": 0.00034848135652469, + "loss": 5.1346, "step": 1160 }, { - "epoch": 6.03, - "learning_rate": 2e-05, - "loss": 7.4753, + "epoch": 24.37, + "learning_rate": 0.00034759192259700196, + "loss": 4.7377, "step": 1170 }, { - "epoch": 6.08, - "learning_rate": 2e-05, - "loss": 6.9355, + "epoch": 24.58, + "learning_rate": 0.000346696032520488, + "loss": 4.6538, "step": 1180 }, { - "epoch": 6.13, - "learning_rate": 2e-05, - "loss": 6.9903, + "epoch": 24.78, + "learning_rate": 0.00034579372548428235, + "loss": 4.608, "step": 1190 }, { - "epoch": 6.18, - "learning_rate": 2e-05, - "loss": 6.9382, + "epoch": 24.99, + "learning_rate": 0.00034488504095821784, + "loss": 4.5214, "step": 1200 }, { - "epoch": 6.18, - "eval_accuracy": 0.09855111513654274, - "eval_loss": 6.997906684875488, - "eval_runtime": 169.2157, - "eval_samples_per_second": 142.156, - "eval_steps_per_second": 2.222, + "epoch": 24.99, + "eval_accuracy": 0.3050591252908655, + "eval_loss": 4.579548358917236, + "eval_runtime": 146.015, + "eval_samples_per_second": 164.743, + "eval_steps_per_second": 5.15, "step": 1200 }, { - "epoch": 6.24, - "learning_rate": 2e-05, - "loss": 6.9745, + "epoch": 25.21, + "learning_rate": 0.0003439700186910993, + "loss": 4.7508, "step": 1210 }, { - "epoch": 6.29, - "learning_rate": 2e-05, - "loss": 6.9784, + "epoch": 25.41, + "learning_rate": 0.00034304869870896513, + "loss": 4.4132, "step": 1220 }, { - "epoch": 6.34, - "learning_rate": 2e-05, - "loss": 6.9698, + "epoch": 25.62, + "learning_rate": 0.00034212112131333587, + "loss": 4.3489, "step": 1230 }, { - "epoch": 6.39, - "learning_rate": 2e-05, - "loss": 6.9203, + "epoch": 25.82, + "learning_rate": 0.0003411873270794518, + "loss": 4.3454, "step": 1240 }, { - "epoch": 6.44, - "learning_rate": 2e-05, - "loss": 6.956, + "epoch": 26.04, + "learning_rate": 0.00034024735685449773, + "loss": 4.5663, "step": 1250 }, { - "epoch": 6.44, - "eval_accuracy": 0.09872446648175658, - "eval_loss": 6.982813358306885, - "eval_runtime": 168.9414, - "eval_samples_per_second": 142.387, - "eval_steps_per_second": 2.226, + "epoch": 26.04, + "eval_accuracy": 0.32645309469898054, + "eval_loss": 4.345365047454834, + "eval_runtime": 146.0915, + "eval_samples_per_second": 164.657, + "eval_steps_per_second": 5.147, "step": 1250 }, { - "epoch": 6.49, - "learning_rate": 2e-05, - "loss": 6.9476, + "epoch": 26.25, + "learning_rate": 0.00033930125175581647, + "loss": 4.2188, "step": 1260 }, { - "epoch": 6.54, - "learning_rate": 2e-05, - "loss": 6.9329, + "epoch": 26.45, + "learning_rate": 0.0003383490531691099, + "loss": 4.1928, "step": 1270 }, { - "epoch": 6.6, - "learning_rate": 2e-05, - "loss": 6.9346, + "epoch": 26.66, + "learning_rate": 0.0003373908027466289, + "loss": 4.1575, "step": 1280 }, { - "epoch": 6.65, - "learning_rate": 2e-05, - "loss": 6.9047, + "epoch": 26.86, + "learning_rate": 0.00033642654240535134, + "loss": 4.1106, "step": 1290 }, { - "epoch": 6.7, - "learning_rate": 2e-05, - "loss": 6.9425, + "epoch": 27.08, + "learning_rate": 0.00033545631432514825, + "loss": 4.3717, "step": 1300 }, { - "epoch": 6.7, - "eval_accuracy": 0.1007779473820151, - "eval_loss": 6.961859703063965, - "eval_runtime": 169.104, - "eval_samples_per_second": 142.25, - "eval_steps_per_second": 2.223, + "epoch": 27.08, + "eval_accuracy": 0.3412254938630985, + "eval_loss": 4.1738104820251465, + "eval_runtime": 145.9707, + "eval_samples_per_second": 164.793, + "eval_steps_per_second": 5.152, "step": 1300 }, { - "epoch": 6.75, - "learning_rate": 2e-05, - "loss": 6.9086, + "epoch": 27.29, + "learning_rate": 0.00033448016094693895, + "loss": 4.007, "step": 1310 }, { - "epoch": 6.8, - "learning_rate": 2e-05, - "loss": 6.9056, + "epoch": 27.49, + "learning_rate": 0.0003334981249708345, + "loss": 4.003, "step": 1320 }, { - "epoch": 6.85, - "learning_rate": 2e-05, - "loss": 6.902, + "epoch": 27.7, + "learning_rate": 0.00033251024935427, + "loss": 3.9491, "step": 1330 }, { - "epoch": 6.9, - "learning_rate": 2e-05, - "loss": 6.9013, + "epoch": 27.9, + "learning_rate": 0.0003315165773101249, + "loss": 3.9411, "step": 1340 }, { - "epoch": 6.96, - "learning_rate": 2e-05, - "loss": 6.8872, + "epoch": 28.12, + "learning_rate": 0.00033051715230483374, + "loss": 4.1483, "step": 1350 }, { - "epoch": 6.96, - "eval_accuracy": 0.10138427920145159, - "eval_loss": 6.946805477142334, - "eval_runtime": 169.0, - "eval_samples_per_second": 142.337, - "eval_steps_per_second": 2.225, + "epoch": 28.12, + "eval_accuracy": 0.35552299245507185, + "eval_loss": 4.033575534820557, + "eval_runtime": 145.9738, + "eval_samples_per_second": 164.79, + "eval_steps_per_second": 5.152, "step": 1350 }, { - "epoch": 7.01, - "learning_rate": 2e-05, - "loss": 7.4327, + "epoch": 28.33, + "learning_rate": 0.0003295120180564838, + "loss": 3.8395, "step": 1360 }, { - "epoch": 7.06, - "learning_rate": 2e-05, - "loss": 6.8642, + "epoch": 28.53, + "learning_rate": 0.00032850121853290334, + "loss": 3.8271, "step": 1370 }, { - "epoch": 7.11, - "learning_rate": 2e-05, - "loss": 6.9062, + "epoch": 28.74, + "learning_rate": 0.000327484797949738, + "loss": 3.8272, "step": 1380 }, { - "epoch": 7.16, - "learning_rate": 2e-05, - "loss": 6.9209, + "epoch": 28.95, + "learning_rate": 0.00032646280076851684, + "loss": 3.7855, "step": 1390 }, { - "epoch": 7.22, - "learning_rate": 2e-05, - "loss": 6.8848, + "epoch": 29.16, + "learning_rate": 0.0003254352716947074, + "loss": 3.9988, "step": 1400 }, { - "epoch": 7.22, - "eval_accuracy": 0.1024037766438749, - "eval_loss": 6.931956768035889, - "eval_runtime": 169.0148, - "eval_samples_per_second": 142.325, - "eval_steps_per_second": 2.225, + "epoch": 29.16, + "eval_accuracy": 0.3677331361148426, + "eval_loss": 3.91800594329834, + "eval_runtime": 146.056, + "eval_samples_per_second": 164.697, + "eval_steps_per_second": 5.149, "step": 1400 }, { - "epoch": 7.27, - "learning_rate": 2e-05, - "loss": 6.9138, + "epoch": 29.37, + "learning_rate": 0.0003244022556757602, + "loss": 3.7379, "step": 1410 }, { - "epoch": 7.32, - "learning_rate": 2e-05, - "loss": 6.8736, + "epoch": 29.58, + "learning_rate": 0.0003233637978991422, + "loss": 3.6974, "step": 1420 }, { - "epoch": 7.37, - "learning_rate": 2e-05, - "loss": 6.9051, + "epoch": 29.78, + "learning_rate": 0.00032231994379036086, + "loss": 3.6966, "step": 1430 }, { - "epoch": 7.42, - "learning_rate": 2e-05, - "loss": 6.8833, + "epoch": 29.99, + "learning_rate": 0.0003212707390109765, + "loss": 3.6594, "step": 1440 }, { - "epoch": 7.47, - "learning_rate": 2e-05, - "loss": 6.8578, + "epoch": 30.21, + "learning_rate": 0.00032021622945660504, + "loss": 3.8695, "step": 1450 }, { - "epoch": 7.47, - "eval_accuracy": 0.10391318947463475, - "eval_loss": 6.9190216064453125, - "eval_runtime": 168.9483, - "eval_samples_per_second": 142.381, - "eval_steps_per_second": 2.226, + "epoch": 30.21, + "eval_accuracy": 0.37818666192863265, + "eval_loss": 3.81080961227417, + "eval_runtime": 146.0723, + "eval_samples_per_second": 164.679, + "eval_steps_per_second": 5.148, "step": 1450 }, { - "epoch": 7.52, - "learning_rate": 2e-05, - "loss": 6.8757, + "epoch": 30.41, + "learning_rate": 0.0003191564612549106, + "loss": 3.598, "step": 1460 }, { - "epoch": 7.58, - "learning_rate": 2e-05, - "loss": 6.8638, + "epoch": 30.62, + "learning_rate": 0.0003180914807635874, + "loss": 3.5942, "step": 1470 }, { - "epoch": 7.63, - "learning_rate": 2e-05, - "loss": 6.8717, + "epoch": 30.82, + "learning_rate": 0.00031702133456833236, + "loss": 3.585, "step": 1480 }, { - "epoch": 7.68, - "learning_rate": 2e-05, - "loss": 6.8967, + "epoch": 31.04, + "learning_rate": 0.00031594606948080663, + "loss": 3.7908, "step": 1490 }, { - "epoch": 7.73, - "learning_rate": 2e-05, - "loss": 6.8699, + "epoch": 31.25, + "learning_rate": 0.00031486573253658874, + "loss": 3.5017, "step": 1500 }, { - "epoch": 7.73, - "eval_accuracy": 0.10498492735216337, - "eval_loss": 6.902231216430664, - "eval_runtime": 168.9322, - "eval_samples_per_second": 142.394, - "eval_steps_per_second": 2.226, + "epoch": 31.25, + "eval_accuracy": 0.3878577124364749, + "eval_loss": 3.7240185737609863, + "eval_runtime": 145.8744, + "eval_samples_per_second": 164.902, + "eval_steps_per_second": 5.155, "step": 1500 }, { - "epoch": 7.78, - "learning_rate": 2e-05, - "loss": 6.867, + "epoch": 31.45, + "learning_rate": 0.00031378037099311627, + "loss": 3.5206, "step": 1510 }, { - "epoch": 7.83, - "learning_rate": 2e-05, - "loss": 6.8533, + "epoch": 31.66, + "learning_rate": 0.00031269003232761933, + "loss": 3.5049, "step": 1520 }, { - "epoch": 7.88, - "learning_rate": 2e-05, - "loss": 6.8669, + "epoch": 31.86, + "learning_rate": 0.0003115947642350433, + "loss": 3.4852, "step": 1530 }, { - "epoch": 7.93, - "learning_rate": 2e-05, - "loss": 6.862, + "epoch": 32.08, + "learning_rate": 0.00031049461462596267, + "loss": 3.6894, "step": 1540 }, { - "epoch": 7.99, - "learning_rate": 2e-05, - "loss": 6.8402, + "epoch": 32.29, + "learning_rate": 0.00030938963162448544, + "loss": 3.4311, "step": 1550 }, { - "epoch": 7.99, - "eval_accuracy": 0.10569349761970831, - "eval_loss": 6.8910064697265625, - "eval_runtime": 168.8729, - "eval_samples_per_second": 142.444, - "eval_steps_per_second": 2.227, + "epoch": 32.29, + "eval_accuracy": 0.3973612765821424, + "eval_loss": 3.6425790786743164, + "eval_runtime": 146.1194, + "eval_samples_per_second": 164.626, + "eval_steps_per_second": 5.146, "step": 1550 }, { - "epoch": 8.04, - "learning_rate": 2e-05, - "loss": 7.3336, + "epoch": 32.49, + "learning_rate": 0.0003082798635661476, + "loss": 3.4258, "step": 1560 }, { - "epoch": 8.09, - "learning_rate": 2e-05, - "loss": 6.8451, + "epoch": 32.7, + "learning_rate": 0.0003071653589957993, + "loss": 3.4076, "step": 1570 }, { - "epoch": 8.14, - "learning_rate": 2e-05, - "loss": 6.8418, + "epoch": 32.9, + "learning_rate": 0.000306046166665481, + "loss": 3.4117, "step": 1580 }, { - "epoch": 8.2, - "learning_rate": 2e-05, - "loss": 6.8648, + "epoch": 33.12, + "learning_rate": 0.00030492233553229076, + "loss": 3.5985, "step": 1590 }, { - "epoch": 8.25, - "learning_rate": 2e-05, - "loss": 6.8172, + "epoch": 33.33, + "learning_rate": 0.00030379391475624304, + "loss": 3.3517, "step": 1600 }, { - "epoch": 8.25, - "eval_accuracy": 0.1068832420259414, - "eval_loss": 6.8729939460754395, - "eval_runtime": 168.964, - "eval_samples_per_second": 142.368, - "eval_steps_per_second": 2.225, + "epoch": 33.33, + "eval_accuracy": 0.40682330082568596, + "eval_loss": 3.5615479946136475, + "eval_runtime": 146.0666, + "eval_samples_per_second": 164.685, + "eval_steps_per_second": 5.148, "step": 1600 }, { - "epoch": 8.3, - "learning_rate": 2e-05, - "loss": 6.832, + "epoch": 33.53, + "learning_rate": 0.0003026609536981183, + "loss": 3.3431, "step": 1610 }, { - "epoch": 8.35, - "learning_rate": 2e-05, - "loss": 6.8032, + "epoch": 33.74, + "learning_rate": 0.0003015235019173034, + "loss": 3.3546, "step": 1620 }, { - "epoch": 8.4, - "learning_rate": 2e-05, - "loss": 6.8527, + "epoch": 33.95, + "learning_rate": 0.00030038160916962404, + "loss": 3.3378, "step": 1630 }, { - "epoch": 8.45, - "learning_rate": 2e-05, - "loss": 6.8175, + "epoch": 34.16, + "learning_rate": 0.00029923532540516843, + "loss": 3.5305, "step": 1640 }, { - "epoch": 8.5, - "learning_rate": 2e-05, - "loss": 6.823, + "epoch": 34.37, + "learning_rate": 0.00029808470076610167, + "loss": 3.2856, "step": 1650 }, { - "epoch": 8.5, - "eval_accuracy": 0.10725955083624485, - "eval_loss": 6.86623477935791, - "eval_runtime": 168.9967, - "eval_samples_per_second": 142.34, - "eval_steps_per_second": 2.225, + "epoch": 34.37, + "eval_accuracy": 0.41555171151451314, + "eval_loss": 3.4914703369140625, + "eval_runtime": 146.1721, + "eval_samples_per_second": 164.566, + "eval_steps_per_second": 5.145, "step": 1650 }, { - "epoch": 8.55, - "learning_rate": 2e-05, - "loss": 6.8428, + "epoch": 34.58, + "learning_rate": 0.00029692978558447305, + "loss": 3.273, "step": 1660 }, { - "epoch": 8.61, - "learning_rate": 2e-05, - "loss": 6.8136, + "epoch": 34.78, + "learning_rate": 0.0002957706303800139, + "loss": 3.278, "step": 1670 }, { - "epoch": 8.66, - "learning_rate": 2e-05, - "loss": 6.828, + "epoch": 34.99, + "learning_rate": 0.0002946072858579282, + "loss": 3.2614, "step": 1680 }, { - "epoch": 8.71, - "learning_rate": 2e-05, - "loss": 6.8215, + "epoch": 35.21, + "learning_rate": 0.0002934398029066739, + "loss": 3.4456, "step": 1690 }, { - "epoch": 8.76, - "learning_rate": 2e-05, - "loss": 6.8028, + "epoch": 35.41, + "learning_rate": 0.0002922682325957376, + "loss": 3.227, "step": 1700 }, { - "epoch": 8.76, - "eval_accuracy": 0.1081789668246189, - "eval_loss": 6.848670482635498, - "eval_runtime": 168.985, - "eval_samples_per_second": 142.35, - "eval_steps_per_second": 2.225, + "epoch": 35.41, + "eval_accuracy": 0.4255488250901363, + "eval_loss": 3.41792893409729, + "eval_runtime": 146.0068, + "eval_samples_per_second": 164.753, + "eval_steps_per_second": 5.15, "step": 1700 }, { - "epoch": 8.81, - "learning_rate": 2e-05, - "loss": 6.8065, + "epoch": 35.62, + "learning_rate": 0.00029109262617339987, + "loss": 3.1995, "step": 1710 }, { - "epoch": 8.86, - "learning_rate": 2e-05, - "loss": 6.8185, + "epoch": 35.82, + "learning_rate": 0.0002899130350644941, + "loss": 3.2058, "step": 1720 }, { - "epoch": 8.91, - "learning_rate": 2e-05, - "loss": 6.8206, + "epoch": 36.04, + "learning_rate": 0.00028872951086815685, + "loss": 3.4183, "step": 1730 }, { - "epoch": 8.97, - "learning_rate": 2e-05, - "loss": 6.8051, + "epoch": 36.25, + "learning_rate": 0.00028754210535557036, + "loss": 3.1514, "step": 1740 }, { - "epoch": 9.02, - "learning_rate": 2e-05, - "loss": 7.3146, + "epoch": 36.45, + "learning_rate": 0.00028635087046769857, + "loss": 3.1675, "step": 1750 }, { - "epoch": 9.02, - "eval_accuracy": 0.10825639090130809, - "eval_loss": 6.8400139808654785, - "eval_runtime": 168.9057, - "eval_samples_per_second": 142.417, - "eval_steps_per_second": 2.226, + "epoch": 36.45, + "eval_accuracy": 0.43245804160401624, + "eval_loss": 3.3635590076446533, + "eval_runtime": 146.1639, + "eval_samples_per_second": 164.575, + "eval_steps_per_second": 5.145, "step": 1750 }, { - "epoch": 9.07, - "learning_rate": 2e-05, - "loss": 6.8451, + "epoch": 36.66, + "learning_rate": 0.00028515585831301456, + "loss": 3.1645, "step": 1760 }, { - "epoch": 9.12, - "learning_rate": 2e-05, - "loss": 6.8243, + "epoch": 36.86, + "learning_rate": 0.0002839571211652212, + "loss": 3.1617, "step": 1770 }, { - "epoch": 9.17, - "learning_rate": 2e-05, - "loss": 6.8415, + "epoch": 37.08, + "learning_rate": 0.00028275471146096466, + "loss": 3.3333, "step": 1780 }, { - "epoch": 9.23, - "learning_rate": 2e-05, - "loss": 6.7823, + "epoch": 37.29, + "learning_rate": 0.00028154868179754074, + "loss": 3.1167, "step": 1790 }, { - "epoch": 9.28, - "learning_rate": 2e-05, - "loss": 6.8014, + "epoch": 37.49, + "learning_rate": 0.0002803390849305939, + "loss": 3.0908, "step": 1800 }, { - "epoch": 9.28, - "eval_accuracy": 0.10921870585675643, - "eval_loss": 6.8303141593933105, - "eval_runtime": 169.0668, - "eval_samples_per_second": 142.281, - "eval_steps_per_second": 2.224, + "epoch": 37.49, + "eval_accuracy": 0.43940471782078516, + "eval_loss": 3.30828595161438, + "eval_runtime": 146.1043, + "eval_samples_per_second": 164.643, + "eval_steps_per_second": 5.147, "step": 1800 }, { - "epoch": 9.33, - "learning_rate": 2e-05, - "loss": 6.7885, + "epoch": 37.7, + "learning_rate": 0.0002791259737718097, + "loss": 3.1214, "step": 1810 }, { - "epoch": 9.38, - "learning_rate": 2e-05, - "loss": 6.7663, + "epoch": 37.9, + "learning_rate": 0.0002779094013866001, + "loss": 3.0987, "step": 1820 }, { - "epoch": 9.43, - "learning_rate": 2e-05, - "loss": 6.7476, + "epoch": 38.12, + "learning_rate": 0.00027668942099178234, + "loss": 3.2767, "step": 1830 }, { - "epoch": 9.48, - "learning_rate": 2e-05, - "loss": 6.8016, + "epoch": 38.33, + "learning_rate": 0.00027546608595325117, + "loss": 3.0716, "step": 1840 }, { - "epoch": 9.53, - "learning_rate": 2e-05, - "loss": 6.8028, + "epoch": 38.53, + "learning_rate": 0.00027423944978364416, + "loss": 3.0561, "step": 1850 }, { - "epoch": 9.53, - "eval_accuracy": 0.108833053639469, - "eval_loss": 6.822628021240234, - "eval_runtime": 169.002, - "eval_samples_per_second": 142.336, - "eval_steps_per_second": 2.225, + "epoch": 38.53, + "eval_accuracy": 0.44727625227121054, + "eval_loss": 3.25723934173584, + "eval_runtime": 145.9616, + "eval_samples_per_second": 164.804, + "eval_steps_per_second": 5.152, "step": 1850 }, { - "epoch": 9.59, - "learning_rate": 2e-05, - "loss": 6.7865, + "epoch": 38.74, + "learning_rate": 0.00027300956614000115, + "loss": 3.0564, "step": 1860 }, { - "epoch": 9.64, - "learning_rate": 2e-05, - "loss": 6.7951, + "epoch": 38.95, + "learning_rate": 0.00027177648882141704, + "loss": 3.0583, "step": 1870 }, { - "epoch": 9.69, - "learning_rate": 2e-05, - "loss": 6.7739, + "epoch": 39.16, + "learning_rate": 0.0002705402717666883, + "loss": 3.2319, "step": 1880 }, { - "epoch": 9.74, - "learning_rate": 2e-05, - "loss": 6.7718, + "epoch": 39.37, + "learning_rate": 0.00026930096905195363, + "loss": 3.0204, "step": 1890 }, { - "epoch": 9.79, - "learning_rate": 2e-05, - "loss": 6.7817, + "epoch": 39.58, + "learning_rate": 0.00026805863488832865, + "loss": 3.0139, "step": 1900 }, { - "epoch": 9.79, - "eval_accuracy": 0.11072009541672845, - "eval_loss": 6.807923316955566, - "eval_runtime": 168.9193, - "eval_samples_per_second": 142.405, - "eval_steps_per_second": 2.226, + "epoch": 39.58, + "eval_accuracy": 0.4525324485267982, + "eval_loss": 3.215851306915283, + "eval_runtime": 146.1327, + "eval_samples_per_second": 164.611, + "eval_steps_per_second": 5.146, "step": 1900 }, { - "epoch": 9.84, - "learning_rate": 2e-05, - "loss": 6.7725, + "epoch": 39.78, + "learning_rate": 0.00026681332361953424, + "loss": 3.0053, "step": 1910 }, { - "epoch": 9.89, - "learning_rate": 2e-05, - "loss": 6.7347, + "epoch": 39.99, + "learning_rate": 0.0002655650897195195, + "loss": 3.0171, "step": 1920 }, { - "epoch": 9.94, - "learning_rate": 2e-05, - "loss": 6.7559, + "epoch": 40.21, + "learning_rate": 0.0002643139877900791, + "loss": 3.1749, "step": 1930 }, { - "epoch": 10.0, - "learning_rate": 2e-05, - "loss": 6.7409, + "epoch": 40.41, + "learning_rate": 0.00026306007255846436, + "loss": 2.9764, "step": 1940 }, { - "epoch": 10.05, - "learning_rate": 2e-05, - "loss": 7.28, + "epoch": 40.62, + "learning_rate": 0.00026180339887498953, + "loss": 2.9837, "step": 1950 }, { - "epoch": 10.05, - "eval_accuracy": 0.11147970049664972, - "eval_loss": 6.802090167999268, - "eval_runtime": 169.0764, - "eval_samples_per_second": 142.273, - "eval_steps_per_second": 2.224, + "epoch": 40.62, + "eval_accuracy": 0.45754832554207525, + "eval_loss": 3.1789309978485107, + "eval_runtime": 146.1778, + "eval_samples_per_second": 164.56, + "eval_steps_per_second": 5.144, "step": 1950 }, { - "epoch": 10.1, - "learning_rate": 2e-05, - "loss": 6.7606, + "epoch": 40.82, + "learning_rate": 0.00026054402171063267, + "loss": 2.9752, "step": 1960 }, { - "epoch": 10.15, - "learning_rate": 2e-05, - "loss": 6.7658, + "epoch": 41.04, + "learning_rate": 0.0002592819961546308, + "loss": 3.1648, "step": 1970 }, { - "epoch": 10.21, - "learning_rate": 2e-05, - "loss": 6.7526, + "epoch": 41.25, + "learning_rate": 0.00025801737741207005, + "loss": 2.9438, "step": 1980 }, { - "epoch": 10.26, - "learning_rate": 2e-05, - "loss": 6.7531, + "epoch": 41.45, + "learning_rate": 0.000256750220801471, + "loss": 2.941, "step": 1990 }, { - "epoch": 10.31, - "learning_rate": 2e-05, - "loss": 6.7624, + "epoch": 41.66, + "learning_rate": 0.0002554805817523689, + "loss": 2.9387, "step": 2000 }, { - "epoch": 10.31, - "eval_accuracy": 0.11175787497505224, - "eval_loss": 6.793049335479736, - "eval_runtime": 169.0207, - "eval_samples_per_second": 142.32, - "eval_steps_per_second": 2.225, + "epoch": 41.66, + "eval_accuracy": 0.46179467604077673, + "eval_loss": 3.1430864334106445, + "eval_runtime": 146.0529, + "eval_samples_per_second": 164.701, + "eval_steps_per_second": 5.149, "step": 2000 }, { - "epoch": 10.36, - "learning_rate": 2e-05, - "loss": 6.7658, + "epoch": 41.86, + "learning_rate": 0.0002542085158028889, + "loss": 2.9371, "step": 2010 }, { - "epoch": 10.41, - "learning_rate": 2e-05, - "loss": 6.7513, + "epoch": 42.08, + "learning_rate": 0.00025293407859731633, + "loss": 3.1085, "step": 2020 }, { - "epoch": 10.46, - "learning_rate": 2e-05, - "loss": 6.7618, + "epoch": 42.29, + "learning_rate": 0.00025165732588366334, + "loss": 2.8999, "step": 2030 }, { - "epoch": 10.51, - "learning_rate": 2e-05, - "loss": 6.7146, + "epoch": 42.49, + "learning_rate": 0.00025037831351122967, + "loss": 2.9159, "step": 2040 }, { - "epoch": 10.56, - "learning_rate": 2e-05, - "loss": 6.7416, + "epoch": 42.7, + "learning_rate": 0.0002490970974281599, + "loss": 2.9034, "step": 2050 }, { - "epoch": 10.56, - "eval_accuracy": 0.11243676542273737, - "eval_loss": 6.786773681640625, - "eval_runtime": 168.9144, - "eval_samples_per_second": 142.409, - "eval_steps_per_second": 2.226, + "epoch": 42.7, + "eval_accuracy": 0.46535935335872575, + "eval_loss": 3.116283655166626, + "eval_runtime": 146.1195, + "eval_samples_per_second": 164.626, + "eval_steps_per_second": 5.146, "step": 2050 }, { - "epoch": 10.62, - "learning_rate": 2e-05, - "loss": 6.7629, + "epoch": 42.9, + "learning_rate": 0.00024781373367899597, + "loss": 2.8936, "step": 2060 }, { - "epoch": 10.67, - "learning_rate": 2e-05, - "loss": 6.7654, + "epoch": 43.12, + "learning_rate": 0.00024652827840222606, + "loss": 3.0697, "step": 2070 }, { - "epoch": 10.72, - "learning_rate": 2e-05, - "loss": 6.7179, + "epoch": 43.33, + "learning_rate": 0.00024524078782782807, + "loss": 2.8913, "step": 2080 }, { - "epoch": 10.77, - "learning_rate": 2e-05, - "loss": 6.7545, + "epoch": 43.53, + "learning_rate": 0.00024395131827481062, + "loss": 2.8624, "step": 2090 }, { - "epoch": 10.82, - "learning_rate": 2e-05, - "loss": 6.7288, + "epoch": 43.74, + "learning_rate": 0.0002426599261487494, + "loss": 2.8822, "step": 2100 }, { - "epoch": 10.82, - "eval_accuracy": 0.11334799297462662, - "eval_loss": 6.780516147613525, - "eval_runtime": 168.9613, - "eval_samples_per_second": 142.37, - "eval_steps_per_second": 2.225, + "epoch": 43.74, + "eval_accuracy": 0.46941429535485324, + "eval_loss": 3.0841524600982666, + "eval_runtime": 146.1268, + "eval_samples_per_second": 164.617, + "eval_steps_per_second": 5.146, "step": 2100 }, { - "epoch": 10.87, - "learning_rate": 2e-05, - "loss": 6.751, + "epoch": 43.95, + "learning_rate": 0.00024136666793931935, + "loss": 2.8655, "step": 2110 }, { - "epoch": 10.92, - "learning_rate": 2e-05, - "loss": 6.7746, + "epoch": 44.16, + "learning_rate": 0.00024007160021782427, + "loss": 3.0323, "step": 2120 }, { - "epoch": 10.98, - "learning_rate": 2e-05, - "loss": 6.7309, + "epoch": 44.37, + "learning_rate": 0.0002387747796347217, + "loss": 2.8446, "step": 2130 }, { - "epoch": 11.03, - "learning_rate": 2e-05, - "loss": 7.2015, + "epoch": 44.58, + "learning_rate": 0.00023747626291714498, + "loss": 2.8433, "step": 2140 }, { - "epoch": 11.08, - "learning_rate": 2e-05, - "loss": 6.7468, + "epoch": 44.78, + "learning_rate": 0.000236176106866422, + "loss": 2.836, "step": 2150 }, { - "epoch": 11.08, - "eval_accuracy": 0.11234059549981652, - "eval_loss": 6.772030353546143, - "eval_runtime": 169.0603, - "eval_samples_per_second": 142.287, - "eval_steps_per_second": 2.224, + "epoch": 44.78, + "eval_accuracy": 0.47268071006532664, + "eval_loss": 3.0583226680755615, + "eval_runtime": 145.9982, + "eval_samples_per_second": 164.762, + "eval_steps_per_second": 5.151, "step": 2150 }, { - "epoch": 11.13, - "learning_rate": 2e-05, - "loss": 6.6947, + "epoch": 44.99, + "learning_rate": 0.00023487436835559035, + "loss": 2.8457, "step": 2160 }, { - "epoch": 11.18, - "learning_rate": 2e-05, - "loss": 6.7154, + "epoch": 45.21, + "learning_rate": 0.00023357110432690954, + "loss": 2.9941, "step": 2170 }, { - "epoch": 11.24, - "learning_rate": 2e-05, - "loss": 6.6892, + "epoch": 45.41, + "learning_rate": 0.00023226637178937022, + "loss": 2.8208, "step": 2180 }, { - "epoch": 11.29, - "learning_rate": 2e-05, - "loss": 6.7256, + "epoch": 45.62, + "learning_rate": 0.00023096022781620034, + "loss": 2.8154, "step": 2190 }, { - "epoch": 11.34, - "learning_rate": 2e-05, - "loss": 6.7387, + "epoch": 45.82, + "learning_rate": 0.0002296527295423684, + "loss": 2.8129, "step": 2200 }, { - "epoch": 11.34, - "eval_accuracy": 0.11353738741762727, - "eval_loss": 6.7636189460754395, - "eval_runtime": 168.99, - "eval_samples_per_second": 142.346, - "eval_steps_per_second": 2.225, + "epoch": 45.82, + "eval_accuracy": 0.47600857452342976, + "eval_loss": 3.035902738571167, + "eval_runtime": 145.9849, + "eval_samples_per_second": 164.777, + "eval_steps_per_second": 5.151, "step": 2200 }, { - "epoch": 11.39, - "learning_rate": 2e-05, - "loss": 6.6785, + "epoch": 46.04, + "learning_rate": 0.00022834393416208486, + "loss": 2.9871, "step": 2210 }, { - "epoch": 11.44, - "learning_rate": 2e-05, - "loss": 6.6991, + "epoch": 46.25, + "learning_rate": 0.0002270338989262994, + "loss": 2.7892, "step": 2220 }, { - "epoch": 11.49, - "learning_rate": 2e-05, - "loss": 6.7317, + "epoch": 46.45, + "learning_rate": 0.00022572268114019726, + "loss": 2.7843, "step": 2230 }, { - "epoch": 11.54, - "learning_rate": 2e-05, - "loss": 6.7243, + "epoch": 46.66, + "learning_rate": 0.00022441033816069202, + "loss": 2.7867, "step": 2240 }, { - "epoch": 11.6, - "learning_rate": 2e-05, - "loss": 6.7242, + "epoch": 46.86, + "learning_rate": 0.00022309692739391727, + "loss": 2.7733, "step": 2250 }, { - "epoch": 11.6, - "eval_accuracy": 0.11342761785068739, - "eval_loss": 6.755679607391357, - "eval_runtime": 169.0043, - "eval_samples_per_second": 142.334, - "eval_steps_per_second": 2.225, + "epoch": 46.86, + "eval_accuracy": 0.47764141406488453, + "eval_loss": 3.017348051071167, + "eval_runtime": 146.0338, + "eval_samples_per_second": 164.722, + "eval_steps_per_second": 5.149, "step": 2250 }, { - "epoch": 11.65, - "learning_rate": 2e-05, - "loss": 6.7085, + "epoch": 47.08, + "learning_rate": 0.00022178250629271452, + "loss": 2.981, "step": 2260 }, { - "epoch": 11.7, - "learning_rate": 2e-05, - "loss": 6.7231, + "epoch": 47.29, + "learning_rate": 0.00022046713235412103, + "loss": 2.7598, "step": 2270 }, { - "epoch": 11.75, - "learning_rate": 2e-05, - "loss": 6.6973, + "epoch": 47.49, + "learning_rate": 0.00021915086311685404, + "loss": 2.7769, "step": 2280 }, { - "epoch": 11.8, - "learning_rate": 2e-05, - "loss": 6.6921, + "epoch": 47.7, + "learning_rate": 0.00021783375615879415, + "loss": 2.7753, "step": 2290 }, { - "epoch": 11.85, - "learning_rate": 2e-05, - "loss": 6.702, + "epoch": 47.9, + "learning_rate": 0.0002165158690944665, + "loss": 2.7589, "step": 2300 }, { - "epoch": 11.85, - "eval_accuracy": 0.11411896604340402, - "eval_loss": 6.749605178833008, - "eval_runtime": 169.2579, - "eval_samples_per_second": 142.12, - "eval_steps_per_second": 2.221, + "epoch": 47.9, + "eval_accuracy": 0.4811929413931917, + "eval_loss": 2.9977798461914062, + "eval_runtime": 146.0602, + "eval_samples_per_second": 164.692, + "eval_steps_per_second": 5.149, "step": 2300 }, { - "epoch": 11.9, - "learning_rate": 2e-05, - "loss": 6.7191, + "epoch": 48.12, + "learning_rate": 0.00021519725957252063, + "loss": 2.9409, "step": 2310 }, { - "epoch": 11.96, - "learning_rate": 2e-05, - "loss": 6.6818, + "epoch": 48.33, + "learning_rate": 0.00021387798527320882, + "loss": 2.7465, "step": 2320 }, { - "epoch": 12.01, - "learning_rate": 2e-05, - "loss": 7.236, + "epoch": 48.53, + "learning_rate": 0.0002125581039058627, + "loss": 2.7403, "step": 2330 }, { - "epoch": 12.06, - "learning_rate": 2e-05, - "loss": 6.6929, + "epoch": 48.74, + "learning_rate": 0.0002112376732063691, + "loss": 2.7284, "step": 2340 }, { - "epoch": 12.11, - "learning_rate": 2e-05, - "loss": 6.6662, + "epoch": 48.95, + "learning_rate": 0.00020991675093464448, + "loss": 2.7378, "step": 2350 }, { - "epoch": 12.11, - "eval_accuracy": 0.11496613116598788, - "eval_loss": 6.743268013000488, - "eval_runtime": 169.0391, - "eval_samples_per_second": 142.304, - "eval_steps_per_second": 2.224, + "epoch": 48.95, + "eval_accuracy": 0.4831324380858166, + "eval_loss": 2.9787769317626953, + "eval_runtime": 146.0148, + "eval_samples_per_second": 164.744, + "eval_steps_per_second": 5.15, "step": 2350 }, { - "epoch": 12.16, - "learning_rate": 2e-05, - "loss": 6.7133, + "epoch": 49.16, + "learning_rate": 0.00020859539487210813, + "loss": 2.9167, "step": 2360 }, { - "epoch": 12.22, - "learning_rate": 2e-05, - "loss": 6.7061, + "epoch": 49.37, + "learning_rate": 0.0002072736628191549, + "loss": 2.7203, "step": 2370 }, { - "epoch": 12.27, - "learning_rate": 2e-05, - "loss": 6.6505, + "epoch": 49.58, + "learning_rate": 0.0002059516125926265, + "loss": 2.7276, "step": 2380 }, { - "epoch": 12.32, - "learning_rate": 2e-05, - "loss": 6.7027, + "epoch": 49.78, + "learning_rate": 0.00020462930202328278, + "loss": 2.7001, "step": 2390 }, { - "epoch": 12.37, - "learning_rate": 2e-05, - "loss": 6.6781, + "epoch": 49.99, + "learning_rate": 0.00020330678895327174, + "loss": 2.7138, "step": 2400 }, { - "epoch": 12.37, - "eval_accuracy": 0.11483893825636936, - "eval_loss": 6.736194610595703, - "eval_runtime": 168.9398, - "eval_samples_per_second": 142.388, - "eval_steps_per_second": 2.226, + "epoch": 49.99, + "eval_accuracy": 0.4843915093446441, + "eval_loss": 2.967425584793091, + "eval_runtime": 146.0929, + "eval_samples_per_second": 164.655, + "eval_steps_per_second": 5.147, "step": 2400 }, { - "epoch": 12.42, - "learning_rate": 2e-05, - "loss": 6.7312, + "epoch": 50.21, + "learning_rate": 0.00020198413123359926, + "loss": 2.8865, "step": 2410 }, { - "epoch": 12.47, - "learning_rate": 2e-05, - "loss": 6.6969, + "epoch": 50.41, + "learning_rate": 0.00020066138672159903, + "loss": 2.698, "step": 2420 }, { - "epoch": 12.52, - "learning_rate": 2e-05, - "loss": 6.6962, + "epoch": 50.62, + "learning_rate": 0.00019933861327840098, + "loss": 2.6978, "step": 2430 }, { - "epoch": 12.58, - "learning_rate": 2e-05, - "loss": 6.729, + "epoch": 50.82, + "learning_rate": 0.00019801586876640073, + "loss": 2.704, "step": 2440 }, { - "epoch": 12.63, - "learning_rate": 2e-05, - "loss": 6.6743, + "epoch": 51.04, + "learning_rate": 0.0001966932110467283, + "loss": 2.8692, "step": 2450 }, { - "epoch": 12.63, - "eval_accuracy": 0.11607017718543403, - "eval_loss": 6.727517127990723, - "eval_runtime": 169.0176, - "eval_samples_per_second": 142.322, - "eval_steps_per_second": 2.225, + "epoch": 51.04, + "eval_accuracy": 0.4874163939573572, + "eval_loss": 2.9475862979888916, + "eval_runtime": 145.9737, + "eval_samples_per_second": 164.79, + "eval_steps_per_second": 5.152, "step": 2450 }, { - "epoch": 12.68, - "learning_rate": 2e-05, - "loss": 6.6952, + "epoch": 51.25, + "learning_rate": 0.00019537069797671724, + "loss": 2.6734, "step": 2460 }, { - "epoch": 12.73, - "learning_rate": 2e-05, - "loss": 6.6939, + "epoch": 51.45, + "learning_rate": 0.0001940483874073735, + "loss": 2.6636, "step": 2470 }, { - "epoch": 12.78, - "learning_rate": 2e-05, - "loss": 6.6848, + "epoch": 51.66, + "learning_rate": 0.00019272633718084517, + "loss": 2.6756, "step": 2480 }, { - "epoch": 12.83, - "learning_rate": 2e-05, - "loss": 6.6694, + "epoch": 51.86, + "learning_rate": 0.0001914046051278919, + "loss": 2.6808, "step": 2490 }, { - "epoch": 12.88, - "learning_rate": 2e-05, - "loss": 6.6843, + "epoch": 52.08, + "learning_rate": 0.00019008324906535554, + "loss": 2.8462, "step": 2500 }, { - "epoch": 12.88, - "eval_accuracy": 0.11647291428955145, - "eval_loss": 6.724733829498291, - "eval_runtime": 169.033, - "eval_samples_per_second": 142.31, - "eval_steps_per_second": 2.224, + "epoch": 52.08, + "eval_accuracy": 0.48931343115405407, + "eval_loss": 2.934227466583252, + "eval_runtime": 145.9977, + "eval_samples_per_second": 164.763, + "eval_steps_per_second": 5.151, "step": 2500 }, { - "epoch": 12.93, - "learning_rate": 2e-05, - "loss": 6.6913, + "epoch": 52.29, + "learning_rate": 0.0001887623267936309, + "loss": 2.6553, "step": 2510 }, { - "epoch": 12.99, - "learning_rate": 2e-05, - "loss": 6.6832, + "epoch": 52.49, + "learning_rate": 0.00018744189609413734, + "loss": 2.6559, "step": 2520 }, { - "epoch": 13.04, - "learning_rate": 2e-05, - "loss": 7.1899, + "epoch": 52.7, + "learning_rate": 0.0001861220147267912, + "loss": 2.6536, "step": 2530 }, { - "epoch": 13.09, - "learning_rate": 2e-05, - "loss": 6.683, + "epoch": 52.9, + "learning_rate": 0.0001848027404274794, + "loss": 2.6524, "step": 2540 }, { - "epoch": 13.14, - "learning_rate": 2e-05, - "loss": 6.6726, + "epoch": 53.12, + "learning_rate": 0.00018348413090553354, + "loss": 2.8312, "step": 2550 }, { - "epoch": 13.14, - "eval_accuracy": 0.11728262763830573, - "eval_loss": 6.712704658508301, - "eval_runtime": 169.1283, - "eval_samples_per_second": 142.229, - "eval_steps_per_second": 2.223, + "epoch": 53.12, + "eval_accuracy": 0.4900369570164547, + "eval_loss": 2.9268674850463867, + "eval_runtime": 146.0027, + "eval_samples_per_second": 164.757, + "eval_steps_per_second": 5.151, "step": 2550 }, { - "epoch": 13.2, - "learning_rate": 2e-05, - "loss": 6.6882, + "epoch": 53.33, + "learning_rate": 0.00018216624384120595, + "loss": 2.6306, "step": 2560 }, { - "epoch": 13.25, - "learning_rate": 2e-05, - "loss": 6.6761, + "epoch": 53.53, + "learning_rate": 0.00018084913688314597, + "loss": 2.6398, "step": 2570 }, { - "epoch": 13.3, - "learning_rate": 2e-05, - "loss": 6.6757, + "epoch": 53.74, + "learning_rate": 0.000179532867645879, + "loss": 2.6318, "step": 2580 }, { - "epoch": 13.35, - "learning_rate": 2e-05, - "loss": 6.6374, + "epoch": 53.95, + "learning_rate": 0.0001782174937072855, + "loss": 2.6358, "step": 2590 }, { - "epoch": 13.4, - "learning_rate": 2e-05, - "loss": 6.6656, + "epoch": 54.16, + "learning_rate": 0.00017690307260608278, + "loss": 2.7834, "step": 2600 }, { - "epoch": 13.4, - "eval_accuracy": 0.11703056957986009, - "eval_loss": 6.709805011749268, - "eval_runtime": 169.0648, - "eval_samples_per_second": 142.283, - "eval_steps_per_second": 2.224, + "epoch": 54.16, + "eval_accuracy": 0.4917280711401593, + "eval_loss": 2.911123037338257, + "eval_runtime": 146.0206, + "eval_samples_per_second": 164.737, + "eval_steps_per_second": 5.15, "step": 2600 }, { - "epoch": 13.45, - "learning_rate": 2e-05, - "loss": 6.6369, + "epoch": 54.37, + "learning_rate": 0.000175589661839308, + "loss": 2.6226, "step": 2610 }, { - "epoch": 13.5, - "learning_rate": 2e-05, - "loss": 6.6619, + "epoch": 54.58, + "learning_rate": 0.00017427731885980282, + "loss": 2.6183, "step": 2620 }, { - "epoch": 13.55, - "learning_rate": 2e-05, - "loss": 6.6576, + "epoch": 54.78, + "learning_rate": 0.0001729661010737007, + "loss": 2.6313, "step": 2630 }, { - "epoch": 13.61, - "learning_rate": 2e-05, - "loss": 6.6737, + "epoch": 54.99, + "learning_rate": 0.00017165606583791515, + "loss": 2.6366, "step": 2640 }, { - "epoch": 13.66, - "learning_rate": 2e-05, - "loss": 6.6428, + "epoch": 55.21, + "learning_rate": 0.00017034727045763158, + "loss": 2.7822, "step": 2650 }, { - "epoch": 13.66, - "eval_accuracy": 0.11852539903484209, - "eval_loss": 6.701896667480469, - "eval_runtime": 169.0251, - "eval_samples_per_second": 142.316, - "eval_steps_per_second": 2.225, + "epoch": 55.21, + "eval_accuracy": 0.4934482911572486, + "eval_loss": 2.8986542224884033, + "eval_runtime": 146.1152, + "eval_samples_per_second": 164.63, + "eval_steps_per_second": 5.147, "step": 2650 }, { - "epoch": 13.71, - "learning_rate": 2e-05, - "loss": 6.6634, + "epoch": 55.41, + "learning_rate": 0.00016903977218379974, + "loss": 2.5985, "step": 2660 }, { - "epoch": 13.76, - "learning_rate": 2e-05, - "loss": 6.6575, + "epoch": 55.62, + "learning_rate": 0.00016773362821062983, + "loss": 2.6059, "step": 2670 }, { - "epoch": 13.81, - "learning_rate": 2e-05, - "loss": 6.6788, + "epoch": 55.82, + "learning_rate": 0.00016642889567309048, + "loss": 2.6083, "step": 2680 }, { - "epoch": 13.86, - "learning_rate": 2e-05, - "loss": 6.659, + "epoch": 56.04, + "learning_rate": 0.0001651256316444097, + "loss": 2.7793, "step": 2690 }, { - "epoch": 13.91, - "learning_rate": 2e-05, - "loss": 6.6355, + "epoch": 56.25, + "learning_rate": 0.0001638238931335781, + "loss": 2.584, "step": 2700 }, { - "epoch": 13.91, - "eval_accuracy": 0.11754058144545079, - "eval_loss": 6.697854995727539, - "eval_runtime": 169.1175, - "eval_samples_per_second": 142.238, - "eval_steps_per_second": 2.223, + "epoch": 56.25, + "eval_accuracy": 0.49487679829418024, + "eval_loss": 2.8844311237335205, + "eval_runtime": 145.9294, + "eval_samples_per_second": 164.84, + "eval_steps_per_second": 5.153, "step": 2700 }, { - "epoch": 13.97, - "learning_rate": 2e-05, - "loss": 6.6451, + "epoch": 56.45, + "learning_rate": 0.00016252373708285504, + "loss": 2.5884, "step": 2710 }, { - "epoch": 14.02, - "learning_rate": 2e-05, - "loss": 7.1351, + "epoch": 56.66, + "learning_rate": 0.00016122522036527838, + "loss": 2.5881, "step": 2720 }, { - "epoch": 14.07, - "learning_rate": 2e-05, - "loss": 6.6533, + "epoch": 56.86, + "learning_rate": 0.00015992839978217578, + "loss": 2.5866, "step": 2730 }, { - "epoch": 14.12, - "learning_rate": 2e-05, - "loss": 6.6421, + "epoch": 57.08, + "learning_rate": 0.00015863333206068067, + "loss": 2.7644, "step": 2740 }, { - "epoch": 14.17, - "learning_rate": 2e-05, - "loss": 6.6521, + "epoch": 57.29, + "learning_rate": 0.00015734007385125067, + "loss": 2.5668, "step": 2750 }, { - "epoch": 14.17, - "eval_accuracy": 0.11875193333953445, - "eval_loss": 6.692318439483643, - "eval_runtime": 169.1423, - "eval_samples_per_second": 142.218, - "eval_steps_per_second": 2.223, + "epoch": 57.29, + "eval_accuracy": 0.49651714759851406, + "eval_loss": 2.880821704864502, + "eval_runtime": 146.1597, + "eval_samples_per_second": 164.58, + "eval_steps_per_second": 5.145, "step": 2750 }, { - "epoch": 14.23, - "learning_rate": 2e-05, - "loss": 6.6099, + "epoch": 57.49, + "learning_rate": 0.0001560486817251894, + "loss": 2.5728, "step": 2760 }, { - "epoch": 14.28, - "learning_rate": 2e-05, - "loss": 6.6254, + "epoch": 57.7, + "learning_rate": 0.000154759212172172, + "loss": 2.5765, "step": 2770 }, { - "epoch": 14.33, - "learning_rate": 2e-05, - "loss": 6.6545, + "epoch": 57.9, + "learning_rate": 0.00015347172159777396, + "loss": 2.5794, "step": 2780 }, { - "epoch": 14.38, - "learning_rate": 2e-05, - "loss": 6.6443, + "epoch": 58.12, + "learning_rate": 0.000152186266321004, + "loss": 2.7342, "step": 2790 }, { - "epoch": 14.43, - "learning_rate": 2e-05, - "loss": 6.6735, + "epoch": 58.33, + "learning_rate": 0.0001509029025718402, + "loss": 2.5536, "step": 2800 }, { - "epoch": 14.43, - "eval_accuracy": 0.1185968174100908, - "eval_loss": 6.684227466583252, - "eval_runtime": 169.1201, - "eval_samples_per_second": 142.236, - "eval_steps_per_second": 2.223, + "epoch": 58.33, + "eval_accuracy": 0.4981620698137741, + "eval_loss": 2.864001512527466, + "eval_runtime": 146.123, + "eval_samples_per_second": 164.622, + "eval_steps_per_second": 5.146, "step": 2800 }, { - "epoch": 14.48, - "learning_rate": 2e-05, - "loss": 6.6032, + "epoch": 58.53, + "learning_rate": 0.0001496216864887704, + "loss": 2.5466, "step": 2810 }, { - "epoch": 14.53, - "learning_rate": 2e-05, - "loss": 6.6217, + "epoch": 58.74, + "learning_rate": 0.00014834267411633674, + "loss": 2.553, "step": 2820 }, { - "epoch": 14.59, - "learning_rate": 2e-05, - "loss": 6.6212, + "epoch": 58.95, + "learning_rate": 0.0001470659214026837, + "loss": 2.5623, "step": 2830 }, { - "epoch": 14.64, - "learning_rate": 2e-05, - "loss": 6.636, + "epoch": 59.16, + "learning_rate": 0.00014579148419711119, + "loss": 2.727, "step": 2840 }, { - "epoch": 14.69, - "learning_rate": 2e-05, - "loss": 6.6151, + "epoch": 59.37, + "learning_rate": 0.00014451941824763113, + "loss": 2.5403, "step": 2850 }, { - "epoch": 14.69, - "eval_accuracy": 0.11951815218710449, - "eval_loss": 6.679075241088867, - "eval_runtime": 168.9877, - "eval_samples_per_second": 142.348, - "eval_steps_per_second": 2.225, + "epoch": 59.37, + "eval_accuracy": 0.49815218958648255, + "eval_loss": 2.860569953918457, + "eval_runtime": 146.132, + "eval_samples_per_second": 164.611, + "eval_steps_per_second": 5.146, "step": 2850 }, { - "epoch": 14.74, - "learning_rate": 2e-05, - "loss": 6.6269, + "epoch": 59.58, + "learning_rate": 0.000143249779198529, + "loss": 2.5441, "step": 2860 }, { - "epoch": 14.79, - "learning_rate": 2e-05, - "loss": 6.6356, + "epoch": 59.78, + "learning_rate": 0.00014198262258793002, + "loss": 2.5541, "step": 2870 }, { - "epoch": 14.84, - "learning_rate": 2e-05, - "loss": 6.6496, + "epoch": 59.99, + "learning_rate": 0.00014071800384536927, + "loss": 2.5482, "step": 2880 }, { - "epoch": 14.89, - "learning_rate": 2e-05, - "loss": 6.6702, + "epoch": 60.21, + "learning_rate": 0.00013945597828936737, + "loss": 2.6878, "step": 2890 }, { - "epoch": 14.94, - "learning_rate": 2e-05, - "loss": 6.6248, + "epoch": 60.41, + "learning_rate": 0.00013819660112501054, + "loss": 2.5294, "step": 2900 }, { - "epoch": 14.94, - "eval_accuracy": 0.11975263870094723, - "eval_loss": 6.675192832946777, - "eval_runtime": 169.15, - "eval_samples_per_second": 142.211, - "eval_steps_per_second": 2.223, + "epoch": 60.41, + "eval_accuracy": 0.5007705653773009, + "eval_loss": 2.8440916538238525, + "eval_runtime": 146.0675, + "eval_samples_per_second": 164.684, + "eval_steps_per_second": 5.148, "step": 2900 }, { - "epoch": 15.0, - "learning_rate": 2e-05, - "loss": 6.6325, + "epoch": 60.62, + "learning_rate": 0.00013693992744153572, + "loss": 2.5448, "step": 2910 }, { - "epoch": 15.05, - "learning_rate": 2e-05, - "loss": 7.124, + "epoch": 60.82, + "learning_rate": 0.00013568601220992097, + "loss": 2.5435, "step": 2920 }, { - "epoch": 15.1, - "learning_rate": 2e-05, - "loss": 6.6072, + "epoch": 61.04, + "learning_rate": 0.00013443491028048045, + "loss": 2.71, "step": 2930 }, { - "epoch": 15.15, - "learning_rate": 2e-05, - "loss": 6.6165, + "epoch": 61.25, + "learning_rate": 0.0001331866763804658, + "loss": 2.5199, "step": 2940 }, { - "epoch": 15.21, - "learning_rate": 2e-05, - "loss": 6.6427, + "epoch": 61.45, + "learning_rate": 0.0001319413651116714, + "loss": 2.513, "step": 2950 }, { - "epoch": 15.21, - "eval_accuracy": 0.12074793730646591, - "eval_loss": 6.666472434997559, - "eval_runtime": 169.0435, - "eval_samples_per_second": 142.301, - "eval_steps_per_second": 2.224, + "epoch": 61.45, + "eval_accuracy": 0.5013016714921779, + "eval_loss": 2.840217113494873, + "eval_runtime": 146.0072, + "eval_samples_per_second": 164.752, + "eval_steps_per_second": 5.15, "step": 2950 }, { - "epoch": 15.26, - "learning_rate": 2e-05, - "loss": 6.6282, + "epoch": 61.66, + "learning_rate": 0.00013069903094804644, + "loss": 2.5158, "step": 2960 }, { - "epoch": 15.31, - "learning_rate": 2e-05, - "loss": 6.6127, + "epoch": 61.86, + "learning_rate": 0.0001294597282333118, + "loss": 2.5292, "step": 2970 }, { - "epoch": 15.36, - "learning_rate": 2e-05, - "loss": 6.6067, + "epoch": 62.08, + "learning_rate": 0.00012822351117858303, + "loss": 2.6752, "step": 2980 }, { - "epoch": 15.41, - "learning_rate": 2e-05, - "loss": 6.6559, + "epoch": 62.29, + "learning_rate": 0.0001269904338599989, + "loss": 2.5094, "step": 2990 }, { - "epoch": 15.46, - "learning_rate": 2e-05, - "loss": 6.5947, + "epoch": 62.49, + "learning_rate": 0.0001257605502163558, + "loss": 2.5105, "step": 3000 }, { - "epoch": 15.46, - "eval_accuracy": 0.12065681925283153, - "eval_loss": 6.663944721221924, - "eval_runtime": 169.0216, - "eval_samples_per_second": 142.319, - "eval_steps_per_second": 2.225, + "epoch": 62.49, + "eval_accuracy": 0.5022339398713631, + "eval_loss": 2.8315513134002686, + "eval_runtime": 146.1095, + "eval_samples_per_second": 164.637, + "eval_steps_per_second": 5.147, "step": 3000 }, { - "epoch": 15.51, - "learning_rate": 2e-05, - "loss": 6.62, + "epoch": 62.7, + "learning_rate": 0.00012453391404674885, + "loss": 2.4981, "step": 3010 }, { - "epoch": 15.56, - "learning_rate": 2e-05, - "loss": 6.6387, + "epoch": 62.9, + "learning_rate": 0.00012331057900821768, + "loss": 2.5072, "step": 3020 }, { - "epoch": 15.62, - "learning_rate": 2e-05, - "loss": 6.6129, + "epoch": 63.12, + "learning_rate": 0.0001220905986134, + "loss": 2.6561, "step": 3030 }, { - "epoch": 15.67, - "learning_rate": 2e-05, - "loss": 6.6413, + "epoch": 63.33, + "learning_rate": 0.00012087402622819039, + "loss": 2.5062, "step": 3040 }, { - "epoch": 15.72, - "learning_rate": 2e-05, - "loss": 6.6199, + "epoch": 63.53, + "learning_rate": 0.00011966091506940616, + "loss": 2.4897, "step": 3050 }, { - "epoch": 15.72, - "eval_accuracy": 0.12169938085571898, - "eval_loss": 6.659843921661377, - "eval_runtime": 169.0987, - "eval_samples_per_second": 142.254, - "eval_steps_per_second": 2.224, + "epoch": 63.53, + "eval_accuracy": 0.502685487439774, + "eval_loss": 2.823685646057129, + "eval_runtime": 146.1084, + "eval_samples_per_second": 164.638, + "eval_steps_per_second": 5.147, "step": 3050 }, { - "epoch": 15.77, - "learning_rate": 2e-05, - "loss": 6.6067, + "epoch": 63.74, + "learning_rate": 0.00011845131820245934, + "loss": 2.4945, "step": 3060 }, { - "epoch": 15.82, - "learning_rate": 2e-05, - "loss": 6.6333, + "epoch": 63.95, + "learning_rate": 0.00011724528853903536, + "loss": 2.5023, "step": 3070 }, { - "epoch": 15.87, - "learning_rate": 2e-05, - "loss": 6.619, + "epoch": 64.16, + "learning_rate": 0.00011604287883477889, + "loss": 2.637, "step": 3080 }, { - "epoch": 15.92, - "learning_rate": 2e-05, - "loss": 6.6274, + "epoch": 64.37, + "learning_rate": 0.00011484414168698547, + "loss": 2.4841, "step": 3090 }, { - "epoch": 15.98, - "learning_rate": 2e-05, - "loss": 6.6127, + "epoch": 64.58, + "learning_rate": 0.00011364912953230145, + "loss": 2.4974, "step": 3100 }, { - "epoch": 15.98, - "eval_accuracy": 0.12192011294272531, - "eval_loss": 6.659284591674805, - "eval_runtime": 169.0589, - "eval_samples_per_second": 142.288, - "eval_steps_per_second": 2.224, + "epoch": 64.58, + "eval_accuracy": 0.5039655187362361, + "eval_loss": 2.818704605102539, + "eval_runtime": 146.0534, + "eval_samples_per_second": 164.7, + "eval_steps_per_second": 5.149, "step": 3100 }, { - "epoch": 16.03, - "learning_rate": 2e-05, - "loss": 7.1321, + "epoch": 64.78, + "learning_rate": 0.00011245789464442964, + "loss": 2.496, "step": 3110 }, { - "epoch": 16.08, - "learning_rate": 2e-05, - "loss": 6.5976, + "epoch": 64.99, + "learning_rate": 0.00011127048913184326, + "loss": 2.4902, "step": 3120 }, { - "epoch": 16.13, - "learning_rate": 2e-05, - "loss": 6.6261, + "epoch": 65.21, + "learning_rate": 0.00011008696493550599, + "loss": 2.6366, "step": 3130 }, { - "epoch": 16.18, - "learning_rate": 2e-05, - "loss": 6.6105, + "epoch": 65.41, + "learning_rate": 0.00010890737382660015, + "loss": 2.4739, "step": 3140 }, { - "epoch": 16.24, - "learning_rate": 2e-05, - "loss": 6.6031, + "epoch": 65.62, + "learning_rate": 0.00010773176740426248, + "loss": 2.4799, "step": 3150 }, { - "epoch": 16.24, - "eval_accuracy": 0.12264471521223672, - "eval_loss": 6.651196002960205, - "eval_runtime": 169.0792, - "eval_samples_per_second": 142.271, - "eval_steps_per_second": 2.224, + "epoch": 65.62, + "eval_accuracy": 0.5044451239477096, + "eval_loss": 2.8128514289855957, + "eval_runtime": 146.0215, + "eval_samples_per_second": 164.736, + "eval_steps_per_second": 5.15, "step": 3150 }, { - "epoch": 16.29, - "learning_rate": 2e-05, - "loss": 6.5796, + "epoch": 65.82, + "learning_rate": 0.00010656019709332606, + "loss": 2.4707, "step": 3160 }, { - "epoch": 16.34, - "learning_rate": 2e-05, - "loss": 6.5983, + "epoch": 66.04, + "learning_rate": 0.00010539271414207186, + "loss": 2.6249, "step": 3170 }, { - "epoch": 16.39, - "learning_rate": 2e-05, - "loss": 6.5812, + "epoch": 66.25, + "learning_rate": 0.00010422936961998609, + "loss": 2.4617, "step": 3180 }, { - "epoch": 16.44, - "learning_rate": 2e-05, - "loss": 6.6298, + "epoch": 66.45, + "learning_rate": 0.00010307021441552707, + "loss": 2.4508, "step": 3190 }, { - "epoch": 16.49, - "learning_rate": 2e-05, - "loss": 6.5742, + "epoch": 66.66, + "learning_rate": 0.00010191529923389845, + "loss": 2.4741, "step": 3200 }, { - "epoch": 16.49, - "eval_accuracy": 0.12269288211432908, - "eval_loss": 6.64845609664917, - "eval_runtime": 169.2018, - "eval_samples_per_second": 142.168, - "eval_steps_per_second": 2.222, + "epoch": 66.66, + "eval_accuracy": 0.5057173793056381, + "eval_loss": 2.805563449859619, + "eval_runtime": 146.0069, + "eval_samples_per_second": 164.752, + "eval_steps_per_second": 5.15, "step": 3200 }, { - "epoch": 16.54, - "learning_rate": 2e-05, - "loss": 6.6001, + "epoch": 66.86, + "learning_rate": 0.00010076467459483155, + "loss": 2.4658, "step": 3210 }, { - "epoch": 16.6, - "learning_rate": 2e-05, - "loss": 6.6124, + "epoch": 67.08, + "learning_rate": 9.961839083037592e-05, + "loss": 2.6267, "step": 3220 }, { - "epoch": 16.65, - "learning_rate": 2e-05, - "loss": 6.6131, + "epoch": 67.29, + "learning_rate": 9.847649808269658e-05, + "loss": 2.4656, "step": 3230 }, { - "epoch": 16.7, - "learning_rate": 2e-05, - "loss": 6.6051, + "epoch": 67.49, + "learning_rate": 9.733904630188176e-05, + "loss": 2.4421, "step": 3240 }, { - "epoch": 16.75, - "learning_rate": 2e-05, - "loss": 6.621, + "epoch": 67.7, + "learning_rate": 9.620608524375703e-05, + "loss": 2.4582, "step": 3250 }, { - "epoch": 16.75, - "eval_accuracy": 0.12212792135914549, - "eval_loss": 6.647208213806152, - "eval_runtime": 169.0564, - "eval_samples_per_second": 142.29, - "eval_steps_per_second": 2.224, + "epoch": 67.7, + "eval_accuracy": 0.506052237287108, + "eval_loss": 2.80246639251709, + "eval_runtime": 145.9985, + "eval_samples_per_second": 164.762, + "eval_steps_per_second": 5.151, "step": 3250 }, { - "epoch": 16.8, - "learning_rate": 2e-05, - "loss": 6.597, + "epoch": 67.9, + "learning_rate": 9.507766446770934e-05, + "loss": 2.456, "step": 3260 }, { - "epoch": 16.85, - "learning_rate": 2e-05, - "loss": 6.6102, + "epoch": 68.12, + "learning_rate": 9.39538333345191e-05, + "loss": 2.6204, "step": 3270 }, { - "epoch": 16.9, - "learning_rate": 2e-05, - "loss": 6.5693, + "epoch": 68.33, + "learning_rate": 9.283464100420063e-05, + "loss": 2.4513, "step": 3280 }, { - "epoch": 16.96, - "learning_rate": 2e-05, - "loss": 6.5589, + "epoch": 68.53, + "learning_rate": 9.17201364338524e-05, + "loss": 2.4486, "step": 3290 }, { - "epoch": 17.01, - "learning_rate": 2e-05, - "loss": 7.0655, + "epoch": 68.74, + "learning_rate": 9.061036837551466e-05, + "loss": 2.4389, "step": 3300 }, { - "epoch": 17.01, - "eval_accuracy": 0.123154659717467, - "eval_loss": 6.636902332305908, - "eval_runtime": 169.0684, - "eval_samples_per_second": 142.28, - "eval_steps_per_second": 2.224, + "epoch": 68.74, + "eval_accuracy": 0.5075605292045352, + "eval_loss": 2.791304111480713, + "eval_runtime": 146.0353, + "eval_samples_per_second": 164.72, + "eval_steps_per_second": 5.149, "step": 3300 }, { - "epoch": 17.06, - "learning_rate": 2e-05, - "loss": 6.5935, + "epoch": 68.95, + "learning_rate": 8.950538537403736e-05, + "loss": 2.4384, "step": 3310 }, { - "epoch": 17.11, - "learning_rate": 2e-05, - "loss": 6.5824, + "epoch": 69.16, + "learning_rate": 8.840523576495681e-05, + "loss": 2.5977, "step": 3320 }, { - "epoch": 17.16, - "learning_rate": 2e-05, - "loss": 6.5584, + "epoch": 69.37, + "learning_rate": 8.730996767238072e-05, + "loss": 2.4459, "step": 3330 }, { - "epoch": 17.22, - "learning_rate": 2e-05, - "loss": 6.5853, + "epoch": 69.58, + "learning_rate": 8.621962900688378e-05, + "loss": 2.4281, "step": 3340 }, { - "epoch": 17.27, - "learning_rate": 2e-05, - "loss": 6.5866, + "epoch": 69.78, + "learning_rate": 8.513426746341128e-05, + "loss": 2.4539, "step": 3350 }, { - "epoch": 17.27, - "eval_accuracy": 0.12344173360979259, - "eval_loss": 6.637628555297852, - "eval_runtime": 169.0659, - "eval_samples_per_second": 142.282, - "eval_steps_per_second": 2.224, + "epoch": 69.78, + "eval_accuracy": 0.5071934293322717, + "eval_loss": 2.7881319522857666, + "eval_runtime": 145.9867, + "eval_samples_per_second": 164.775, + "eval_steps_per_second": 5.151, "step": 3350 }, { - "epoch": 17.32, - "learning_rate": 2e-05, - "loss": 6.5515, + "epoch": 69.99, + "learning_rate": 8.405393051919333e-05, + "loss": 2.4298, "step": 3360 }, { - "epoch": 17.37, - "learning_rate": 2e-05, - "loss": 6.623, + "epoch": 70.21, + "learning_rate": 8.29786654316677e-05, + "loss": 2.5885, "step": 3370 }, { - "epoch": 17.42, - "learning_rate": 2e-05, - "loss": 6.5592, + "epoch": 70.41, + "learning_rate": 8.190851923641259e-05, + "loss": 2.4073, "step": 3380 }, { - "epoch": 17.47, - "learning_rate": 2e-05, - "loss": 6.5367, + "epoch": 70.62, + "learning_rate": 8.084353874508947e-05, + "loss": 2.4379, "step": 3390 }, { - "epoch": 17.52, - "learning_rate": 2e-05, - "loss": 6.6098, + "epoch": 70.82, + "learning_rate": 7.978377054339499e-05, + "loss": 2.4252, "step": 3400 }, { - "epoch": 17.52, - "eval_accuracy": 0.12519736165745915, - "eval_loss": 6.631270885467529, - "eval_runtime": 169.1186, - "eval_samples_per_second": 142.237, - "eval_steps_per_second": 2.223, + "epoch": 70.82, + "eval_accuracy": 0.5081794918909719, + "eval_loss": 2.7884321212768555, + "eval_runtime": 146.1195, + "eval_samples_per_second": 164.626, + "eval_steps_per_second": 5.146, "step": 3400 }, { - "epoch": 17.58, - "learning_rate": 2e-05, - "loss": 6.5829, + "epoch": 71.04, + "learning_rate": 7.872926098902358e-05, + "loss": 2.5932, "step": 3410 }, { - "epoch": 17.63, - "learning_rate": 2e-05, - "loss": 6.564, + "epoch": 71.25, + "learning_rate": 7.768005620963916e-05, + "loss": 2.4153, "step": 3420 }, { - "epoch": 17.68, - "learning_rate": 2e-05, - "loss": 6.5818, + "epoch": 71.45, + "learning_rate": 7.663620210085781e-05, + "loss": 2.4195, "step": 3430 }, { - "epoch": 17.73, - "learning_rate": 2e-05, - "loss": 6.5715, + "epoch": 71.66, + "learning_rate": 7.55977443242399e-05, + "loss": 2.4231, "step": 3440 }, { - "epoch": 17.78, - "learning_rate": 2e-05, - "loss": 6.5676, + "epoch": 71.86, + "learning_rate": 7.456472830529259e-05, + "loss": 2.4287, "step": 3450 }, { - "epoch": 17.78, - "eval_accuracy": 0.124823313950876, - "eval_loss": 6.625356674194336, - "eval_runtime": 169.1641, - "eval_samples_per_second": 142.199, - "eval_steps_per_second": 2.223, + "epoch": 71.86, + "eval_accuracy": 0.5093288685486723, + "eval_loss": 2.778383493423462, + "eval_runtime": 145.9882, + "eval_samples_per_second": 164.774, + "eval_steps_per_second": 5.151, "step": 3450 }, { - "epoch": 17.83, - "learning_rate": 2e-05, - "loss": 6.5796, + "epoch": 72.08, + "learning_rate": 7.353719923148324e-05, + "loss": 2.5804, "step": 3460 }, { - "epoch": 17.88, - "learning_rate": 2e-05, - "loss": 6.595, + "epoch": 72.29, + "learning_rate": 7.251520205026205e-05, + "loss": 2.4048, "step": 3470 }, { - "epoch": 17.93, - "learning_rate": 2e-05, - "loss": 6.5946, + "epoch": 72.49, + "learning_rate": 7.149878146709676e-05, + "loss": 2.4008, "step": 3480 }, { - "epoch": 17.99, - "learning_rate": 2e-05, - "loss": 6.5585, + "epoch": 72.7, + "learning_rate": 7.048798194351625e-05, + "loss": 2.41, "step": 3490 }, { - "epoch": 18.04, - "learning_rate": 2e-05, - "loss": 7.0636, + "epoch": 72.9, + "learning_rate": 6.948284769516627e-05, + "loss": 2.4131, "step": 3500 }, { - "epoch": 18.04, - "eval_accuracy": 0.1255522859600729, - "eval_loss": 6.622625350952148, - "eval_runtime": 169.0712, - "eval_samples_per_second": 142.277, - "eval_steps_per_second": 2.224, + "epoch": 72.9, + "eval_accuracy": 0.5098879891877023, + "eval_loss": 2.7781522274017334, + "eval_runtime": 146.0156, + "eval_samples_per_second": 164.743, + "eval_steps_per_second": 5.15, "step": 3500 }, { - "epoch": 18.09, - "learning_rate": 2e-05, - "loss": 6.5962, + "epoch": 73.12, + "learning_rate": 6.848342268987511e-05, + "loss": 2.5661, "step": 3510 }, { - "epoch": 18.14, - "learning_rate": 2e-05, - "loss": 6.544, + "epoch": 73.33, + "learning_rate": 6.748975064573007e-05, + "loss": 2.3994, "step": 3520 }, { - "epoch": 18.2, - "learning_rate": 2e-05, - "loss": 6.5729, + "epoch": 73.53, + "learning_rate": 6.650187502916552e-05, + "loss": 2.4078, "step": 3530 }, { - "epoch": 18.25, - "learning_rate": 2e-05, - "loss": 6.5665, + "epoch": 73.74, + "learning_rate": 6.551983905306107e-05, + "loss": 2.4168, "step": 3540 }, { - "epoch": 18.3, - "learning_rate": 2e-05, - "loss": 6.5444, + "epoch": 73.95, + "learning_rate": 6.454368567485183e-05, + "loss": 2.4016, "step": 3550 }, { - "epoch": 18.3, - "eval_accuracy": 0.12534055374011083, - "eval_loss": 6.616397857666016, - "eval_runtime": 169.0563, - "eval_samples_per_second": 142.29, - "eval_steps_per_second": 2.224, + "epoch": 73.95, + "eval_accuracy": 0.5097699735946659, + "eval_loss": 2.772381544113159, + "eval_runtime": 146.0586, + "eval_samples_per_second": 164.694, + "eval_steps_per_second": 5.149, "step": 3550 }, { - "epoch": 18.35, - "learning_rate": 2e-05, - "loss": 6.6036, + "epoch": 74.16, + "learning_rate": 6.35734575946487e-05, + "loss": 2.5732, "step": 3560 }, { - "epoch": 18.4, - "learning_rate": 2e-05, - "loss": 6.5666, + "epoch": 74.37, + "learning_rate": 6.260919725337109e-05, + "loss": 2.3961, "step": 3570 }, { - "epoch": 18.45, - "learning_rate": 2e-05, - "loss": 6.5536, + "epoch": 74.58, + "learning_rate": 6.165094683089015e-05, + "loss": 2.4073, "step": 3580 }, { - "epoch": 18.5, - "learning_rate": 2e-05, - "loss": 6.5631, + "epoch": 74.78, + "learning_rate": 6.069874824418356e-05, + "loss": 2.3997, "step": 3590 }, { - "epoch": 18.55, - "learning_rate": 2e-05, - "loss": 6.561, + "epoch": 74.99, + "learning_rate": 5.975264314550229e-05, + "loss": 2.3998, "step": 3600 }, { - "epoch": 18.55, - "eval_accuracy": 0.12544435584844563, - "eval_loss": 6.615684986114502, - "eval_runtime": 169.0849, - "eval_samples_per_second": 142.266, - "eval_steps_per_second": 2.224, + "epoch": 74.99, + "eval_accuracy": 0.5110515365958426, + "eval_loss": 2.7658748626708984, + "eval_runtime": 146.0874, + "eval_samples_per_second": 164.662, + "eval_steps_per_second": 5.148, "step": 3600 }, { - "epoch": 18.61, - "learning_rate": 2e-05, - "loss": 6.5385, + "epoch": 75.21, + "learning_rate": 5.881267292054828e-05, + "loss": 2.5492, "step": 3610 }, { - "epoch": 18.66, - "learning_rate": 2e-05, - "loss": 6.5558, + "epoch": 75.41, + "learning_rate": 5.787887868666417e-05, + "loss": 2.3838, "step": 3620 }, { - "epoch": 18.71, - "learning_rate": 2e-05, - "loss": 6.5936, + "epoch": 75.62, + "learning_rate": 5.6951301291034945e-05, + "loss": 2.398, "step": 3630 }, { - "epoch": 18.76, - "learning_rate": 2e-05, - "loss": 6.5451, + "epoch": 75.82, + "learning_rate": 5.602998130890065e-05, + "loss": 2.4025, "step": 3640 }, { - "epoch": 18.81, - "learning_rate": 2e-05, - "loss": 6.5882, + "epoch": 76.04, + "learning_rate": 5.511495904178221e-05, + "loss": 2.5475, "step": 3650 }, { - "epoch": 18.81, - "eval_accuracy": 0.12569411736714234, - "eval_loss": 6.607241153717041, - "eval_runtime": 169.0962, - "eval_samples_per_second": 142.256, - "eval_steps_per_second": 2.224, + "epoch": 76.04, + "eval_accuracy": 0.510823536539714, + "eval_loss": 2.7650203704833984, + "eval_runtime": 146.0073, + "eval_samples_per_second": 164.752, + "eval_steps_per_second": 5.15, "step": 3650 }, { - "epoch": 18.86, - "learning_rate": 2e-05, - "loss": 6.5568, + "epoch": 76.25, + "learning_rate": 5.4206274515717736e-05, + "loss": 2.4011, "step": 3660 }, { - "epoch": 18.91, - "learning_rate": 2e-05, - "loss": 6.5808, + "epoch": 76.45, + "learning_rate": 5.330396747951205e-05, + "loss": 2.3818, "step": 3670 }, { - "epoch": 18.97, - "learning_rate": 2e-05, - "loss": 6.5542, + "epoch": 76.66, + "learning_rate": 5.240807740299811e-05, + "loss": 2.3911, "step": 3680 }, { - "epoch": 19.02, - "learning_rate": 2e-05, - "loss": 7.0483, + "epoch": 76.86, + "learning_rate": 5.1518643475310034e-05, + "loss": 2.389, "step": 3690 }, { - "epoch": 19.07, - "learning_rate": 2e-05, - "loss": 6.5518, + "epoch": 77.08, + "learning_rate": 5.0635704603169287e-05, + "loss": 2.5443, "step": 3700 }, { - "epoch": 19.07, - "eval_accuracy": 0.12669918361249846, - "eval_loss": 6.6064453125, - "eval_runtime": 169.0526, - "eval_samples_per_second": 142.293, - "eval_steps_per_second": 2.224, + "epoch": 77.08, + "eval_accuracy": 0.5117344133064243, + "eval_loss": 2.7620205879211426, + "eval_runtime": 146.0022, + "eval_samples_per_second": 164.758, + "eval_steps_per_second": 5.151, "step": 3700 }, { - "epoch": 19.12, - "learning_rate": 2e-05, - "loss": 6.5397, + "epoch": 77.29, + "learning_rate": 4.975929940918236e-05, + "loss": 2.38, "step": 3710 }, { - "epoch": 19.17, - "learning_rate": 2e-05, - "loss": 6.5834, + "epoch": 77.49, + "learning_rate": 4.8889466230151646e-05, + "loss": 2.3758, "step": 3720 }, { - "epoch": 19.23, - "learning_rate": 2e-05, - "loss": 6.5657, + "epoch": 77.7, + "learning_rate": 4.8026243115398314e-05, + "loss": 2.3744, "step": 3730 }, { - "epoch": 19.28, - "learning_rate": 2e-05, - "loss": 6.5657, + "epoch": 77.9, + "learning_rate": 4.7169667825097775e-05, + "loss": 2.3784, "step": 3740 }, { - "epoch": 19.33, - "learning_rate": 2e-05, - "loss": 6.5599, + "epoch": 78.12, + "learning_rate": 4.631977782862824e-05, + "loss": 2.5381, "step": 3750 }, { - "epoch": 19.33, - "eval_accuracy": 0.12710447874336364, - "eval_loss": 6.60552453994751, - "eval_runtime": 169.0746, - "eval_samples_per_second": 142.274, - "eval_steps_per_second": 2.224, + "epoch": 78.12, + "eval_accuracy": 0.5115312635222847, + "eval_loss": 2.76308274269104, + "eval_runtime": 146.1953, + "eval_samples_per_second": 164.54, + "eval_steps_per_second": 5.144, "step": 3750 }, { - "epoch": 19.38, - "learning_rate": 2e-05, - "loss": 6.5841, + "epoch": 78.33, + "learning_rate": 4.547661030293129e-05, + "loss": 2.3771, "step": 3760 }, { - "epoch": 19.43, - "learning_rate": 2e-05, - "loss": 6.5189, + "epoch": 78.53, + "learning_rate": 4.464020213088611e-05, + "loss": 2.3786, "step": 3770 }, { - "epoch": 19.48, - "learning_rate": 2e-05, - "loss": 6.551, + "epoch": 78.74, + "learning_rate": 4.381058989969564e-05, + "loss": 2.3688, "step": 3780 }, { - "epoch": 19.53, - "learning_rate": 2e-05, - "loss": 6.5466, + "epoch": 78.95, + "learning_rate": 4.298780989928646e-05, + "loss": 2.3792, "step": 3790 }, { - "epoch": 19.59, - "learning_rate": 2e-05, - "loss": 6.5407, + "epoch": 79.16, + "learning_rate": 4.217189812072131e-05, + "loss": 2.5269, "step": 3800 }, { - "epoch": 19.59, - "eval_accuracy": 0.12744788950670777, - "eval_loss": 6.598654270172119, - "eval_runtime": 169.0329, - "eval_samples_per_second": 142.31, - "eval_steps_per_second": 2.224, + "epoch": 79.16, + "eval_accuracy": 0.5122286175796967, + "eval_loss": 2.7577943801879883, + "eval_runtime": 146.122, + "eval_samples_per_second": 164.623, + "eval_steps_per_second": 5.146, "step": 3800 }, { - "epoch": 19.64, - "learning_rate": 2e-05, - "loss": 6.5842, + "epoch": 79.37, + "learning_rate": 4.136289025462443e-05, + "loss": 2.3679, "step": 3810 }, { - "epoch": 19.69, - "learning_rate": 2e-05, - "loss": 6.5077, + "epoch": 79.58, + "learning_rate": 4.0560821689620856e-05, + "loss": 2.3749, "step": 3820 }, { - "epoch": 19.74, - "learning_rate": 2e-05, - "loss": 6.5263, + "epoch": 79.78, + "learning_rate": 3.976572751078782e-05, + "loss": 2.3605, "step": 3830 }, { - "epoch": 19.79, - "learning_rate": 2e-05, - "loss": 6.5459, + "epoch": 79.99, + "learning_rate": 3.8977642498120594e-05, + "loss": 2.3747, "step": 3840 }, { - "epoch": 19.84, - "learning_rate": 2e-05, - "loss": 6.5373, + "epoch": 80.21, + "learning_rate": 3.819660112501053e-05, + "loss": 2.5288, "step": 3850 }, { - "epoch": 19.84, - "eval_accuracy": 0.12802956265143414, - "eval_loss": 6.595397472381592, - "eval_runtime": 169.0397, - "eval_samples_per_second": 142.304, - "eval_steps_per_second": 2.224, + "epoch": 80.21, + "eval_accuracy": 0.5124386898610601, + "eval_loss": 2.754046678543091, + "eval_runtime": 146.1606, + "eval_samples_per_second": 164.579, + "eval_steps_per_second": 5.145, "step": 3850 }, { - "epoch": 19.89, - "learning_rate": 2e-05, - "loss": 6.5126, + "epoch": 80.41, + "learning_rate": 3.742263755673758e-05, + "loss": 2.367, "step": 3860 }, { - "epoch": 19.94, - "learning_rate": 2e-05, - "loss": 6.5966, + "epoch": 80.62, + "learning_rate": 3.6655785648975585e-05, + "loss": 2.3667, "step": 3870 }, { - "epoch": 20.0, - "learning_rate": 2e-05, - "loss": 6.5305, + "epoch": 80.82, + "learning_rate": 3.589607894631111e-05, + "loss": 2.3717, "step": 3880 }, { - "epoch": 20.05, - "learning_rate": 2e-05, - "loss": 7.0605, + "epoch": 81.04, + "learning_rate": 3.514355068077655e-05, + "loss": 2.5195, "step": 3890 }, { - "epoch": 20.1, - "learning_rate": 2e-05, - "loss": 6.5381, + "epoch": 81.25, + "learning_rate": 3.439823377039599e-05, + "loss": 2.3669, "step": 3900 }, { - "epoch": 20.1, - "eval_accuracy": 0.1281530363855393, - "eval_loss": 6.589879989624023, - "eval_runtime": 169.1405, - "eval_samples_per_second": 142.219, - "eval_steps_per_second": 2.223, + "epoch": 81.25, + "eval_accuracy": 0.5124800918682825, + "eval_loss": 2.752890110015869, + "eval_runtime": 145.9521, + "eval_samples_per_second": 164.814, + "eval_steps_per_second": 5.152, "step": 3900 }, { - "epoch": 20.15, - "learning_rate": 2e-05, - "loss": 6.5167, + "epoch": 81.45, + "learning_rate": 3.36601608177457e-05, + "loss": 2.3595, "step": 3910 }, { - "epoch": 20.21, - "learning_rate": 2e-05, - "loss": 6.5289, + "epoch": 81.66, + "learning_rate": 3.292936410852754e-05, + "loss": 2.3727, "step": 3920 }, { - "epoch": 20.26, - "learning_rate": 2e-05, - "loss": 6.5296, + "epoch": 81.86, + "learning_rate": 3.220587561015709e-05, + "loss": 2.3707, "step": 3930 }, { - "epoch": 20.31, - "learning_rate": 2e-05, - "loss": 6.4745, + "epoch": 82.08, + "learning_rate": 3.148972697036507e-05, + "loss": 2.508, "step": 3940 }, { - "epoch": 20.36, - "learning_rate": 2e-05, - "loss": 6.5517, + "epoch": 82.29, + "learning_rate": 3.078094951581289e-05, + "loss": 2.3631, "step": 3950 }, { - "epoch": 20.36, - "eval_accuracy": 0.12833386885114662, - "eval_loss": 6.58883810043335, - "eval_runtime": 168.9823, - "eval_samples_per_second": 142.352, - "eval_steps_per_second": 2.225, + "epoch": 82.29, + "eval_accuracy": 0.5132219293707184, + "eval_loss": 2.749772071838379, + "eval_runtime": 146.0679, + "eval_samples_per_second": 164.684, + "eval_steps_per_second": 5.148, "step": 3950 }, { - "epoch": 20.41, - "learning_rate": 2e-05, - "loss": 6.5466, + "epoch": 82.49, + "learning_rate": 3.007957425072265e-05, + "loss": 2.3568, "step": 3960 }, { - "epoch": 20.46, - "learning_rate": 2e-05, - "loss": 6.5482, + "epoch": 82.7, + "learning_rate": 2.9385631855520546e-05, + "loss": 2.3679, "step": 3970 }, { - "epoch": 20.51, - "learning_rate": 2e-05, - "loss": 6.5047, + "epoch": 82.9, + "learning_rate": 2.8699152685494925e-05, + "loss": 2.3504, "step": 3980 }, { - "epoch": 20.56, - "learning_rate": 2e-05, - "loss": 6.499, + "epoch": 83.12, + "learning_rate": 2.8020166769468616e-05, + "loss": 2.5054, "step": 3990 }, { - "epoch": 20.62, - "learning_rate": 2e-05, - "loss": 6.5371, + "epoch": 83.33, + "learning_rate": 2.7348703808485223e-05, + "loss": 2.3499, "step": 4000 }, { - "epoch": 20.62, - "eval_accuracy": 0.1295166100847349, - "eval_loss": 6.585357189178467, - "eval_runtime": 169.1057, - "eval_samples_per_second": 142.248, - "eval_steps_per_second": 2.223, + "epoch": 83.33, + "eval_accuracy": 0.5135577468816207, + "eval_loss": 2.7453861236572266, + "eval_runtime": 146.0782, + "eval_samples_per_second": 164.672, + "eval_steps_per_second": 5.148, "step": 4000 }, { - "epoch": 20.67, - "learning_rate": 2e-05, - "loss": 6.5112, + "epoch": 83.53, + "learning_rate": 2.6684793174509915e-05, + "loss": 2.3478, "step": 4010 }, { - "epoch": 20.72, - "learning_rate": 2e-05, - "loss": 6.538, + "epoch": 83.74, + "learning_rate": 2.6028463909144574e-05, + "loss": 2.3686, "step": 4020 }, { - "epoch": 20.77, - "learning_rate": 2e-05, - "loss": 6.5547, + "epoch": 83.95, + "learning_rate": 2.5379744722357403e-05, + "loss": 2.3636, "step": 4030 }, { - "epoch": 20.82, - "learning_rate": 2e-05, - "loss": 6.5062, + "epoch": 84.16, + "learning_rate": 2.473866399122733e-05, + "loss": 2.5195, "step": 4040 }, { - "epoch": 20.87, - "learning_rate": 2e-05, - "loss": 6.5819, + "epoch": 84.37, + "learning_rate": 2.410524975870221e-05, + "loss": 2.3726, "step": 4050 }, { - "epoch": 20.87, - "eval_accuracy": 0.12822338785929896, - "eval_loss": 6.582521915435791, - "eval_runtime": 169.044, - "eval_samples_per_second": 142.3, - "eval_steps_per_second": 2.224, + "epoch": 84.37, + "eval_accuracy": 0.5140964497348997, + "eval_loss": 2.7446117401123047, + "eval_runtime": 146.09, + "eval_samples_per_second": 164.659, + "eval_steps_per_second": 5.148, "step": 4050 }, { - "epoch": 20.92, - "learning_rate": 2e-05, - "loss": 6.4947, + "epoch": 84.58, + "learning_rate": 2.347952973237262e-05, + "loss": 2.3504, "step": 4060 }, { - "epoch": 20.98, - "learning_rate": 2e-05, - "loss": 6.5279, + "epoch": 84.78, + "learning_rate": 2.286153128325954e-05, + "loss": 2.351, "step": 4070 }, { - "epoch": 21.03, - "learning_rate": 2e-05, - "loss": 7.0033, + "epoch": 84.99, + "learning_rate": 2.2251281444617257e-05, + "loss": 2.3506, "step": 4080 }, { - "epoch": 21.08, - "learning_rate": 2e-05, - "loss": 6.5198, + "epoch": 85.21, + "learning_rate": 2.1648806910750575e-05, + "loss": 2.5104, "step": 4090 }, { - "epoch": 21.13, - "learning_rate": 2e-05, - "loss": 6.5425, + "epoch": 85.41, + "learning_rate": 2.1054134035847307e-05, + "loss": 2.3411, "step": 4100 }, { - "epoch": 21.13, - "eval_accuracy": 0.12887030827275436, - "eval_loss": 6.579444408416748, - "eval_runtime": 169.204, - "eval_samples_per_second": 142.166, - "eval_steps_per_second": 2.222, + "epoch": 85.41, + "eval_accuracy": 0.5143741932133077, + "eval_loss": 2.740255355834961, + "eval_runtime": 146.0438, + "eval_samples_per_second": 164.711, + "eval_steps_per_second": 5.149, "step": 4100 }, { - "epoch": 21.18, - "learning_rate": 2e-05, - "loss": 6.4964, + "epoch": 85.62, + "learning_rate": 2.0467288832825583e-05, + "loss": 2.3666, "step": 4110 }, { - "epoch": 21.24, - "learning_rate": 2e-05, - "loss": 6.5506, + "epoch": 85.82, + "learning_rate": 1.9888296972195587e-05, + "loss": 2.3451, "step": 4120 }, { - "epoch": 21.29, - "learning_rate": 2e-05, - "loss": 6.5544, + "epoch": 86.04, + "learning_rate": 1.931718378093703e-05, + "loss": 2.5151, "step": 4130 }, { - "epoch": 21.34, - "learning_rate": 2e-05, - "loss": 6.5089, + "epoch": 86.25, + "learning_rate": 1.875397424139109e-05, + "loss": 2.3539, "step": 4140 }, { - "epoch": 21.39, - "learning_rate": 2e-05, - "loss": 6.5372, + "epoch": 86.45, + "learning_rate": 1.81986929901675e-05, + "loss": 2.3321, "step": 4150 }, { - "epoch": 21.39, - "eval_accuracy": 0.12997385168937536, - "eval_loss": 6.575997829437256, - "eval_runtime": 169.1292, - "eval_samples_per_second": 142.229, - "eval_steps_per_second": 2.223, + "epoch": 86.45, + "eval_accuracy": 0.5146461086764289, + "eval_loss": 2.7371606826782227, + "eval_runtime": 146.1164, + "eval_samples_per_second": 164.629, + "eval_steps_per_second": 5.147, "step": 4150 }, { - "epoch": 21.44, - "learning_rate": 2e-05, - "loss": 6.5305, + "epoch": 86.66, + "learning_rate": 1.765136431706711e-05, + "loss": 2.3573, "step": 4160 }, { - "epoch": 21.49, - "learning_rate": 2e-05, - "loss": 6.5443, + "epoch": 86.86, + "learning_rate": 1.711201216401912e-05, + "loss": 2.3422, "step": 4170 }, { - "epoch": 21.54, - "learning_rate": 2e-05, - "loss": 6.5214, + "epoch": 87.08, + "learning_rate": 1.6580660124034032e-05, + "loss": 2.5055, "step": 4180 }, { - "epoch": 21.6, - "learning_rate": 2e-05, - "loss": 6.4939, + "epoch": 87.29, + "learning_rate": 1.605733144017132e-05, + "loss": 2.3429, "step": 4190 }, { - "epoch": 21.65, - "learning_rate": 2e-05, - "loss": 6.544, + "epoch": 87.49, + "learning_rate": 1.5542049004523053e-05, + "loss": 2.3456, "step": 4200 }, { - "epoch": 21.65, - "eval_accuracy": 0.13029984560795255, - "eval_loss": 6.571804046630859, - "eval_runtime": 168.9481, - "eval_samples_per_second": 142.381, - "eval_steps_per_second": 2.226, + "epoch": 87.49, + "eval_accuracy": 0.5146212850149416, + "eval_loss": 2.7389299869537354, + "eval_runtime": 146.0012, + "eval_samples_per_second": 164.759, + "eval_steps_per_second": 5.151, "step": 4200 }, { - "epoch": 21.7, - "learning_rate": 2e-05, - "loss": 6.5006, + "epoch": 87.7, + "learning_rate": 1.503483535721224e-05, + "loss": 2.3608, "step": 4210 }, { - "epoch": 21.75, - "learning_rate": 2e-05, - "loss": 6.5257, + "epoch": 87.9, + "learning_rate": 1.4535712685406921e-05, + "loss": 2.3466, "step": 4220 }, { - "epoch": 21.8, - "learning_rate": 2e-05, - "loss": 6.5282, + "epoch": 88.12, + "learning_rate": 1.4044702822349731e-05, + "loss": 2.4892, "step": 4230 }, { - "epoch": 21.85, - "learning_rate": 2e-05, - "loss": 6.5319, + "epoch": 88.33, + "learning_rate": 1.3561827246402692e-05, + "loss": 2.3418, "step": 4240 }, { - "epoch": 21.9, - "learning_rate": 2e-05, - "loss": 6.5129, + "epoch": 88.53, + "learning_rate": 1.3087107080107853e-05, + "loss": 2.3372, "step": 4250 }, { - "epoch": 21.9, - "eval_accuracy": 0.13096158217776768, - "eval_loss": 6.565962314605713, - "eval_runtime": 168.9477, - "eval_samples_per_second": 142.381, - "eval_steps_per_second": 2.226, + "epoch": 88.53, + "eval_accuracy": 0.515111201963272, + "eval_loss": 2.7384002208709717, + "eval_runtime": 145.8226, + "eval_samples_per_second": 164.961, + "eval_steps_per_second": 5.157, "step": 4250 }, { - "epoch": 21.96, - "learning_rate": 2e-05, - "loss": 6.4916, + "epoch": 88.74, + "learning_rate": 1.2620563089263093e-05, + "loss": 2.3411, "step": 4260 }, { - "epoch": 22.01, - "learning_rate": 2e-05, - "loss": 6.9742, + "epoch": 88.95, + "learning_rate": 1.2162215682014012e-05, + "loss": 2.3637, "step": 4270 }, { - "epoch": 22.06, - "learning_rate": 2e-05, - "loss": 6.4991, + "epoch": 89.16, + "learning_rate": 1.1712084907961053e-05, + "loss": 2.4971, "step": 4280 }, { - "epoch": 22.11, - "learning_rate": 2e-05, - "loss": 6.516, + "epoch": 89.37, + "learning_rate": 1.127019045728246e-05, + "loss": 2.3476, "step": 4290 }, { - "epoch": 22.16, - "learning_rate": 2e-05, - "loss": 6.4798, + "epoch": 89.58, + "learning_rate": 1.0836551659873074e-05, + "loss": 2.343, "step": 4300 }, { - "epoch": 22.16, - "eval_accuracy": 0.1304970998721391, - "eval_loss": 6.56817102432251, - "eval_runtime": 168.9692, - "eval_samples_per_second": 142.363, - "eval_steps_per_second": 2.225, + "epoch": 89.58, + "eval_accuracy": 0.5144067649722459, + "eval_loss": 2.7397918701171875, + "eval_runtime": 146.0005, + "eval_samples_per_second": 164.76, + "eval_steps_per_second": 5.151, "step": 4300 }, { - "epoch": 22.22, - "learning_rate": 2e-05, - "loss": 6.5255, - "step": 4310 - }, - { - "epoch": 22.27, - "learning_rate": 2e-05, - "loss": 6.4953, - "step": 4320 - }, - { - "epoch": 22.32, - "learning_rate": 2e-05, - "loss": 6.5558, - "step": 4330 - }, - { - "epoch": 22.37, - "learning_rate": 2e-05, - "loss": 6.5029, - "step": 4340 - }, - { - "epoch": 22.42, - "learning_rate": 2e-05, - "loss": 6.5556, - "step": 4350 - }, - { - "epoch": 22.42, - "eval_accuracy": 0.13152063993455215, - "eval_loss": 6.561850547790527, - "eval_runtime": 168.9174, - "eval_samples_per_second": 142.407, - "eval_steps_per_second": 2.226, - "step": 4350 - }, - { - "epoch": 22.47, - "learning_rate": 2e-05, - "loss": 6.4888, - "step": 4360 - }, - { - "epoch": 22.52, - "learning_rate": 2e-05, - "loss": 6.4516, - "step": 4370 - }, - { - "epoch": 22.58, - "learning_rate": 2e-05, - "loss": 6.5203, - "step": 4380 - }, - { - "epoch": 22.63, - "learning_rate": 2e-05, - "loss": 6.502, - "step": 4390 - }, - { - "epoch": 22.68, - "learning_rate": 2e-05, - "loss": 6.4946, - "step": 4400 - }, - { - "epoch": 22.68, - "eval_accuracy": 0.13143446792084687, - "eval_loss": 6.558900833129883, - "eval_runtime": 168.9992, - "eval_samples_per_second": 142.338, - "eval_steps_per_second": 2.225, - "step": 4400 - }, - { - "epoch": 22.73, - "learning_rate": 2e-05, - "loss": 6.4919, - "step": 4410 - }, - { - "epoch": 22.78, - "learning_rate": 2e-05, - "loss": 6.482, - "step": 4420 - }, - { - "epoch": 22.83, - "learning_rate": 2e-05, - "loss": 6.5231, - "step": 4430 - }, - { - "epoch": 22.88, - "learning_rate": 2e-05, - "loss": 6.5293, - "step": 4440 - }, - { - "epoch": 22.93, - "learning_rate": 2e-05, - "loss": 6.5212, - "step": 4450 - }, - { - "epoch": 22.93, - "eval_accuracy": 0.13184165697753022, - "eval_loss": 6.559324264526367, - "eval_runtime": 168.9331, - "eval_samples_per_second": 142.394, - "eval_steps_per_second": 2.226, - "step": 4450 - }, - { - "epoch": 22.99, - "learning_rate": 2e-05, - "loss": 6.4944, - "step": 4460 - }, - { - "epoch": 23.04, - "learning_rate": 2e-05, - "loss": 6.9696, - "step": 4470 - }, - { - "epoch": 23.09, - "learning_rate": 2e-05, - "loss": 6.5051, - "step": 4480 - }, - { - "epoch": 23.14, - "learning_rate": 2e-05, - "loss": 6.4822, - "step": 4490 - }, - { - "epoch": 23.2, - "learning_rate": 2e-05, - "loss": 6.5055, - "step": 4500 - }, - { - "epoch": 23.2, - "eval_accuracy": 0.1310761376111694, - "eval_loss": 6.5552473068237305, - "eval_runtime": 169.0117, - "eval_samples_per_second": 142.327, - "eval_steps_per_second": 2.225, - "step": 4500 - }, - { - "epoch": 23.25, - "learning_rate": 2e-05, - "loss": 6.5152, - "step": 4510 - }, - { - "epoch": 23.3, - "learning_rate": 2e-05, - "loss": 6.504, - "step": 4520 - }, - { - "epoch": 23.35, - "learning_rate": 2e-05, - "loss": 6.5149, - "step": 4530 - }, - { - "epoch": 23.4, - "learning_rate": 2e-05, - "loss": 6.4913, - "step": 4540 - }, - { - "epoch": 23.45, - "learning_rate": 2e-05, - "loss": 6.4693, - "step": 4550 - }, - { - "epoch": 23.45, - "eval_accuracy": 0.13253956813479448, - "eval_loss": 6.548061847686768, - "eval_runtime": 169.0655, - "eval_samples_per_second": 142.282, - "eval_steps_per_second": 2.224, - "step": 4550 - }, - { - "epoch": 23.5, - "learning_rate": 2e-05, - "loss": 6.4741, - "step": 4560 - }, - { - "epoch": 23.55, - "learning_rate": 2e-05, - "loss": 6.5033, - "step": 4570 - }, - { - "epoch": 23.61, - "learning_rate": 2e-05, - "loss": 6.5313, - "step": 4580 - }, - { - "epoch": 23.66, - "learning_rate": 2e-05, - "loss": 6.4789, - "step": 4590 - }, - { - "epoch": 23.71, - "learning_rate": 2e-05, - "loss": 6.4706, - "step": 4600 - }, - { - "epoch": 23.71, - "eval_accuracy": 0.13172442724332517, - "eval_loss": 6.54693078994751, - "eval_runtime": 169.0562, - "eval_samples_per_second": 142.29, - "eval_steps_per_second": 2.224, - "step": 4600 - }, - { - "epoch": 23.76, - "learning_rate": 2e-05, - "loss": 6.5044, - "step": 4610 - }, - { - "epoch": 23.81, - "learning_rate": 2e-05, - "loss": 6.5038, - "step": 4620 - }, - { - "epoch": 23.86, - "learning_rate": 2e-05, - "loss": 6.5173, - "step": 4630 - }, - { - "epoch": 23.91, - "learning_rate": 2e-05, - "loss": 6.5048, - "step": 4640 - }, - { - "epoch": 23.97, - "learning_rate": 2e-05, - "loss": 6.495, - "step": 4650 - }, - { - "epoch": 23.97, - "eval_accuracy": 0.13237315726859447, - "eval_loss": 6.546177387237549, - "eval_runtime": 168.9825, - "eval_samples_per_second": 142.352, - "eval_steps_per_second": 2.225, - "step": 4650 - }, - { - "epoch": 24.02, - "learning_rate": 2e-05, - "loss": 6.9878, - "step": 4660 - }, - { - "epoch": 24.07, - "learning_rate": 2e-05, - "loss": 6.4801, - "step": 4670 - }, - { - "epoch": 24.12, - "learning_rate": 2e-05, - "loss": 6.5032, - "step": 4680 - }, - { - "epoch": 24.17, - "learning_rate": 2e-05, - "loss": 6.4802, - "step": 4690 - }, - { - "epoch": 24.23, - "learning_rate": 2e-05, - "loss": 6.4901, - "step": 4700 - }, - { - "epoch": 24.23, - "eval_accuracy": 0.13279457912397785, - "eval_loss": 6.541420936584473, - "eval_runtime": 168.8748, - "eval_samples_per_second": 142.443, - "eval_steps_per_second": 2.227, - "step": 4700 - }, - { - "epoch": 24.28, - "learning_rate": 2e-05, - "loss": 6.4895, - "step": 4710 - }, - { - "epoch": 24.33, - "learning_rate": 2e-05, - "loss": 6.5127, - "step": 4720 - }, - { - "epoch": 24.38, - "learning_rate": 2e-05, - "loss": 6.5062, - "step": 4730 - }, - { - "epoch": 24.43, - "learning_rate": 2e-05, - "loss": 6.4874, - "step": 4740 - }, - { - "epoch": 24.48, - "learning_rate": 2e-05, - "loss": 6.4936, - "step": 4750 - }, - { - "epoch": 24.48, - "eval_accuracy": 0.13342969669901336, - "eval_loss": 6.538527011871338, - "eval_runtime": 168.9618, - "eval_samples_per_second": 142.369, - "eval_steps_per_second": 2.225, - "step": 4750 - }, - { - "epoch": 24.53, - "learning_rate": 2e-05, - "loss": 6.4868, - "step": 4760 - }, - { - "epoch": 24.59, - "learning_rate": 2e-05, - "loss": 6.4657, - "step": 4770 - }, - { - "epoch": 24.64, - "learning_rate": 2e-05, - "loss": 6.4732, - "step": 4780 - }, - { - "epoch": 24.69, - "learning_rate": 2e-05, - "loss": 6.5089, - "step": 4790 - }, - { - "epoch": 24.74, - "learning_rate": 2e-05, - "loss": 6.481, - "step": 4800 - }, - { - "epoch": 24.74, - "eval_accuracy": 0.13314294552046066, - "eval_loss": 6.536244869232178, - "eval_runtime": 168.9165, - "eval_samples_per_second": 142.408, - "eval_steps_per_second": 2.226, - "step": 4800 - }, - { - "epoch": 24.79, - "learning_rate": 2e-05, - "loss": 6.4942, - "step": 4810 - }, - { - "epoch": 24.84, - "learning_rate": 2e-05, - "loss": 6.4739, - "step": 4820 - }, - { - "epoch": 24.89, - "learning_rate": 2e-05, - "loss": 6.4646, - "step": 4830 - }, - { - "epoch": 24.94, - "learning_rate": 2e-05, - "loss": 6.4478, - "step": 4840 - }, - { - "epoch": 25.0, - "learning_rate": 2e-05, - "loss": 6.5186, - "step": 4850 - }, - { - "epoch": 25.0, - "eval_accuracy": 0.13348454019073938, - "eval_loss": 6.535652160644531, - "eval_runtime": 168.9724, - "eval_samples_per_second": 142.361, - "eval_steps_per_second": 2.225, - "step": 4850 - }, - { - "epoch": 25.05, - "learning_rate": 2e-05, - "loss": 6.9644, - "step": 4860 - }, - { - "epoch": 25.1, - "learning_rate": 2e-05, - "loss": 6.4549, - "step": 4870 - }, - { - "epoch": 25.15, - "learning_rate": 2e-05, - "loss": 6.4725, - "step": 4880 - }, - { - "epoch": 25.21, - "learning_rate": 2e-05, - "loss": 6.467, - "step": 4890 - }, - { - "epoch": 25.26, - "learning_rate": 2e-05, - "loss": 6.4711, - "step": 4900 - }, - { - "epoch": 25.26, - "eval_accuracy": 0.1339031847051157, - "eval_loss": 6.53087854385376, - "eval_runtime": 168.9489, - "eval_samples_per_second": 142.38, - "eval_steps_per_second": 2.226, - "step": 4900 - }, - { - "epoch": 25.31, - "learning_rate": 2e-05, - "loss": 6.4475, - "step": 4910 - }, - { - "epoch": 25.36, - "learning_rate": 2e-05, - "loss": 6.4674, - "step": 4920 - }, - { - "epoch": 25.41, - "learning_rate": 2e-05, - "loss": 6.4737, - "step": 4930 - }, - { - "epoch": 25.46, - "learning_rate": 2e-05, - "loss": 6.5061, - "step": 4940 - }, - { - "epoch": 25.51, - "learning_rate": 2e-05, - "loss": 6.4513, - "step": 4950 - }, - { - "epoch": 25.51, - "eval_accuracy": 0.13372873418009035, - "eval_loss": 6.528397083282471, - "eval_runtime": 168.9948, - "eval_samples_per_second": 142.342, - "eval_steps_per_second": 2.225, - "step": 4950 - }, - { - "epoch": 25.56, - "learning_rate": 2e-05, - "loss": 6.4663, - "step": 4960 - }, - { - "epoch": 25.62, - "learning_rate": 2e-05, - "loss": 6.4488, - "step": 4970 - }, - { - "epoch": 25.67, - "learning_rate": 2e-05, - "loss": 6.4882, - "step": 4980 - }, - { - "epoch": 25.72, - "learning_rate": 2e-05, - "loss": 6.4644, - "step": 4990 - }, - { - "epoch": 25.77, - "learning_rate": 2e-05, - "loss": 6.4652, - "step": 5000 - }, - { - "epoch": 25.77, - "eval_accuracy": 0.1343316375042143, - "eval_loss": 6.524177074432373, - "eval_runtime": 168.9768, - "eval_samples_per_second": 142.357, - "eval_steps_per_second": 2.225, - "step": 5000 - }, - { - "epoch": 25.82, - "learning_rate": 2e-05, - "loss": 6.5203, - "step": 5010 - }, - { - "epoch": 25.87, - "learning_rate": 2e-05, - "loss": 6.4518, - "step": 5020 - }, - { - "epoch": 25.92, - "learning_rate": 2e-05, - "loss": 6.4791, - "step": 5030 - }, - { - "epoch": 25.98, - "learning_rate": 2e-05, - "loss": 6.4843, - "step": 5040 - }, - { - "epoch": 26.03, - "learning_rate": 2e-05, - "loss": 6.9335, - "step": 5050 - }, - { - "epoch": 26.03, - "eval_accuracy": 0.13449298626675552, - "eval_loss": 6.521671772003174, - "eval_runtime": 168.9858, - "eval_samples_per_second": 142.349, - "eval_steps_per_second": 2.225, - "step": 5050 - }, - { - "epoch": 26.08, - "learning_rate": 2e-05, - "loss": 6.4677, - "step": 5060 - }, - { - "epoch": 26.13, - "learning_rate": 2e-05, - "loss": 6.4347, - "step": 5070 - }, - { - "epoch": 26.18, - "learning_rate": 2e-05, - "loss": 6.446, - "step": 5080 - }, - { - "epoch": 26.24, - "learning_rate": 2e-05, - "loss": 6.4386, - "step": 5090 - }, - { - "epoch": 26.29, - "learning_rate": 2e-05, - "loss": 6.4747, - "step": 5100 - }, - { - "epoch": 26.29, - "eval_accuracy": 0.13449609380922367, - "eval_loss": 6.520633697509766, - "eval_runtime": 168.9534, - "eval_samples_per_second": 142.377, - "eval_steps_per_second": 2.225, - "step": 5100 - }, - { - "epoch": 26.34, - "learning_rate": 2e-05, - "loss": 6.4392, - "step": 5110 - }, - { - "epoch": 26.39, - "learning_rate": 2e-05, - "loss": 6.4715, - "step": 5120 - }, - { - "epoch": 26.44, - "learning_rate": 2e-05, - "loss": 6.4502, - "step": 5130 - }, - { - "epoch": 26.49, - "learning_rate": 2e-05, - "loss": 6.4566, - "step": 5140 - }, - { - "epoch": 26.54, - "learning_rate": 2e-05, - "loss": 6.4702, - "step": 5150 - }, - { - "epoch": 26.54, - "eval_accuracy": 0.13501667146291277, - "eval_loss": 6.52007532119751, - "eval_runtime": 168.9331, - "eval_samples_per_second": 142.394, - "eval_steps_per_second": 2.226, - "step": 5150 - }, - { - "epoch": 26.6, - "learning_rate": 2e-05, - "loss": 6.4778, - "step": 5160 - }, - { - "epoch": 26.65, - "learning_rate": 2e-05, - "loss": 6.4631, - "step": 5170 - }, - { - "epoch": 26.7, - "learning_rate": 2e-05, - "loss": 6.453, - "step": 5180 - }, - { - "epoch": 26.75, - "learning_rate": 2e-05, - "loss": 6.4477, - "step": 5190 - }, - { - "epoch": 26.8, - "learning_rate": 2e-05, - "loss": 6.4524, - "step": 5200 - }, - { - "epoch": 26.8, - "eval_accuracy": 0.13524723051481435, - "eval_loss": 6.515605926513672, - "eval_runtime": 168.9736, - "eval_samples_per_second": 142.359, - "eval_steps_per_second": 2.225, - "step": 5200 - }, - { - "epoch": 26.85, - "learning_rate": 2e-05, - "loss": 6.4479, - "step": 5210 - }, - { - "epoch": 26.9, - "learning_rate": 2e-05, - "loss": 6.4726, - "step": 5220 - }, - { - "epoch": 26.96, - "learning_rate": 2e-05, - "loss": 6.4736, - "step": 5230 - }, - { - "epoch": 27.01, - "learning_rate": 2e-05, - "loss": 6.9569, - "step": 5240 - }, - { - "epoch": 27.06, - "learning_rate": 2e-05, - "loss": 6.4225, - "step": 5250 - }, - { - "epoch": 27.06, - "eval_accuracy": 0.13492381162327038, - "eval_loss": 6.515013694763184, - "eval_runtime": 168.8677, - "eval_samples_per_second": 142.449, - "eval_steps_per_second": 2.227, - "step": 5250 - }, - { - "epoch": 27.11, - "learning_rate": 2e-05, - "loss": 6.4502, - "step": 5260 - }, - { - "epoch": 27.16, - "learning_rate": 2e-05, - "loss": 6.4587, - "step": 5270 - }, - { - "epoch": 27.22, - "learning_rate": 2e-05, - "loss": 6.4512, - "step": 5280 - }, - { - "epoch": 27.27, - "learning_rate": 2e-05, - "loss": 6.4495, - "step": 5290 - }, - { - "epoch": 27.32, - "learning_rate": 2e-05, - "loss": 6.4599, - "step": 5300 - }, - { - "epoch": 27.32, - "eval_accuracy": 0.13545740402985318, - "eval_loss": 6.511618137359619, - "eval_runtime": 168.8908, - "eval_samples_per_second": 142.429, - "eval_steps_per_second": 2.226, - "step": 5300 - }, - { - "epoch": 27.37, - "learning_rate": 2e-05, - "loss": 6.4499, - "step": 5310 - }, - { - "epoch": 27.42, - "learning_rate": 2e-05, - "loss": 6.4468, - "step": 5320 - }, - { - "epoch": 27.47, - "learning_rate": 2e-05, - "loss": 6.4448, - "step": 5330 - }, - { - "epoch": 27.52, - "learning_rate": 2e-05, - "loss": 6.4436, - "step": 5340 - }, - { - "epoch": 27.58, - "learning_rate": 2e-05, - "loss": 6.4591, - "step": 5350 - }, - { - "epoch": 27.58, - "eval_accuracy": 0.1358384989998707, - "eval_loss": 6.509791374206543, - "eval_runtime": 169.0437, - "eval_samples_per_second": 142.301, - "eval_steps_per_second": 2.224, - "step": 5350 - }, - { - "epoch": 27.63, - "learning_rate": 2e-05, - "loss": 6.4593, - "step": 5360 - }, - { - "epoch": 27.68, - "learning_rate": 2e-05, - "loss": 6.4223, - "step": 5370 - }, - { - "epoch": 27.73, - "learning_rate": 2e-05, - "loss": 6.4278, - "step": 5380 - }, - { - "epoch": 27.78, - "learning_rate": 2e-05, - "loss": 6.4607, - "step": 5390 - }, - { - "epoch": 27.83, - "learning_rate": 2e-05, - "loss": 6.4184, - "step": 5400 - }, - { - "epoch": 27.83, - "eval_accuracy": 0.1352895741192505, - "eval_loss": 6.509643077850342, - "eval_runtime": 169.086, - "eval_samples_per_second": 142.265, - "eval_steps_per_second": 2.224, - "step": 5400 - }, - { - "epoch": 27.88, - "learning_rate": 2e-05, - "loss": 6.455, - "step": 5410 - }, - { - "epoch": 27.93, - "learning_rate": 2e-05, - "loss": 6.4447, - "step": 5420 - }, - { - "epoch": 27.99, - "learning_rate": 2e-05, - "loss": 6.3932, - "step": 5430 - }, - { - "epoch": 28.04, - "learning_rate": 2e-05, - "loss": 6.953, - "step": 5440 - }, - { - "epoch": 28.09, - "learning_rate": 2e-05, - "loss": 6.43, - "step": 5450 - }, - { - "epoch": 28.09, - "eval_accuracy": 0.1360815394195743, - "eval_loss": 6.507402420043945, - "eval_runtime": 169.2594, - "eval_samples_per_second": 142.119, - "eval_steps_per_second": 2.221, - "step": 5450 - }, - { - "epoch": 28.14, - "learning_rate": 2e-05, - "loss": 6.4622, - "step": 5460 - }, - { - "epoch": 28.2, - "learning_rate": 2e-05, - "loss": 6.4898, - "step": 5470 - }, - { - "epoch": 28.25, - "learning_rate": 2e-05, - "loss": 6.4297, - "step": 5480 - }, - { - "epoch": 28.3, - "learning_rate": 2e-05, - "loss": 6.4151, - "step": 5490 - }, - { - "epoch": 28.35, - "learning_rate": 2e-05, - "loss": 6.4604, - "step": 5500 - }, - { - "epoch": 28.35, - "eval_accuracy": 0.136740573949904, - "eval_loss": 6.4999098777771, - "eval_runtime": 169.0085, - "eval_samples_per_second": 142.33, - "eval_steps_per_second": 2.225, - "step": 5500 - }, - { - "epoch": 28.4, - "learning_rate": 2e-05, - "loss": 6.4254, - "step": 5510 - }, - { - "epoch": 28.45, - "learning_rate": 2e-05, - "loss": 6.4188, - "step": 5520 - }, - { - "epoch": 28.5, - "learning_rate": 2e-05, - "loss": 6.4418, - "step": 5530 - }, - { - "epoch": 28.55, - "learning_rate": 2e-05, - "loss": 6.4201, - "step": 5540 - }, - { - "epoch": 28.61, - "learning_rate": 2e-05, - "loss": 6.4593, - "step": 5550 - }, - { - "epoch": 28.61, - "eval_accuracy": 0.13586421209634997, - "eval_loss": 6.499355792999268, - "eval_runtime": 169.0692, - "eval_samples_per_second": 142.279, - "eval_steps_per_second": 2.224, - "step": 5550 - }, - { - "epoch": 28.66, - "learning_rate": 2e-05, - "loss": 6.4249, - "step": 5560 - }, - { - "epoch": 28.71, - "learning_rate": 2e-05, - "loss": 6.4335, - "step": 5570 - }, - { - "epoch": 28.76, - "learning_rate": 2e-05, - "loss": 6.4411, - "step": 5580 - }, - { - "epoch": 28.81, - "learning_rate": 2e-05, - "loss": 6.4238, - "step": 5590 - }, - { - "epoch": 28.86, - "learning_rate": 2e-05, - "loss": 6.4648, - "step": 5600 - }, - { - "epoch": 28.86, - "eval_accuracy": 0.13563087896124043, - "eval_loss": 6.498055934906006, - "eval_runtime": 168.9957, - "eval_samples_per_second": 142.341, - "eval_steps_per_second": 2.225, - "step": 5600 - }, - { - "epoch": 28.91, - "learning_rate": 2e-05, - "loss": 6.4051, - "step": 5610 - }, - { - "epoch": 28.97, - "learning_rate": 2e-05, - "loss": 6.4433, - "step": 5620 - }, - { - "epoch": 29.02, - "learning_rate": 2e-05, - "loss": 6.9371, - "step": 5630 - }, - { - "epoch": 29.07, - "learning_rate": 2e-05, - "loss": 6.4164, - "step": 5640 - }, - { - "epoch": 29.12, - "learning_rate": 2e-05, - "loss": 6.4453, - "step": 5650 - }, - { - "epoch": 29.12, - "eval_accuracy": 0.1374106846803503, - "eval_loss": 6.494868755340576, - "eval_runtime": 169.1796, - "eval_samples_per_second": 142.186, - "eval_steps_per_second": 2.222, - "step": 5650 - }, - { - "epoch": 29.17, - "learning_rate": 2e-05, - "loss": 6.4349, - "step": 5660 - }, - { - "epoch": 29.23, - "learning_rate": 2e-05, - "loss": 6.468, - "step": 5670 - }, - { - "epoch": 29.28, - "learning_rate": 2e-05, - "loss": 6.4532, - "step": 5680 - }, - { - "epoch": 29.33, - "learning_rate": 2e-05, - "loss": 6.4349, - "step": 5690 - }, - { - "epoch": 29.38, - "learning_rate": 2e-05, - "loss": 6.4275, - "step": 5700 - }, - { - "epoch": 29.38, - "eval_accuracy": 0.13618291226339158, - "eval_loss": 6.49544095993042, - "eval_runtime": 169.054, - "eval_samples_per_second": 142.292, - "eval_steps_per_second": 2.224, - "step": 5700 - }, - { - "epoch": 29.43, - "learning_rate": 2e-05, - "loss": 6.4304, - "step": 5710 - }, - { - "epoch": 29.48, - "learning_rate": 2e-05, - "loss": 6.4465, - "step": 5720 - }, - { - "epoch": 29.53, - "learning_rate": 2e-05, - "loss": 6.4144, - "step": 5730 - }, - { - "epoch": 29.59, - "learning_rate": 2e-05, - "loss": 6.4441, - "step": 5740 - }, - { - "epoch": 29.64, - "learning_rate": 2e-05, - "loss": 6.4165, - "step": 5750 - }, - { - "epoch": 29.64, - "eval_accuracy": 0.1368880629366925, - "eval_loss": 6.493827819824219, - "eval_runtime": 169.0908, - "eval_samples_per_second": 142.261, - "eval_steps_per_second": 2.224, - "step": 5750 - }, - { - "epoch": 29.69, - "learning_rate": 2e-05, - "loss": 6.4493, - "step": 5760 - }, - { - "epoch": 29.74, - "learning_rate": 2e-05, - "loss": 6.4144, - "step": 5770 - }, - { - "epoch": 29.79, - "learning_rate": 2e-05, - "loss": 6.4323, - "step": 5780 - }, - { - "epoch": 29.84, - "learning_rate": 2e-05, - "loss": 6.4063, - "step": 5790 - }, - { - "epoch": 29.89, - "learning_rate": 2e-05, - "loss": 6.4211, - "step": 5800 - }, - { - "epoch": 29.89, - "eval_accuracy": 0.1375700871847227, - "eval_loss": 6.4911370277404785, - "eval_runtime": 169.0097, - "eval_samples_per_second": 142.329, - "eval_steps_per_second": 2.225, - "step": 5800 - }, - { - "epoch": 29.94, - "learning_rate": 2e-05, - "loss": 6.3907, - "step": 5810 - }, - { - "epoch": 30.0, - "learning_rate": 2e-05, - "loss": 6.4137, - "step": 5820 - }, - { - "epoch": 30.05, - "learning_rate": 2e-05, - "loss": 6.9177, - "step": 5830 - }, - { - "epoch": 30.1, - "learning_rate": 2e-05, - "loss": 6.4397, - "step": 5840 - }, - { - "epoch": 30.15, - "learning_rate": 2e-05, - "loss": 6.4188, - "step": 5850 - }, - { - "epoch": 30.15, - "eval_accuracy": 0.13739219384676168, - "eval_loss": 6.486028671264648, - "eval_runtime": 169.0403, - "eval_samples_per_second": 142.303, - "eval_steps_per_second": 2.224, - "step": 5850 - }, - { - "epoch": 30.21, - "learning_rate": 2e-05, - "loss": 6.395, - "step": 5860 - }, - { - "epoch": 30.26, - "learning_rate": 2e-05, - "loss": 6.4017, - "step": 5870 - }, - { - "epoch": 30.31, - "learning_rate": 2e-05, - "loss": 6.4429, - "step": 5880 - }, - { - "epoch": 30.36, - "learning_rate": 2e-05, - "loss": 6.3932, - "step": 5890 - }, - { - "epoch": 30.41, - "learning_rate": 2e-05, - "loss": 6.4337, - "step": 5900 - }, - { - "epoch": 30.41, - "eval_accuracy": 0.13796805623574424, - "eval_loss": 6.480716228485107, - "eval_runtime": 169.1216, - "eval_samples_per_second": 142.235, - "eval_steps_per_second": 2.223, - "step": 5900 - }, - { - "epoch": 30.46, - "learning_rate": 2e-05, - "loss": 6.4171, - "step": 5910 - }, - { - "epoch": 30.51, - "learning_rate": 2e-05, - "loss": 6.42, - "step": 5920 - }, - { - "epoch": 30.56, - "learning_rate": 2e-05, - "loss": 6.4512, - "step": 5930 - }, - { - "epoch": 30.62, - "learning_rate": 2e-05, - "loss": 6.4306, - "step": 5940 - }, - { - "epoch": 30.67, - "learning_rate": 2e-05, - "loss": 6.4228, - "step": 5950 - }, - { - "epoch": 30.67, - "eval_accuracy": 0.1374551952909284, - "eval_loss": 6.487619400024414, - "eval_runtime": 169.1032, - "eval_samples_per_second": 142.25, - "eval_steps_per_second": 2.223, - "step": 5950 - }, - { - "epoch": 30.72, - "learning_rate": 2e-05, - "loss": 6.4332, - "step": 5960 - }, - { - "epoch": 30.77, - "learning_rate": 2e-05, - "loss": 6.413, - "step": 5970 - }, - { - "epoch": 30.82, - "learning_rate": 2e-05, - "loss": 6.3904, - "step": 5980 - }, - { - "epoch": 30.87, - "learning_rate": 2e-05, - "loss": 6.4198, - "step": 5990 - }, - { - "epoch": 30.92, - "learning_rate": 2e-05, - "loss": 6.3841, - "step": 6000 - }, - { - "epoch": 30.92, - "eval_accuracy": 0.1375611450497736, - "eval_loss": 6.4811110496521, - "eval_runtime": 169.0873, - "eval_samples_per_second": 142.264, - "eval_steps_per_second": 2.224, - "step": 6000 - }, - { - "epoch": 30.98, - "learning_rate": 2e-05, - "loss": 6.4452, - "step": 6010 - }, - { - "epoch": 31.03, - "learning_rate": 2e-05, - "loss": 6.9049, - "step": 6020 - }, - { - "epoch": 31.08, - "learning_rate": 2e-05, - "loss": 6.4161, - "step": 6030 - }, - { - "epoch": 31.13, - "learning_rate": 2e-05, - "loss": 6.3858, - "step": 6040 - }, - { - "epoch": 31.18, - "learning_rate": 2e-05, - "loss": 6.4383, - "step": 6050 - }, - { - "epoch": 31.18, - "eval_accuracy": 0.13791837603204882, - "eval_loss": 6.483151912689209, - "eval_runtime": 169.0866, - "eval_samples_per_second": 142.264, - "eval_steps_per_second": 2.224, - "step": 6050 - }, - { - "epoch": 31.18, - "step": 6050, - "total_flos": 1.125873723276288e+16, - "train_loss": 6.89371246590102, - "train_runtime": 29548.4223, - "train_samples_per_second": 33.721, - "train_steps_per_second": 0.263 + "epoch": 89.58, + "step": 4300, + "total_flos": 2.954083328682332e+17, + "train_loss": 3.7431876293448516, + "train_runtime": 42244.2763, + "train_samples_per_second": 58.967, + "train_steps_per_second": 0.114 } ], - "max_steps": 7760, - "num_train_epochs": 40, - "total_flos": 1.125873723276288e+16, + "max_steps": 4800, + "num_train_epochs": 100, + "total_flos": 2.954083328682332e+17, "trial_name": null, "trial_params": null }