diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100755--- "a/trainer_state.json" +++ /dev/null @@ -1,13626 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.998383185125303, - "eval_steps": 25.0, - "global_step": 1236, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "full_loss": 0.5913, - "grad_norm": 11.4375, - "learning_rate": 6.578947368421053e-07, - "long_answer_loss": 0.5913, - "loss": 0.5772, - "short_answer_loss": NaN, - "step": 1, - "template_loss": 0.0 - }, - { - "epoch": 0.0, - "full_loss": 0.5407, - "grad_norm": 11.6875, - "learning_rate": 1.3157894736842106e-06, - "long_answer_loss": 0.5407, - "loss": 0.5984, - "short_answer_loss": NaN, - "step": 2, - "template_loss": 0.0 - }, - { - "epoch": 0.0, - "full_loss": 0.5632, - "grad_norm": 11.0, - "learning_rate": 1.9736842105263157e-06, - "long_answer_loss": 0.5632, - "loss": 0.5768, - "short_answer_loss": NaN, - "step": 3, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.5517, - "grad_norm": 10.5, - "learning_rate": 2.631578947368421e-06, - "long_answer_loss": 0.5517, - "loss": 0.5593, - "short_answer_loss": NaN, - "step": 4, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.4862, - "grad_norm": 9.0625, - "learning_rate": 3.2894736842105265e-06, - "long_answer_loss": 0.4862, - "loss": 0.5092, - "short_answer_loss": NaN, - "step": 5, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.501, - "grad_norm": 8.25, - "learning_rate": 3.9473684210526315e-06, - "long_answer_loss": 0.501, - "loss": 0.489, - "short_answer_loss": NaN, - "step": 6, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.4114, - "grad_norm": 7.0, - "learning_rate": 4.605263157894737e-06, - "long_answer_loss": 0.4114, - "loss": 0.4271, - "short_answer_loss": NaN, - "step": 7, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.357, - "grad_norm": 5.3125, - "learning_rate": 5.263157894736842e-06, - "long_answer_loss": 0.357, - "loss": 0.3714, - "short_answer_loss": NaN, - "step": 8, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.3179, - "grad_norm": 6.6875, - "learning_rate": 5.921052631578948e-06, - "long_answer_loss": 0.3179, - "loss": 0.3373, - "short_answer_loss": NaN, - "step": 9, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2982, - "grad_norm": 6.3125, - "learning_rate": 6.578947368421053e-06, - "long_answer_loss": 0.2982, - "loss": 0.3161, - "short_answer_loss": NaN, - "step": 10, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2744, - "grad_norm": 4.96875, - "learning_rate": 7.236842105263158e-06, - "long_answer_loss": 0.2744, - "loss": 0.2775, - "short_answer_loss": NaN, - "step": 11, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2795, - "grad_norm": 4.21875, - "learning_rate": 7.894736842105263e-06, - "long_answer_loss": 0.2795, - "loss": 0.2757, - "short_answer_loss": NaN, - "step": 12, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2391, - "grad_norm": 3.515625, - "learning_rate": 8.552631578947368e-06, - "long_answer_loss": 0.2391, - "loss": 0.241, - "short_answer_loss": NaN, - "step": 13, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2621, - "grad_norm": 3.0625, - "learning_rate": 9.210526315789474e-06, - "long_answer_loss": 0.2621, - "loss": 0.2527, - "short_answer_loss": NaN, - "step": 14, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2162, - "grad_norm": 2.875, - "learning_rate": 9.868421052631579e-06, - "long_answer_loss": 0.2162, - "loss": 0.2324, - "short_answer_loss": NaN, - "step": 15, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.2265, - "grad_norm": 3.078125, - "learning_rate": 1.0526315789473684e-05, - "long_answer_loss": 0.2265, - "loss": 0.238, - "short_answer_loss": NaN, - "step": 16, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.2399, - "grad_norm": 2.96875, - "learning_rate": 1.118421052631579e-05, - "long_answer_loss": 0.2399, - "loss": 0.2288, - "short_answer_loss": NaN, - "step": 17, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1865, - "grad_norm": 2.65625, - "learning_rate": 1.1842105263157895e-05, - "long_answer_loss": 0.1865, - "loss": 0.209, - "short_answer_loss": NaN, - "step": 18, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1784, - "grad_norm": 2.703125, - "learning_rate": 1.25e-05, - "long_answer_loss": 0.1784, - "loss": 0.2255, - "short_answer_loss": NaN, - "step": 19, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.2396, - "grad_norm": 2.859375, - "learning_rate": 1.3157894736842106e-05, - "long_answer_loss": 0.2396, - "loss": 0.2175, - "short_answer_loss": NaN, - "step": 20, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1953, - "grad_norm": 2.796875, - "learning_rate": 1.3815789473684213e-05, - "long_answer_loss": 0.1953, - "loss": 0.2107, - "short_answer_loss": NaN, - "step": 21, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1817, - "grad_norm": 2.625, - "learning_rate": 1.4473684210526317e-05, - "long_answer_loss": 0.1817, - "loss": 0.2208, - "short_answer_loss": NaN, - "step": 22, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1913, - "grad_norm": 2.703125, - "learning_rate": 1.5131578947368422e-05, - "long_answer_loss": 0.1913, - "loss": 0.1938, - "short_answer_loss": NaN, - "step": 23, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1947, - "grad_norm": 2.59375, - "learning_rate": 1.5789473684210526e-05, - "long_answer_loss": 0.1947, - "loss": 0.2017, - "short_answer_loss": NaN, - "step": 24, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.2353, - "grad_norm": 2.59375, - "learning_rate": 1.6447368421052635e-05, - "long_answer_loss": 0.2353, - "loss": 0.2086, - "short_answer_loss": NaN, - "step": 25, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.2258, - "grad_norm": 2.578125, - "learning_rate": 1.7105263157894737e-05, - "long_answer_loss": 0.2258, - "loss": 0.1929, - "short_answer_loss": NaN, - "step": 26, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1916, - "grad_norm": 2.8125, - "learning_rate": 1.7763157894736842e-05, - "long_answer_loss": 0.1916, - "loss": 0.1996, - "short_answer_loss": NaN, - "step": 27, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.185, - "grad_norm": 2.78125, - "learning_rate": 1.8421052631578947e-05, - "long_answer_loss": 0.185, - "loss": 0.193, - "short_answer_loss": NaN, - "step": 28, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1788, - "grad_norm": 2.65625, - "learning_rate": 1.9078947368421056e-05, - "long_answer_loss": 0.1788, - "loss": 0.1975, - "short_answer_loss": NaN, - "step": 29, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1976, - "grad_norm": 2.609375, - "learning_rate": 1.9736842105263158e-05, - "long_answer_loss": 0.1976, - "loss": 0.1802, - "short_answer_loss": NaN, - "step": 30, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.204, - "grad_norm": 2.40625, - "learning_rate": 2.0394736842105264e-05, - "long_answer_loss": 0.204, - "loss": 0.1897, - "short_answer_loss": NaN, - "step": 31, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.191, - "grad_norm": 2.46875, - "learning_rate": 2.105263157894737e-05, - "long_answer_loss": 0.191, - "loss": 0.2029, - "short_answer_loss": NaN, - "step": 32, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1831, - "grad_norm": 2.375, - "learning_rate": 2.1710526315789474e-05, - "long_answer_loss": 0.1831, - "loss": 0.1809, - "short_answer_loss": NaN, - "step": 33, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.2093, - "grad_norm": 2.40625, - "learning_rate": 2.236842105263158e-05, - "long_answer_loss": 0.2093, - "loss": 0.193, - "short_answer_loss": NaN, - "step": 34, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.2136, - "grad_norm": 2.640625, - "learning_rate": 2.3026315789473685e-05, - "long_answer_loss": 0.2136, - "loss": 0.1973, - "short_answer_loss": NaN, - "step": 35, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.2073, - "grad_norm": 2.234375, - "learning_rate": 2.368421052631579e-05, - "long_answer_loss": 0.2073, - "loss": 0.1839, - "short_answer_loss": NaN, - "step": 36, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.2317, - "grad_norm": 2.625, - "learning_rate": 2.4342105263157896e-05, - "long_answer_loss": 0.2317, - "loss": 0.1985, - "short_answer_loss": NaN, - "step": 37, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1949, - "grad_norm": 2.578125, - "learning_rate": 2.5e-05, - "long_answer_loss": 0.1949, - "loss": 0.1837, - "short_answer_loss": NaN, - "step": 38, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1792, - "grad_norm": 2.484375, - "learning_rate": 2.499995702005279e-05, - "long_answer_loss": 0.1792, - "loss": 0.1878, - "short_answer_loss": NaN, - "step": 39, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1855, - "grad_norm": 2.25, - "learning_rate": 2.499982808050672e-05, - "long_answer_loss": 0.1855, - "loss": 0.1966, - "short_answer_loss": NaN, - "step": 40, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1857, - "grad_norm": 2.546875, - "learning_rate": 2.4999613182248482e-05, - "long_answer_loss": 0.1857, - "loss": 0.1934, - "short_answer_loss": NaN, - "step": 41, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1906, - "grad_norm": 2.4375, - "learning_rate": 2.499931232675589e-05, - "long_answer_loss": 0.1906, - "loss": 0.1937, - "short_answer_loss": NaN, - "step": 42, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1886, - "grad_norm": 2.515625, - "learning_rate": 2.499892551609786e-05, - "long_answer_loss": 0.1886, - "loss": 0.1995, - "short_answer_loss": NaN, - "step": 43, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1893, - "grad_norm": 2.734375, - "learning_rate": 2.499845275293441e-05, - "long_answer_loss": 0.1893, - "loss": 0.2026, - "short_answer_loss": NaN, - "step": 44, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.2057, - "grad_norm": 2.53125, - "learning_rate": 2.499789404051663e-05, - "long_answer_loss": 0.2057, - "loss": 0.2021, - "short_answer_loss": NaN, - "step": 45, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1896, - "grad_norm": 2.5, - "learning_rate": 2.4997249382686673e-05, - "long_answer_loss": 0.1896, - "loss": 0.1881, - "short_answer_loss": NaN, - "step": 46, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.2066, - "grad_norm": 2.46875, - "learning_rate": 2.4996518783877716e-05, - "long_answer_loss": 0.2066, - "loss": 0.1984, - "short_answer_loss": NaN, - "step": 47, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1739, - "grad_norm": 2.515625, - "learning_rate": 2.4995702249113935e-05, - "long_answer_loss": 0.1739, - "loss": 0.1963, - "short_answer_loss": NaN, - "step": 48, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1877, - "grad_norm": 2.375, - "learning_rate": 2.499479978401047e-05, - "long_answer_loss": 0.1877, - "loss": 0.1921, - "short_answer_loss": NaN, - "step": 49, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1709, - "grad_norm": 2.375, - "learning_rate": 2.499381139477338e-05, - "long_answer_loss": 0.1709, - "loss": 0.1892, - "short_answer_loss": NaN, - "step": 50, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1591, - "grad_norm": 2.53125, - "learning_rate": 2.4992737088199623e-05, - "long_answer_loss": 0.1591, - "loss": 0.1913, - "short_answer_loss": NaN, - "step": 51, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1878, - "grad_norm": 2.53125, - "learning_rate": 2.499157687167697e-05, - "long_answer_loss": 0.1878, - "loss": 0.1977, - "short_answer_loss": NaN, - "step": 52, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1668, - "grad_norm": 2.421875, - "learning_rate": 2.499033075318399e-05, - "long_answer_loss": 0.1668, - "loss": 0.1937, - "short_answer_loss": NaN, - "step": 53, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1727, - "grad_norm": 2.453125, - "learning_rate": 2.4988998741289986e-05, - "long_answer_loss": 0.1727, - "loss": 0.1807, - "short_answer_loss": NaN, - "step": 54, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1721, - "grad_norm": 2.9375, - "learning_rate": 2.4987580845154922e-05, - "long_answer_loss": 0.1721, - "loss": 0.1911, - "short_answer_loss": NaN, - "step": 55, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.2108, - "grad_norm": 2.40625, - "learning_rate": 2.4986077074529374e-05, - "long_answer_loss": 0.2108, - "loss": 0.1986, - "short_answer_loss": NaN, - "step": 56, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1706, - "grad_norm": 2.453125, - "learning_rate": 2.498448743975446e-05, - "long_answer_loss": 0.1706, - "loss": 0.1781, - "short_answer_loss": NaN, - "step": 57, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1635, - "grad_norm": 2.09375, - "learning_rate": 2.498281195176177e-05, - "long_answer_loss": 0.1635, - "loss": 0.1883, - "short_answer_loss": NaN, - "step": 58, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1799, - "grad_norm": 2.546875, - "learning_rate": 2.498105062207328e-05, - "long_answer_loss": 0.1799, - "loss": 0.1854, - "short_answer_loss": NaN, - "step": 59, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.2089, - "grad_norm": 2.515625, - "learning_rate": 2.4979203462801287e-05, - "long_answer_loss": 0.2089, - "loss": 0.1871, - "short_answer_loss": NaN, - "step": 60, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.2018, - "grad_norm": 2.34375, - "learning_rate": 2.497727048664833e-05, - "long_answer_loss": 0.2018, - "loss": 0.1861, - "short_answer_loss": NaN, - "step": 61, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.203, - "grad_norm": 2.40625, - "learning_rate": 2.497525170690707e-05, - "long_answer_loss": 0.203, - "loss": 0.1979, - "short_answer_loss": NaN, - "step": 62, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1858, - "grad_norm": 2.078125, - "learning_rate": 2.4973147137460246e-05, - "long_answer_loss": 0.1858, - "loss": 0.1876, - "short_answer_loss": NaN, - "step": 63, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1554, - "grad_norm": 2.265625, - "learning_rate": 2.4970956792780533e-05, - "long_answer_loss": 0.1554, - "loss": 0.1781, - "short_answer_loss": NaN, - "step": 64, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.2165, - "grad_norm": 2.5, - "learning_rate": 2.4968680687930482e-05, - "long_answer_loss": 0.2165, - "loss": 0.1937, - "short_answer_loss": NaN, - "step": 65, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.186, - "grad_norm": 2.046875, - "learning_rate": 2.4966318838562392e-05, - "long_answer_loss": 0.186, - "loss": 0.1856, - "short_answer_loss": NaN, - "step": 66, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1919, - "grad_norm": 2.34375, - "learning_rate": 2.49638712609182e-05, - "long_answer_loss": 0.1919, - "loss": 0.1886, - "short_answer_loss": NaN, - "step": 67, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1953, - "grad_norm": 2.125, - "learning_rate": 2.4961337971829397e-05, - "long_answer_loss": 0.1953, - "loss": 0.1902, - "short_answer_loss": NaN, - "step": 68, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1847, - "grad_norm": 2.109375, - "learning_rate": 2.4958718988716885e-05, - "long_answer_loss": 0.1847, - "loss": 0.1858, - "short_answer_loss": NaN, - "step": 69, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.2127, - "grad_norm": 2.203125, - "learning_rate": 2.4956014329590855e-05, - "long_answer_loss": 0.2127, - "loss": 0.1936, - "short_answer_loss": NaN, - "step": 70, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1856, - "grad_norm": 2.421875, - "learning_rate": 2.495322401305069e-05, - "long_answer_loss": 0.1856, - "loss": 0.1982, - "short_answer_loss": NaN, - "step": 71, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1997, - "grad_norm": 2.390625, - "learning_rate": 2.4950348058284813e-05, - "long_answer_loss": 0.1997, - "loss": 0.1893, - "short_answer_loss": NaN, - "step": 72, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.2013, - "grad_norm": 2.125, - "learning_rate": 2.494738648507057e-05, - "long_answer_loss": 0.2013, - "loss": 0.1877, - "short_answer_loss": NaN, - "step": 73, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1742, - "grad_norm": 2.1875, - "learning_rate": 2.494433931377408e-05, - "long_answer_loss": 0.1742, - "loss": 0.1841, - "short_answer_loss": NaN, - "step": 74, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.2026, - "grad_norm": 2.21875, - "learning_rate": 2.4941206565350102e-05, - "long_answer_loss": 0.2026, - "loss": 0.1785, - "short_answer_loss": NaN, - "step": 75, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.2138, - "grad_norm": 2.234375, - "learning_rate": 2.49379882613419e-05, - "long_answer_loss": 0.2138, - "loss": 0.1952, - "short_answer_loss": NaN, - "step": 76, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.183, - "grad_norm": 2.125, - "learning_rate": 2.4934684423881074e-05, - "long_answer_loss": 0.183, - "loss": 0.1743, - "short_answer_loss": NaN, - "step": 77, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1767, - "grad_norm": 2.390625, - "learning_rate": 2.4931295075687428e-05, - "long_answer_loss": 0.1767, - "loss": 0.1919, - "short_answer_loss": NaN, - "step": 78, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1558, - "grad_norm": 2.09375, - "learning_rate": 2.4927820240068805e-05, - "long_answer_loss": 0.1558, - "loss": 0.1802, - "short_answer_loss": NaN, - "step": 79, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1912, - "grad_norm": 2.234375, - "learning_rate": 2.492425994092092e-05, - "long_answer_loss": 0.1912, - "loss": 0.1875, - "short_answer_loss": NaN, - "step": 80, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.2034, - "grad_norm": 2.125, - "learning_rate": 2.4920614202727217e-05, - "long_answer_loss": 0.2034, - "loss": 0.1925, - "short_answer_loss": NaN, - "step": 81, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1561, - "grad_norm": 2.09375, - "learning_rate": 2.4916883050558664e-05, - "long_answer_loss": 0.1561, - "loss": 0.1878, - "short_answer_loss": NaN, - "step": 82, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.2187, - "grad_norm": 2.34375, - "learning_rate": 2.491306651007363e-05, - "long_answer_loss": 0.2187, - "loss": 0.1993, - "short_answer_loss": NaN, - "step": 83, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.2146, - "grad_norm": 2.375, - "learning_rate": 2.490916460751766e-05, - "long_answer_loss": 0.2146, - "loss": 0.1946, - "short_answer_loss": NaN, - "step": 84, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1711, - "grad_norm": 2.171875, - "learning_rate": 2.4905177369723333e-05, - "long_answer_loss": 0.1711, - "loss": 0.1791, - "short_answer_loss": NaN, - "step": 85, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1823, - "grad_norm": 2.15625, - "learning_rate": 2.4901104824110042e-05, - "long_answer_loss": 0.1823, - "loss": 0.1865, - "short_answer_loss": NaN, - "step": 86, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1902, - "grad_norm": 2.171875, - "learning_rate": 2.489694699868384e-05, - "long_answer_loss": 0.1902, - "loss": 0.1932, - "short_answer_loss": NaN, - "step": 87, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1712, - "grad_norm": 2.09375, - "learning_rate": 2.4892703922037225e-05, - "long_answer_loss": 0.1712, - "loss": 0.185, - "short_answer_loss": NaN, - "step": 88, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1422, - "grad_norm": 2.109375, - "learning_rate": 2.488837562334895e-05, - "long_answer_loss": 0.1422, - "loss": 0.1847, - "short_answer_loss": NaN, - "step": 89, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1879, - "grad_norm": 2.25, - "learning_rate": 2.4883962132383823e-05, - "long_answer_loss": 0.1879, - "loss": 0.19, - "short_answer_loss": NaN, - "step": 90, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.2042, - "grad_norm": 2.09375, - "learning_rate": 2.4879463479492504e-05, - "long_answer_loss": 0.2042, - "loss": 0.1942, - "short_answer_loss": NaN, - "step": 91, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.2047, - "grad_norm": 2.203125, - "learning_rate": 2.4874879695611287e-05, - "long_answer_loss": 0.2047, - "loss": 0.193, - "short_answer_loss": NaN, - "step": 92, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1753, - "grad_norm": 1.9765625, - "learning_rate": 2.4870210812261898e-05, - "long_answer_loss": 0.1753, - "loss": 0.1829, - "short_answer_loss": NaN, - "step": 93, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1802, - "grad_norm": 2.015625, - "learning_rate": 2.486545686155128e-05, - "long_answer_loss": 0.1802, - "loss": 0.191, - "short_answer_loss": NaN, - "step": 94, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1443, - "grad_norm": 2.171875, - "learning_rate": 2.4860617876171355e-05, - "long_answer_loss": 0.1443, - "loss": 0.18, - "short_answer_loss": NaN, - "step": 95, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1755, - "grad_norm": 2.109375, - "learning_rate": 2.4855693889398822e-05, - "long_answer_loss": 0.1755, - "loss": 0.1902, - "short_answer_loss": NaN, - "step": 96, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1992, - "grad_norm": 1.9921875, - "learning_rate": 2.485068493509491e-05, - "long_answer_loss": 0.1992, - "loss": 0.1887, - "short_answer_loss": NaN, - "step": 97, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.2405, - "grad_norm": 2.203125, - "learning_rate": 2.4845591047705153e-05, - "long_answer_loss": 0.2405, - "loss": 0.1958, - "short_answer_loss": NaN, - "step": 98, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.2091, - "grad_norm": 2.203125, - "learning_rate": 2.484041226225915e-05, - "long_answer_loss": 0.2091, - "loss": 0.1872, - "short_answer_loss": NaN, - "step": 99, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1757, - "grad_norm": 1.9765625, - "learning_rate": 2.4835148614370334e-05, - "long_answer_loss": 0.1757, - "loss": 0.1841, - "short_answer_loss": NaN, - "step": 100, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1818, - "grad_norm": 2.125, - "learning_rate": 2.482980014023571e-05, - "long_answer_loss": 0.1818, - "loss": 0.1852, - "short_answer_loss": NaN, - "step": 101, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1899, - "grad_norm": 2.453125, - "learning_rate": 2.4824366876635623e-05, - "long_answer_loss": 0.1899, - "loss": 0.195, - "short_answer_loss": NaN, - "step": 102, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1858, - "grad_norm": 2.28125, - "learning_rate": 2.481884886093349e-05, - "long_answer_loss": 0.1858, - "loss": 0.1946, - "short_answer_loss": NaN, - "step": 103, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1847, - "grad_norm": 2.109375, - "learning_rate": 2.4813246131075564e-05, - "long_answer_loss": 0.1847, - "loss": 0.1895, - "short_answer_loss": NaN, - "step": 104, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1691, - "grad_norm": 2.1875, - "learning_rate": 2.480755872559064e-05, - "long_answer_loss": 0.1691, - "loss": 0.191, - "short_answer_loss": NaN, - "step": 105, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.2181, - "grad_norm": 2.265625, - "learning_rate": 2.4801786683589824e-05, - "long_answer_loss": 0.2181, - "loss": 0.1879, - "short_answer_loss": NaN, - "step": 106, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1856, - "grad_norm": 2.25, - "learning_rate": 2.4795930044766247e-05, - "long_answer_loss": 0.1856, - "loss": 0.1899, - "short_answer_loss": NaN, - "step": 107, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1754, - "grad_norm": 2.0625, - "learning_rate": 2.4789988849394792e-05, - "long_answer_loss": 0.1754, - "loss": 0.1905, - "short_answer_loss": NaN, - "step": 108, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1835, - "grad_norm": 2.28125, - "learning_rate": 2.478396313833182e-05, - "long_answer_loss": 0.1835, - "loss": 0.1983, - "short_answer_loss": NaN, - "step": 109, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.2057, - "grad_norm": 2.1875, - "learning_rate": 2.4777852953014896e-05, - "long_answer_loss": 0.2057, - "loss": 0.1931, - "short_answer_loss": NaN, - "step": 110, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1497, - "grad_norm": 1.9921875, - "learning_rate": 2.4771658335462483e-05, - "long_answer_loss": 0.1497, - "loss": 0.1912, - "short_answer_loss": NaN, - "step": 111, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1689, - "grad_norm": 2.21875, - "learning_rate": 2.476537932827368e-05, - "long_answer_loss": 0.1689, - "loss": 0.1817, - "short_answer_loss": NaN, - "step": 112, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.2024, - "grad_norm": 1.8515625, - "learning_rate": 2.4759015974627906e-05, - "long_answer_loss": 0.2024, - "loss": 0.1819, - "short_answer_loss": NaN, - "step": 113, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1597, - "grad_norm": 2.109375, - "learning_rate": 2.475256831828462e-05, - "long_answer_loss": 0.1597, - "loss": 0.1916, - "short_answer_loss": NaN, - "step": 114, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1845, - "grad_norm": 2.0, - "learning_rate": 2.4746036403583012e-05, - "long_answer_loss": 0.1845, - "loss": 0.1897, - "short_answer_loss": NaN, - "step": 115, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1652, - "grad_norm": 2.0625, - "learning_rate": 2.4739420275441694e-05, - "long_answer_loss": 0.1652, - "loss": 0.1878, - "short_answer_loss": NaN, - "step": 116, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1572, - "grad_norm": 1.9375, - "learning_rate": 2.4732719979358403e-05, - "long_answer_loss": 0.1572, - "loss": 0.1766, - "short_answer_loss": NaN, - "step": 117, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1945, - "grad_norm": 1.984375, - "learning_rate": 2.472593556140968e-05, - "long_answer_loss": 0.1945, - "loss": 0.1812, - "short_answer_loss": NaN, - "step": 118, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1754, - "grad_norm": 1.890625, - "learning_rate": 2.4719067068250552e-05, - "long_answer_loss": 0.1754, - "loss": 0.1834, - "short_answer_loss": NaN, - "step": 119, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1827, - "grad_norm": 2.015625, - "learning_rate": 2.4712114547114212e-05, - "long_answer_loss": 0.1827, - "loss": 0.1866, - "short_answer_loss": NaN, - "step": 120, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1763, - "grad_norm": 1.921875, - "learning_rate": 2.4705078045811704e-05, - "long_answer_loss": 0.1763, - "loss": 0.1771, - "short_answer_loss": NaN, - "step": 121, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1739, - "grad_norm": 1.9921875, - "learning_rate": 2.469795761273157e-05, - "long_answer_loss": 0.1739, - "loss": 0.1826, - "short_answer_loss": NaN, - "step": 122, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1831, - "grad_norm": 2.09375, - "learning_rate": 2.4690753296839558e-05, - "long_answer_loss": 0.1831, - "loss": 0.192, - "short_answer_loss": NaN, - "step": 123, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1404, - "grad_norm": 2.0, - "learning_rate": 2.4683465147678235e-05, - "long_answer_loss": 0.1404, - "loss": 0.1811, - "short_answer_loss": NaN, - "step": 124, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1653, - "grad_norm": 2.140625, - "learning_rate": 2.4676093215366695e-05, - "long_answer_loss": 0.1653, - "loss": 0.185, - "short_answer_loss": NaN, - "step": 125, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.2176, - "grad_norm": 2.203125, - "learning_rate": 2.466863755060017e-05, - "long_answer_loss": 0.2176, - "loss": 0.1936, - "short_answer_loss": NaN, - "step": 126, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.2059, - "grad_norm": 1.9609375, - "learning_rate": 2.4661098204649717e-05, - "long_answer_loss": 0.2059, - "loss": 0.1855, - "short_answer_loss": NaN, - "step": 127, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1801, - "grad_norm": 2.40625, - "learning_rate": 2.4653475229361843e-05, - "long_answer_loss": 0.1801, - "loss": 0.193, - "short_answer_loss": NaN, - "step": 128, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1823, - "grad_norm": 1.9140625, - "learning_rate": 2.4645768677158165e-05, - "long_answer_loss": 0.1823, - "loss": 0.1789, - "short_answer_loss": NaN, - "step": 129, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1603, - "grad_norm": 2.046875, - "learning_rate": 2.4637978601035033e-05, - "long_answer_loss": 0.1603, - "loss": 0.1858, - "short_answer_loss": NaN, - "step": 130, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1987, - "grad_norm": 2.046875, - "learning_rate": 2.463010505456318e-05, - "long_answer_loss": 0.1987, - "loss": 0.1986, - "short_answer_loss": NaN, - "step": 131, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1814, - "grad_norm": 2.171875, - "learning_rate": 2.4622148091887338e-05, - "long_answer_loss": 0.1814, - "loss": 0.1878, - "short_answer_loss": NaN, - "step": 132, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1871, - "grad_norm": 1.8828125, - "learning_rate": 2.4614107767725887e-05, - "long_answer_loss": 0.1871, - "loss": 0.1781, - "short_answer_loss": NaN, - "step": 133, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1763, - "grad_norm": 2.015625, - "learning_rate": 2.4605984137370452e-05, - "long_answer_loss": 0.1763, - "loss": 0.1949, - "short_answer_loss": NaN, - "step": 134, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.2048, - "grad_norm": 2.234375, - "learning_rate": 2.4597777256685556e-05, - "long_answer_loss": 0.2048, - "loss": 0.1948, - "short_answer_loss": NaN, - "step": 135, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1825, - "grad_norm": 1.9375, - "learning_rate": 2.45894871821082e-05, - "long_answer_loss": 0.1825, - "loss": 0.1836, - "short_answer_loss": NaN, - "step": 136, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1889, - "grad_norm": 1.9140625, - "learning_rate": 2.45811139706475e-05, - "long_answer_loss": 0.1889, - "loss": 0.1848, - "short_answer_loss": NaN, - "step": 137, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1961, - "grad_norm": 1.859375, - "learning_rate": 2.4572657679884285e-05, - "long_answer_loss": 0.1961, - "loss": 0.179, - "short_answer_loss": NaN, - "step": 138, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.197, - "grad_norm": 1.8359375, - "learning_rate": 2.4564118367970706e-05, - "long_answer_loss": 0.197, - "loss": 0.1927, - "short_answer_loss": NaN, - "step": 139, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.2099, - "grad_norm": 1.9453125, - "learning_rate": 2.455549609362983e-05, - "long_answer_loss": 0.2099, - "loss": 0.1934, - "short_answer_loss": NaN, - "step": 140, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1903, - "grad_norm": 2.28125, - "learning_rate": 2.4546790916155243e-05, - "long_answer_loss": 0.1903, - "loss": 0.1921, - "short_answer_loss": NaN, - "step": 141, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1882, - "grad_norm": 1.8984375, - "learning_rate": 2.4538002895410634e-05, - "long_answer_loss": 0.1882, - "loss": 0.1834, - "short_answer_loss": NaN, - "step": 142, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.2116, - "grad_norm": 1.921875, - "learning_rate": 2.452913209182939e-05, - "long_answer_loss": 0.2116, - "loss": 0.1767, - "short_answer_loss": NaN, - "step": 143, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1931, - "grad_norm": 1.9921875, - "learning_rate": 2.4520178566414177e-05, - "long_answer_loss": 0.1931, - "loss": 0.1955, - "short_answer_loss": NaN, - "step": 144, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1908, - "grad_norm": 2.09375, - "learning_rate": 2.4511142380736517e-05, - "long_answer_loss": 0.1908, - "loss": 0.1934, - "short_answer_loss": NaN, - "step": 145, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.2259, - "grad_norm": 1.9140625, - "learning_rate": 2.450202359693639e-05, - "long_answer_loss": 0.2259, - "loss": 0.1822, - "short_answer_loss": NaN, - "step": 146, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.2343, - "grad_norm": 1.953125, - "learning_rate": 2.449282227772176e-05, - "long_answer_loss": 0.2343, - "loss": 0.1911, - "short_answer_loss": NaN, - "step": 147, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1845, - "grad_norm": 2.015625, - "learning_rate": 2.4483538486368186e-05, - "long_answer_loss": 0.1845, - "loss": 0.1914, - "short_answer_loss": NaN, - "step": 148, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1773, - "grad_norm": 1.78125, - "learning_rate": 2.4474172286718363e-05, - "long_answer_loss": 0.1773, - "loss": 0.179, - "short_answer_loss": NaN, - "step": 149, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.187, - "grad_norm": 1.8828125, - "learning_rate": 2.4464723743181693e-05, - "long_answer_loss": 0.187, - "loss": 0.1869, - "short_answer_loss": NaN, - "step": 150, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1795, - "grad_norm": 2.03125, - "learning_rate": 2.445519292073385e-05, - "long_answer_loss": 0.1795, - "loss": 0.1928, - "short_answer_loss": NaN, - "step": 151, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1942, - "grad_norm": 1.96875, - "learning_rate": 2.4445579884916297e-05, - "long_answer_loss": 0.1942, - "loss": 0.1897, - "short_answer_loss": NaN, - "step": 152, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.183, - "grad_norm": 1.9140625, - "learning_rate": 2.443588470183589e-05, - "long_answer_loss": 0.183, - "loss": 0.1922, - "short_answer_loss": NaN, - "step": 153, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1974, - "grad_norm": 1.953125, - "learning_rate": 2.442610743816438e-05, - "long_answer_loss": 0.1974, - "loss": 0.1998, - "short_answer_loss": NaN, - "step": 154, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.169, - "grad_norm": 1.9140625, - "learning_rate": 2.4416248161137972e-05, - "long_answer_loss": 0.169, - "loss": 0.1812, - "short_answer_loss": NaN, - "step": 155, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1794, - "grad_norm": 1.890625, - "learning_rate": 2.4406306938556853e-05, - "long_answer_loss": 0.1794, - "loss": 0.1882, - "short_answer_loss": NaN, - "step": 156, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1685, - "grad_norm": 1.859375, - "learning_rate": 2.4396283838784743e-05, - "long_answer_loss": 0.1685, - "loss": 0.1772, - "short_answer_loss": NaN, - "step": 157, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1855, - "grad_norm": 2.015625, - "learning_rate": 2.438617893074841e-05, - "long_answer_loss": 0.1855, - "loss": 0.1951, - "short_answer_loss": NaN, - "step": 158, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1648, - "grad_norm": 1.8046875, - "learning_rate": 2.4375992283937194e-05, - "long_answer_loss": 0.1648, - "loss": 0.1752, - "short_answer_loss": NaN, - "step": 159, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1721, - "grad_norm": 1.8359375, - "learning_rate": 2.4365723968402552e-05, - "long_answer_loss": 0.1721, - "loss": 0.1834, - "short_answer_loss": NaN, - "step": 160, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1973, - "grad_norm": 1.9375, - "learning_rate": 2.4355374054757546e-05, - "long_answer_loss": 0.1973, - "loss": 0.1833, - "short_answer_loss": NaN, - "step": 161, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1963, - "grad_norm": 1.84375, - "learning_rate": 2.434494261417637e-05, - "long_answer_loss": 0.1963, - "loss": 0.1867, - "short_answer_loss": NaN, - "step": 162, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1969, - "grad_norm": 1.78125, - "learning_rate": 2.433442971839387e-05, - "long_answer_loss": 0.1969, - "loss": 0.1884, - "short_answer_loss": NaN, - "step": 163, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.2025, - "grad_norm": 2.109375, - "learning_rate": 2.432383543970504e-05, - "long_answer_loss": 0.2025, - "loss": 0.1926, - "short_answer_loss": NaN, - "step": 164, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.2021, - "grad_norm": 1.859375, - "learning_rate": 2.4313159850964523e-05, - "long_answer_loss": 0.2021, - "loss": 0.175, - "short_answer_loss": NaN, - "step": 165, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.159, - "grad_norm": 1.90625, - "learning_rate": 2.4302403025586122e-05, - "long_answer_loss": 0.159, - "loss": 0.1889, - "short_answer_loss": NaN, - "step": 166, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1843, - "grad_norm": 2.046875, - "learning_rate": 2.429156503754228e-05, - "long_answer_loss": 0.1843, - "loss": 0.1956, - "short_answer_loss": NaN, - "step": 167, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1944, - "grad_norm": 1.8203125, - "learning_rate": 2.428064596136358e-05, - "long_answer_loss": 0.1944, - "loss": 0.1856, - "short_answer_loss": NaN, - "step": 168, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1753, - "grad_norm": 1.78125, - "learning_rate": 2.4269645872138237e-05, - "long_answer_loss": 0.1753, - "loss": 0.1751, - "short_answer_loss": NaN, - "step": 169, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1986, - "grad_norm": 1.7890625, - "learning_rate": 2.4258564845511568e-05, - "long_answer_loss": 0.1986, - "loss": 0.1929, - "short_answer_loss": NaN, - "step": 170, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1973, - "grad_norm": 1.9375, - "learning_rate": 2.4247402957685482e-05, - "long_answer_loss": 0.1973, - "loss": 0.1852, - "short_answer_loss": NaN, - "step": 171, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.2181, - "grad_norm": 2.0, - "learning_rate": 2.4236160285417964e-05, - "long_answer_loss": 0.2181, - "loss": 0.2004, - "short_answer_loss": NaN, - "step": 172, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1811, - "grad_norm": 1.890625, - "learning_rate": 2.4224836906022518e-05, - "long_answer_loss": 0.1811, - "loss": 0.1879, - "short_answer_loss": NaN, - "step": 173, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1941, - "grad_norm": 1.8828125, - "learning_rate": 2.421343289736767e-05, - "long_answer_loss": 0.1941, - "loss": 0.1896, - "short_answer_loss": NaN, - "step": 174, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1872, - "grad_norm": 1.9140625, - "learning_rate": 2.4201948337876405e-05, - "long_answer_loss": 0.1872, - "loss": 0.1869, - "short_answer_loss": NaN, - "step": 175, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.2018, - "grad_norm": 2.0, - "learning_rate": 2.4190383306525647e-05, - "long_answer_loss": 0.2018, - "loss": 0.188, - "short_answer_loss": NaN, - "step": 176, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1861, - "grad_norm": 1.921875, - "learning_rate": 2.4178737882845708e-05, - "long_answer_loss": 0.1861, - "loss": 0.1856, - "short_answer_loss": NaN, - "step": 177, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.2307, - "grad_norm": 2.0, - "learning_rate": 2.4167012146919735e-05, - "long_answer_loss": 0.2307, - "loss": 0.1893, - "short_answer_loss": NaN, - "step": 178, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1916, - "grad_norm": 1.96875, - "learning_rate": 2.4155206179383172e-05, - "long_answer_loss": 0.1916, - "loss": 0.1842, - "short_answer_loss": NaN, - "step": 179, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.2011, - "grad_norm": 1.890625, - "learning_rate": 2.41433200614232e-05, - "long_answer_loss": 0.2011, - "loss": 0.192, - "short_answer_loss": NaN, - "step": 180, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1894, - "grad_norm": 2.03125, - "learning_rate": 2.4131353874778168e-05, - "long_answer_loss": 0.1894, - "loss": 0.1856, - "short_answer_loss": NaN, - "step": 181, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1915, - "grad_norm": 2.0625, - "learning_rate": 2.4119307701737053e-05, - "long_answer_loss": 0.1915, - "loss": 0.196, - "short_answer_loss": NaN, - "step": 182, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.2175, - "grad_norm": 1.7421875, - "learning_rate": 2.4107181625138874e-05, - "long_answer_loss": 0.2175, - "loss": 0.1841, - "short_answer_loss": NaN, - "step": 183, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1877, - "grad_norm": 1.8984375, - "learning_rate": 2.4094975728372133e-05, - "long_answer_loss": 0.1877, - "loss": 0.1905, - "short_answer_loss": NaN, - "step": 184, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.2038, - "grad_norm": 1.859375, - "learning_rate": 2.4082690095374234e-05, - "long_answer_loss": 0.2038, - "loss": 0.1906, - "short_answer_loss": NaN, - "step": 185, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.2006, - "grad_norm": 1.78125, - "learning_rate": 2.407032481063092e-05, - "long_answer_loss": 0.2006, - "loss": 0.1848, - "short_answer_loss": NaN, - "step": 186, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.2446, - "grad_norm": 1.984375, - "learning_rate": 2.4057879959175672e-05, - "long_answer_loss": 0.2446, - "loss": 0.196, - "short_answer_loss": NaN, - "step": 187, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.199, - "grad_norm": 1.8359375, - "learning_rate": 2.4045355626589145e-05, - "long_answer_loss": 0.199, - "loss": 0.1846, - "short_answer_loss": NaN, - "step": 188, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.2025, - "grad_norm": 2.265625, - "learning_rate": 2.4032751898998555e-05, - "long_answer_loss": 0.2025, - "loss": 0.1986, - "short_answer_loss": NaN, - "step": 189, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1821, - "grad_norm": 2.03125, - "learning_rate": 2.4020068863077116e-05, - "long_answer_loss": 0.1821, - "loss": 0.1906, - "short_answer_loss": NaN, - "step": 190, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1585, - "grad_norm": 1.7734375, - "learning_rate": 2.4007306606043416e-05, - "long_answer_loss": 0.1585, - "loss": 0.1821, - "short_answer_loss": NaN, - "step": 191, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1971, - "grad_norm": 1.8515625, - "learning_rate": 2.3994465215660846e-05, - "long_answer_loss": 0.1971, - "loss": 0.1865, - "short_answer_loss": NaN, - "step": 192, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1936, - "grad_norm": 1.90625, - "learning_rate": 2.3981544780236963e-05, - "long_answer_loss": 0.1936, - "loss": 0.1786, - "short_answer_loss": NaN, - "step": 193, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1601, - "grad_norm": 1.96875, - "learning_rate": 2.3968545388622917e-05, - "long_answer_loss": 0.1601, - "loss": 0.184, - "short_answer_loss": NaN, - "step": 194, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1794, - "grad_norm": 1.859375, - "learning_rate": 2.39554671302128e-05, - "long_answer_loss": 0.1794, - "loss": 0.1826, - "short_answer_loss": NaN, - "step": 195, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.2061, - "grad_norm": 1.9453125, - "learning_rate": 2.3942310094943083e-05, - "long_answer_loss": 0.2061, - "loss": 0.1856, - "short_answer_loss": NaN, - "step": 196, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1974, - "grad_norm": 1.8671875, - "learning_rate": 2.3929074373291946e-05, - "long_answer_loss": 0.1974, - "loss": 0.188, - "short_answer_loss": NaN, - "step": 197, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1961, - "grad_norm": 2.109375, - "learning_rate": 2.391576005627869e-05, - "long_answer_loss": 0.1961, - "loss": 0.1941, - "short_answer_loss": NaN, - "step": 198, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1659, - "grad_norm": 2.03125, - "learning_rate": 2.3902367235463104e-05, - "long_answer_loss": 0.1659, - "loss": 0.1898, - "short_answer_loss": NaN, - "step": 199, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.221, - "grad_norm": 2.015625, - "learning_rate": 2.3888896002944815e-05, - "long_answer_loss": 0.221, - "loss": 0.195, - "short_answer_loss": NaN, - "step": 200, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.2119, - "grad_norm": 1.9140625, - "learning_rate": 2.387534645136269e-05, - "long_answer_loss": 0.2119, - "loss": 0.1854, - "short_answer_loss": NaN, - "step": 201, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.2143, - "grad_norm": 1.703125, - "learning_rate": 2.3861718673894166e-05, - "long_answer_loss": 0.2143, - "loss": 0.1867, - "short_answer_loss": NaN, - "step": 202, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1887, - "grad_norm": 2.125, - "learning_rate": 2.384801276425463e-05, - "long_answer_loss": 0.1887, - "loss": 0.1929, - "short_answer_loss": NaN, - "step": 203, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1986, - "grad_norm": 2.09375, - "learning_rate": 2.3834228816696763e-05, - "long_answer_loss": 0.1986, - "loss": 0.1885, - "short_answer_loss": NaN, - "step": 204, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.2079, - "grad_norm": 1.8515625, - "learning_rate": 2.3820366926009903e-05, - "long_answer_loss": 0.2079, - "loss": 0.1958, - "short_answer_loss": NaN, - "step": 205, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1936, - "grad_norm": 2.078125, - "learning_rate": 2.3806427187519376e-05, - "long_answer_loss": 0.1936, - "loss": 0.1938, - "short_answer_loss": NaN, - "step": 206, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1903, - "grad_norm": 1.859375, - "learning_rate": 2.3792409697085866e-05, - "long_answer_loss": 0.1903, - "loss": 0.1859, - "short_answer_loss": NaN, - "step": 207, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1969, - "grad_norm": 2.09375, - "learning_rate": 2.3778314551104725e-05, - "long_answer_loss": 0.1969, - "loss": 0.1914, - "short_answer_loss": NaN, - "step": 208, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1905, - "grad_norm": 1.9921875, - "learning_rate": 2.376414184650534e-05, - "long_answer_loss": 0.1905, - "loss": 0.1832, - "short_answer_loss": NaN, - "step": 209, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1684, - "grad_norm": 1.78125, - "learning_rate": 2.3749891680750445e-05, - "long_answer_loss": 0.1684, - "loss": 0.1765, - "short_answer_loss": NaN, - "step": 210, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1781, - "grad_norm": 2.03125, - "learning_rate": 2.3735564151835462e-05, - "long_answer_loss": 0.1781, - "loss": 0.1868, - "short_answer_loss": NaN, - "step": 211, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1583, - "grad_norm": 1.9140625, - "learning_rate": 2.3721159358287815e-05, - "long_answer_loss": 0.1583, - "loss": 0.1739, - "short_answer_loss": NaN, - "step": 212, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1895, - "grad_norm": 1.96875, - "learning_rate": 2.370667739916627e-05, - "long_answer_loss": 0.1895, - "loss": 0.1963, - "short_answer_loss": NaN, - "step": 213, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1893, - "grad_norm": 1.8671875, - "learning_rate": 2.369211837406024e-05, - "long_answer_loss": 0.1893, - "loss": 0.1824, - "short_answer_loss": NaN, - "step": 214, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.177, - "grad_norm": 1.796875, - "learning_rate": 2.3677482383089105e-05, - "long_answer_loss": 0.177, - "loss": 0.1841, - "short_answer_loss": NaN, - "step": 215, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1768, - "grad_norm": 1.8828125, - "learning_rate": 2.3662769526901526e-05, - "long_answer_loss": 0.1768, - "loss": 0.1736, - "short_answer_loss": NaN, - "step": 216, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1741, - "grad_norm": 2.015625, - "learning_rate": 2.364797990667475e-05, - "long_answer_loss": 0.1741, - "loss": 0.1913, - "short_answer_loss": NaN, - "step": 217, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1897, - "grad_norm": 1.9296875, - "learning_rate": 2.3633113624113908e-05, - "long_answer_loss": 0.1897, - "loss": 0.1854, - "short_answer_loss": NaN, - "step": 218, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1796, - "grad_norm": 1.9296875, - "learning_rate": 2.3618170781451328e-05, - "long_answer_loss": 0.1796, - "loss": 0.1837, - "short_answer_loss": NaN, - "step": 219, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1461, - "grad_norm": 1.921875, - "learning_rate": 2.3603151481445823e-05, - "long_answer_loss": 0.1461, - "loss": 0.1872, - "short_answer_loss": NaN, - "step": 220, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1872, - "grad_norm": 1.8984375, - "learning_rate": 2.3588055827381995e-05, - "long_answer_loss": 0.1872, - "loss": 0.1828, - "short_answer_loss": NaN, - "step": 221, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1991, - "grad_norm": 1.984375, - "learning_rate": 2.35728839230695e-05, - "long_answer_loss": 0.1991, - "loss": 0.1823, - "short_answer_loss": NaN, - "step": 222, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.2084, - "grad_norm": 1.8203125, - "learning_rate": 2.3557635872842372e-05, - "long_answer_loss": 0.2084, - "loss": 0.1903, - "short_answer_loss": NaN, - "step": 223, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1454, - "grad_norm": 1.8515625, - "learning_rate": 2.3542311781558263e-05, - "long_answer_loss": 0.1454, - "loss": 0.1738, - "short_answer_loss": NaN, - "step": 224, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.2094, - "grad_norm": 1.8828125, - "learning_rate": 2.3526911754597763e-05, - "long_answer_loss": 0.2094, - "loss": 0.1725, - "short_answer_loss": NaN, - "step": 225, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1653, - "grad_norm": 1.7578125, - "learning_rate": 2.3511435897863647e-05, - "long_answer_loss": 0.1653, - "loss": 0.1774, - "short_answer_loss": NaN, - "step": 226, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1597, - "grad_norm": 1.890625, - "learning_rate": 2.3495884317780154e-05, - "long_answer_loss": 0.1597, - "loss": 0.1845, - "short_answer_loss": NaN, - "step": 227, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1791, - "grad_norm": 1.6796875, - "learning_rate": 2.3480257121292254e-05, - "long_answer_loss": 0.1791, - "loss": 0.1814, - "short_answer_loss": NaN, - "step": 228, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.161, - "grad_norm": 1.7421875, - "learning_rate": 2.3464554415864927e-05, - "long_answer_loss": 0.161, - "loss": 0.1722, - "short_answer_loss": NaN, - "step": 229, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1547, - "grad_norm": 1.890625, - "learning_rate": 2.3448776309482402e-05, - "long_answer_loss": 0.1547, - "loss": 0.1762, - "short_answer_loss": NaN, - "step": 230, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.163, - "grad_norm": 1.984375, - "learning_rate": 2.3432922910647426e-05, - "long_answer_loss": 0.163, - "loss": 0.1701, - "short_answer_loss": NaN, - "step": 231, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1822, - "grad_norm": 1.9140625, - "learning_rate": 2.341699432838052e-05, - "long_answer_loss": 0.1822, - "loss": 0.1915, - "short_answer_loss": NaN, - "step": 232, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1643, - "grad_norm": 1.890625, - "learning_rate": 2.3400990672219226e-05, - "long_answer_loss": 0.1643, - "loss": 0.1743, - "short_answer_loss": NaN, - "step": 233, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1864, - "grad_norm": 1.875, - "learning_rate": 2.3384912052217345e-05, - "long_answer_loss": 0.1864, - "loss": 0.1837, - "short_answer_loss": NaN, - "step": 234, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.2088, - "grad_norm": 1.78125, - "learning_rate": 2.3368758578944205e-05, - "long_answer_loss": 0.2088, - "loss": 0.1754, - "short_answer_loss": NaN, - "step": 235, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1659, - "grad_norm": 1.6953125, - "learning_rate": 2.3352530363483866e-05, - "long_answer_loss": 0.1659, - "loss": 0.1826, - "short_answer_loss": NaN, - "step": 236, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1785, - "grad_norm": 1.8046875, - "learning_rate": 2.3336227517434385e-05, - "long_answer_loss": 0.1785, - "loss": 0.1885, - "short_answer_loss": NaN, - "step": 237, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1631, - "grad_norm": 1.765625, - "learning_rate": 2.331985015290704e-05, - "long_answer_loss": 0.1631, - "loss": 0.1812, - "short_answer_loss": NaN, - "step": 238, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1705, - "grad_norm": 1.8359375, - "learning_rate": 2.330339838252555e-05, - "long_answer_loss": 0.1705, - "loss": 0.1753, - "short_answer_loss": NaN, - "step": 239, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1575, - "grad_norm": 1.8984375, - "learning_rate": 2.3286872319425312e-05, - "long_answer_loss": 0.1575, - "loss": 0.1864, - "short_answer_loss": NaN, - "step": 240, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1795, - "grad_norm": 1.90625, - "learning_rate": 2.3270272077252613e-05, - "long_answer_loss": 0.1795, - "loss": 0.18, - "short_answer_loss": NaN, - "step": 241, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1681, - "grad_norm": 1.8671875, - "learning_rate": 2.3253597770163866e-05, - "long_answer_loss": 0.1681, - "loss": 0.1814, - "short_answer_loss": NaN, - "step": 242, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1861, - "grad_norm": 1.8671875, - "learning_rate": 2.3236849512824793e-05, - "long_answer_loss": 0.1861, - "loss": 0.184, - "short_answer_loss": NaN, - "step": 243, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1786, - "grad_norm": 2.0, - "learning_rate": 2.322002742040968e-05, - "long_answer_loss": 0.1786, - "loss": 0.1818, - "short_answer_loss": NaN, - "step": 244, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1927, - "grad_norm": 2.078125, - "learning_rate": 2.3203131608600548e-05, - "long_answer_loss": 0.1927, - "loss": 0.1988, - "short_answer_loss": NaN, - "step": 245, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1746, - "grad_norm": 1.7265625, - "learning_rate": 2.318616219358637e-05, - "long_answer_loss": 0.1746, - "loss": 0.1711, - "short_answer_loss": NaN, - "step": 246, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.159, - "grad_norm": 1.875, - "learning_rate": 2.3169119292062273e-05, - "long_answer_loss": 0.159, - "loss": 0.181, - "short_answer_loss": NaN, - "step": 247, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1809, - "grad_norm": 1.9296875, - "learning_rate": 2.3152003021228746e-05, - "long_answer_loss": 0.1809, - "loss": 0.1942, - "short_answer_loss": NaN, - "step": 248, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1779, - "grad_norm": 1.71875, - "learning_rate": 2.3134813498790814e-05, - "long_answer_loss": 0.1779, - "loss": 0.1757, - "short_answer_loss": NaN, - "step": 249, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1589, - "grad_norm": 1.8203125, - "learning_rate": 2.311755084295723e-05, - "long_answer_loss": 0.1589, - "loss": 0.1693, - "short_answer_loss": NaN, - "step": 250, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1904, - "grad_norm": 1.953125, - "learning_rate": 2.3100215172439693e-05, - "long_answer_loss": 0.1904, - "loss": 0.1824, - "short_answer_loss": NaN, - "step": 251, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1714, - "grad_norm": 1.90625, - "learning_rate": 2.308280660645199e-05, - "long_answer_loss": 0.1714, - "loss": 0.1849, - "short_answer_loss": NaN, - "step": 252, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.203, - "grad_norm": 1.7890625, - "learning_rate": 2.3065325264709196e-05, - "long_answer_loss": 0.203, - "loss": 0.1783, - "short_answer_loss": NaN, - "step": 253, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1694, - "grad_norm": 2.015625, - "learning_rate": 2.3047771267426866e-05, - "long_answer_loss": 0.1694, - "loss": 0.1777, - "short_answer_loss": NaN, - "step": 254, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.2025, - "grad_norm": 1.8828125, - "learning_rate": 2.303014473532017e-05, - "long_answer_loss": 0.2025, - "loss": 0.1794, - "short_answer_loss": NaN, - "step": 255, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1735, - "grad_norm": 1.71875, - "learning_rate": 2.3012445789603093e-05, - "long_answer_loss": 0.1735, - "loss": 0.1752, - "short_answer_loss": NaN, - "step": 256, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1522, - "grad_norm": 2.015625, - "learning_rate": 2.2994674551987605e-05, - "long_answer_loss": 0.1522, - "loss": 0.1789, - "short_answer_loss": NaN, - "step": 257, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1737, - "grad_norm": 1.9765625, - "learning_rate": 2.2976831144682797e-05, - "long_answer_loss": 0.1737, - "loss": 0.18, - "short_answer_loss": NaN, - "step": 258, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1774, - "grad_norm": 1.8671875, - "learning_rate": 2.295891569039406e-05, - "long_answer_loss": 0.1774, - "loss": 0.1879, - "short_answer_loss": NaN, - "step": 259, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1798, - "grad_norm": 1.8359375, - "learning_rate": 2.2940928312322246e-05, - "long_answer_loss": 0.1798, - "loss": 0.1803, - "short_answer_loss": NaN, - "step": 260, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.2178, - "grad_norm": 1.921875, - "learning_rate": 2.29228691341628e-05, - "long_answer_loss": 0.2178, - "loss": 0.1801, - "short_answer_loss": NaN, - "step": 261, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1985, - "grad_norm": 1.90625, - "learning_rate": 2.2904738280104927e-05, - "long_answer_loss": 0.1985, - "loss": 0.1893, - "short_answer_loss": NaN, - "step": 262, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1651, - "grad_norm": 1.7578125, - "learning_rate": 2.2886535874830726e-05, - "long_answer_loss": 0.1651, - "loss": 0.1812, - "short_answer_loss": NaN, - "step": 263, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.2057, - "grad_norm": 1.7578125, - "learning_rate": 2.286826204351435e-05, - "long_answer_loss": 0.2057, - "loss": 0.1824, - "short_answer_loss": NaN, - "step": 264, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1897, - "grad_norm": 1.9453125, - "learning_rate": 2.284991691182113e-05, - "long_answer_loss": 0.1897, - "loss": 0.1896, - "short_answer_loss": NaN, - "step": 265, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1933, - "grad_norm": 1.890625, - "learning_rate": 2.2831500605906702e-05, - "long_answer_loss": 0.1933, - "loss": 0.1734, - "short_answer_loss": NaN, - "step": 266, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1577, - "grad_norm": 1.7109375, - "learning_rate": 2.281301325241617e-05, - "long_answer_loss": 0.1577, - "loss": 0.1691, - "short_answer_loss": NaN, - "step": 267, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.161, - "grad_norm": 1.9375, - "learning_rate": 2.279445497848321e-05, - "long_answer_loss": 0.161, - "loss": 0.181, - "short_answer_loss": NaN, - "step": 268, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1905, - "grad_norm": 2.046875, - "learning_rate": 2.2775825911729207e-05, - "long_answer_loss": 0.1905, - "loss": 0.1961, - "short_answer_loss": NaN, - "step": 269, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1823, - "grad_norm": 1.75, - "learning_rate": 2.275712618026236e-05, - "long_answer_loss": 0.1823, - "loss": 0.1751, - "short_answer_loss": NaN, - "step": 270, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1653, - "grad_norm": 1.8984375, - "learning_rate": 2.2738355912676838e-05, - "long_answer_loss": 0.1653, - "loss": 0.1805, - "short_answer_loss": NaN, - "step": 271, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1818, - "grad_norm": 1.859375, - "learning_rate": 2.2719515238051846e-05, - "long_answer_loss": 0.1818, - "loss": 0.1761, - "short_answer_loss": NaN, - "step": 272, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1863, - "grad_norm": 1.984375, - "learning_rate": 2.2700604285950783e-05, - "long_answer_loss": 0.1863, - "loss": 0.1864, - "short_answer_loss": NaN, - "step": 273, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1573, - "grad_norm": 2.078125, - "learning_rate": 2.2681623186420323e-05, - "long_answer_loss": 0.1573, - "loss": 0.1811, - "short_answer_loss": NaN, - "step": 274, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1858, - "grad_norm": 1.71875, - "learning_rate": 2.266257206998953e-05, - "long_answer_loss": 0.1858, - "loss": 0.1731, - "short_answer_loss": NaN, - "step": 275, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.185, - "grad_norm": 2.125, - "learning_rate": 2.264345106766896e-05, - "long_answer_loss": 0.185, - "loss": 0.1823, - "short_answer_loss": NaN, - "step": 276, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.204, - "grad_norm": 2.03125, - "learning_rate": 2.2624260310949763e-05, - "long_answer_loss": 0.204, - "loss": 0.1764, - "short_answer_loss": NaN, - "step": 277, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1896, - "grad_norm": 1.7265625, - "learning_rate": 2.2604999931802773e-05, - "long_answer_loss": 0.1896, - "loss": 0.1822, - "short_answer_loss": NaN, - "step": 278, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.191, - "grad_norm": 1.859375, - "learning_rate": 2.25856700626776e-05, - "long_answer_loss": 0.191, - "loss": 0.1832, - "short_answer_loss": NaN, - "step": 279, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1685, - "grad_norm": 1.90625, - "learning_rate": 2.2566270836501725e-05, - "long_answer_loss": 0.1685, - "loss": 0.1778, - "short_answer_loss": NaN, - "step": 280, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.171, - "grad_norm": 1.6875, - "learning_rate": 2.2546802386679585e-05, - "long_answer_loss": 0.171, - "loss": 0.1776, - "short_answer_loss": NaN, - "step": 281, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.2089, - "grad_norm": 1.8125, - "learning_rate": 2.2527264847091652e-05, - "long_answer_loss": 0.2089, - "loss": 0.1911, - "short_answer_loss": NaN, - "step": 282, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1788, - "grad_norm": 1.7578125, - "learning_rate": 2.2507658352093503e-05, - "long_answer_loss": 0.1788, - "loss": 0.1738, - "short_answer_loss": NaN, - "step": 283, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1825, - "grad_norm": 1.921875, - "learning_rate": 2.2487983036514932e-05, - "long_answer_loss": 0.1825, - "loss": 0.179, - "short_answer_loss": NaN, - "step": 284, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1738, - "grad_norm": 1.828125, - "learning_rate": 2.2468239035658972e-05, - "long_answer_loss": 0.1738, - "loss": 0.1746, - "short_answer_loss": NaN, - "step": 285, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1766, - "grad_norm": 1.75, - "learning_rate": 2.2448426485301006e-05, - "long_answer_loss": 0.1766, - "loss": 0.1774, - "short_answer_loss": NaN, - "step": 286, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1972, - "grad_norm": 1.6171875, - "learning_rate": 2.2428545521687816e-05, - "long_answer_loss": 0.1972, - "loss": 0.1653, - "short_answer_loss": NaN, - "step": 287, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1686, - "grad_norm": 1.8984375, - "learning_rate": 2.2408596281536638e-05, - "long_answer_loss": 0.1686, - "loss": 0.1737, - "short_answer_loss": NaN, - "step": 288, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1981, - "grad_norm": 1.8984375, - "learning_rate": 2.2388578902034243e-05, - "long_answer_loss": 0.1981, - "loss": 0.1735, - "short_answer_loss": NaN, - "step": 289, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1923, - "grad_norm": 1.9453125, - "learning_rate": 2.2368493520835977e-05, - "long_answer_loss": 0.1923, - "loss": 0.1842, - "short_answer_loss": NaN, - "step": 290, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.2005, - "grad_norm": 1.828125, - "learning_rate": 2.2348340276064816e-05, - "long_answer_loss": 0.2005, - "loss": 0.1797, - "short_answer_loss": NaN, - "step": 291, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.179, - "grad_norm": 1.9296875, - "learning_rate": 2.2328119306310423e-05, - "long_answer_loss": 0.179, - "loss": 0.1784, - "short_answer_loss": NaN, - "step": 292, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1715, - "grad_norm": 1.8046875, - "learning_rate": 2.23078307506282e-05, - "long_answer_loss": 0.1715, - "loss": 0.1694, - "short_answer_loss": NaN, - "step": 293, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1441, - "grad_norm": 1.703125, - "learning_rate": 2.2287474748538308e-05, - "long_answer_loss": 0.1441, - "loss": 0.1689, - "short_answer_loss": NaN, - "step": 294, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1673, - "grad_norm": 1.875, - "learning_rate": 2.2267051440024734e-05, - "long_answer_loss": 0.1673, - "loss": 0.178, - "short_answer_loss": NaN, - "step": 295, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.2067, - "grad_norm": 1.796875, - "learning_rate": 2.2246560965534312e-05, - "long_answer_loss": 0.2067, - "loss": 0.1841, - "short_answer_loss": NaN, - "step": 296, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1729, - "grad_norm": 1.75, - "learning_rate": 2.222600346597576e-05, - "long_answer_loss": 0.1729, - "loss": 0.1665, - "short_answer_loss": NaN, - "step": 297, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1546, - "grad_norm": 1.921875, - "learning_rate": 2.2205379082718725e-05, - "long_answer_loss": 0.1546, - "loss": 0.1866, - "short_answer_loss": NaN, - "step": 298, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1767, - "grad_norm": 1.765625, - "learning_rate": 2.2184687957592786e-05, - "long_answer_loss": 0.1767, - "loss": 0.1773, - "short_answer_loss": NaN, - "step": 299, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1712, - "grad_norm": 1.7578125, - "learning_rate": 2.21639302328865e-05, - "long_answer_loss": 0.1712, - "loss": 0.1779, - "short_answer_loss": NaN, - "step": 300, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1846, - "grad_norm": 1.9609375, - "learning_rate": 2.2143106051346407e-05, - "long_answer_loss": 0.1846, - "loss": 0.1862, - "short_answer_loss": NaN, - "step": 301, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1772, - "grad_norm": 1.8671875, - "learning_rate": 2.2122215556176074e-05, - "long_answer_loss": 0.1772, - "loss": 0.1744, - "short_answer_loss": NaN, - "step": 302, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.172, - "grad_norm": 1.765625, - "learning_rate": 2.2101258891035075e-05, - "long_answer_loss": 0.172, - "loss": 0.1764, - "short_answer_loss": NaN, - "step": 303, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1873, - "grad_norm": 1.9609375, - "learning_rate": 2.2080236200038026e-05, - "long_answer_loss": 0.1873, - "loss": 0.1825, - "short_answer_loss": NaN, - "step": 304, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1944, - "grad_norm": 1.96875, - "learning_rate": 2.2059147627753595e-05, - "long_answer_loss": 0.1944, - "loss": 0.1785, - "short_answer_loss": NaN, - "step": 305, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1921, - "grad_norm": 1.765625, - "learning_rate": 2.2037993319203498e-05, - "long_answer_loss": 0.1921, - "loss": 0.1716, - "short_answer_loss": NaN, - "step": 306, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1942, - "grad_norm": 1.9765625, - "learning_rate": 2.20167734198615e-05, - "long_answer_loss": 0.1942, - "loss": 0.1826, - "short_answer_loss": NaN, - "step": 307, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1896, - "grad_norm": 2.015625, - "learning_rate": 2.1995488075652433e-05, - "long_answer_loss": 0.1896, - "loss": 0.1821, - "short_answer_loss": NaN, - "step": 308, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1871, - "grad_norm": 1.9140625, - "learning_rate": 2.1974137432951165e-05, - "long_answer_loss": 0.1871, - "loss": 0.1877, - "short_answer_loss": NaN, - "step": 309, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1826, - "grad_norm": 1.734375, - "learning_rate": 2.195272163858162e-05, - "long_answer_loss": 0.1826, - "loss": 0.1769, - "short_answer_loss": NaN, - "step": 310, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1859, - "grad_norm": 2.03125, - "learning_rate": 2.193124083981575e-05, - "long_answer_loss": 0.1859, - "loss": 0.1716, - "short_answer_loss": NaN, - "step": 311, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1858, - "grad_norm": 1.765625, - "learning_rate": 2.190969518437253e-05, - "long_answer_loss": 0.1858, - "loss": 0.1744, - "short_answer_loss": NaN, - "step": 312, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1558, - "grad_norm": 1.7578125, - "learning_rate": 2.1888084820416944e-05, - "long_answer_loss": 0.1558, - "loss": 0.1725, - "short_answer_loss": NaN, - "step": 313, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1731, - "grad_norm": 1.8359375, - "learning_rate": 2.186640989655896e-05, - "long_answer_loss": 0.1731, - "loss": 0.189, - "short_answer_loss": NaN, - "step": 314, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1705, - "grad_norm": 1.7734375, - "learning_rate": 2.1844670561852508e-05, - "long_answer_loss": 0.1705, - "loss": 0.1745, - "short_answer_loss": NaN, - "step": 315, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1566, - "grad_norm": 2.046875, - "learning_rate": 2.1822866965794465e-05, - "long_answer_loss": 0.1566, - "loss": 0.1757, - "short_answer_loss": NaN, - "step": 316, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1705, - "grad_norm": 1.953125, - "learning_rate": 2.180099925832361e-05, - "long_answer_loss": 0.1705, - "loss": 0.173, - "short_answer_loss": NaN, - "step": 317, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1721, - "grad_norm": 1.8828125, - "learning_rate": 2.177906758981962e-05, - "long_answer_loss": 0.1721, - "loss": 0.1692, - "short_answer_loss": NaN, - "step": 318, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1687, - "grad_norm": 2.015625, - "learning_rate": 2.1757072111101994e-05, - "long_answer_loss": 0.1687, - "loss": 0.1773, - "short_answer_loss": NaN, - "step": 319, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1731, - "grad_norm": 1.9140625, - "learning_rate": 2.1735012973429068e-05, - "long_answer_loss": 0.1731, - "loss": 0.1768, - "short_answer_loss": NaN, - "step": 320, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1698, - "grad_norm": 1.9375, - "learning_rate": 2.1712890328496927e-05, - "long_answer_loss": 0.1698, - "loss": 0.1804, - "short_answer_loss": NaN, - "step": 321, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1582, - "grad_norm": 1.9375, - "learning_rate": 2.1690704328438384e-05, - "long_answer_loss": 0.1582, - "loss": 0.1813, - "short_answer_loss": NaN, - "step": 322, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1814, - "grad_norm": 1.765625, - "learning_rate": 2.1668455125821945e-05, - "long_answer_loss": 0.1814, - "loss": 0.1804, - "short_answer_loss": NaN, - "step": 323, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.2042, - "grad_norm": 1.9296875, - "learning_rate": 2.1646142873650738e-05, - "long_answer_loss": 0.2042, - "loss": 0.1828, - "short_answer_loss": NaN, - "step": 324, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1851, - "grad_norm": 1.828125, - "learning_rate": 2.1623767725361466e-05, - "long_answer_loss": 0.1851, - "loss": 0.1888, - "short_answer_loss": NaN, - "step": 325, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1432, - "grad_norm": 1.7734375, - "learning_rate": 2.160132983482336e-05, - "long_answer_loss": 0.1432, - "loss": 0.1752, - "short_answer_loss": NaN, - "step": 326, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1712, - "grad_norm": 1.8046875, - "learning_rate": 2.157882935633712e-05, - "long_answer_loss": 0.1712, - "loss": 0.1735, - "short_answer_loss": NaN, - "step": 327, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1751, - "grad_norm": 1.828125, - "learning_rate": 2.1556266444633845e-05, - "long_answer_loss": 0.1751, - "loss": 0.1697, - "short_answer_loss": NaN, - "step": 328, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1779, - "grad_norm": 1.7578125, - "learning_rate": 2.153364125487397e-05, - "long_answer_loss": 0.1779, - "loss": 0.1869, - "short_answer_loss": NaN, - "step": 329, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1904, - "grad_norm": 1.8046875, - "learning_rate": 2.1510953942646215e-05, - "long_answer_loss": 0.1904, - "loss": 0.1821, - "short_answer_loss": NaN, - "step": 330, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1818, - "grad_norm": 1.859375, - "learning_rate": 2.1488204663966498e-05, - "long_answer_loss": 0.1818, - "loss": 0.1789, - "short_answer_loss": NaN, - "step": 331, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1572, - "grad_norm": 1.6953125, - "learning_rate": 2.1465393575276867e-05, - "long_answer_loss": 0.1572, - "loss": 0.1721, - "short_answer_loss": NaN, - "step": 332, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1797, - "grad_norm": 1.640625, - "learning_rate": 2.1442520833444416e-05, - "long_answer_loss": 0.1797, - "loss": 0.1697, - "short_answer_loss": NaN, - "step": 333, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1511, - "grad_norm": 1.7421875, - "learning_rate": 2.1419586595760226e-05, - "long_answer_loss": 0.1511, - "loss": 0.1778, - "short_answer_loss": NaN, - "step": 334, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1576, - "grad_norm": 1.6875, - "learning_rate": 2.1396591019938278e-05, - "long_answer_loss": 0.1576, - "loss": 0.1691, - "short_answer_loss": NaN, - "step": 335, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1842, - "grad_norm": 1.5859375, - "learning_rate": 2.1373534264114344e-05, - "long_answer_loss": 0.1842, - "loss": 0.1698, - "short_answer_loss": NaN, - "step": 336, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1586, - "grad_norm": 1.734375, - "learning_rate": 2.1350416486844928e-05, - "long_answer_loss": 0.1586, - "loss": 0.1704, - "short_answer_loss": NaN, - "step": 337, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1903, - "grad_norm": 1.7109375, - "learning_rate": 2.1327237847106167e-05, - "long_answer_loss": 0.1903, - "loss": 0.1772, - "short_answer_loss": NaN, - "step": 338, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.161, - "grad_norm": 1.6328125, - "learning_rate": 2.130399850429274e-05, - "long_answer_loss": 0.161, - "loss": 0.1591, - "short_answer_loss": NaN, - "step": 339, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1558, - "grad_norm": 1.78125, - "learning_rate": 2.1280698618216757e-05, - "long_answer_loss": 0.1558, - "loss": 0.1735, - "short_answer_loss": NaN, - "step": 340, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1741, - "grad_norm": 1.8203125, - "learning_rate": 2.125733834910668e-05, - "long_answer_loss": 0.1741, - "loss": 0.169, - "short_answer_loss": NaN, - "step": 341, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1632, - "grad_norm": 1.71875, - "learning_rate": 2.1233917857606212e-05, - "long_answer_loss": 0.1632, - "loss": 0.1762, - "short_answer_loss": NaN, - "step": 342, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1648, - "grad_norm": 1.7734375, - "learning_rate": 2.1210437304773185e-05, - "long_answer_loss": 0.1648, - "loss": 0.1732, - "short_answer_loss": NaN, - "step": 343, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1699, - "grad_norm": 1.765625, - "learning_rate": 2.1186896852078476e-05, - "long_answer_loss": 0.1699, - "loss": 0.176, - "short_answer_loss": NaN, - "step": 344, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1685, - "grad_norm": 1.796875, - "learning_rate": 2.1163296661404864e-05, - "long_answer_loss": 0.1685, - "loss": 0.1694, - "short_answer_loss": NaN, - "step": 345, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1652, - "grad_norm": 1.8125, - "learning_rate": 2.113963689504594e-05, - "long_answer_loss": 0.1652, - "loss": 0.167, - "short_answer_loss": NaN, - "step": 346, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1597, - "grad_norm": 1.7734375, - "learning_rate": 2.111591771570499e-05, - "long_answer_loss": 0.1597, - "loss": 0.1695, - "short_answer_loss": NaN, - "step": 347, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.2015, - "grad_norm": 1.7734375, - "learning_rate": 2.1092139286493866e-05, - "long_answer_loss": 0.2015, - "loss": 0.1803, - "short_answer_loss": NaN, - "step": 348, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1709, - "grad_norm": 1.90625, - "learning_rate": 2.106830177093187e-05, - "long_answer_loss": 0.1709, - "loss": 0.1764, - "short_answer_loss": NaN, - "step": 349, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.2019, - "grad_norm": 1.703125, - "learning_rate": 2.104440533294462e-05, - "long_answer_loss": 0.2019, - "loss": 0.1725, - "short_answer_loss": NaN, - "step": 350, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1753, - "grad_norm": 1.6875, - "learning_rate": 2.1020450136862953e-05, - "long_answer_loss": 0.1753, - "loss": 0.173, - "short_answer_loss": NaN, - "step": 351, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.18, - "grad_norm": 1.7265625, - "learning_rate": 2.0996436347421744e-05, - "long_answer_loss": 0.18, - "loss": 0.1768, - "short_answer_loss": NaN, - "step": 352, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1674, - "grad_norm": 1.640625, - "learning_rate": 2.0972364129758825e-05, - "long_answer_loss": 0.1674, - "loss": 0.1755, - "short_answer_loss": NaN, - "step": 353, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1793, - "grad_norm": 1.7734375, - "learning_rate": 2.0948233649413815e-05, - "long_answer_loss": 0.1793, - "loss": 0.1821, - "short_answer_loss": NaN, - "step": 354, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1591, - "grad_norm": 1.78125, - "learning_rate": 2.0924045072327003e-05, - "long_answer_loss": 0.1591, - "loss": 0.1628, - "short_answer_loss": NaN, - "step": 355, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1624, - "grad_norm": 1.640625, - "learning_rate": 2.089979856483819e-05, - "long_answer_loss": 0.1624, - "loss": 0.1692, - "short_answer_loss": NaN, - "step": 356, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1785, - "grad_norm": 1.765625, - "learning_rate": 2.0875494293685548e-05, - "long_answer_loss": 0.1785, - "loss": 0.1717, - "short_answer_loss": NaN, - "step": 357, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1436, - "grad_norm": 1.9375, - "learning_rate": 2.0851132426004492e-05, - "long_answer_loss": 0.1436, - "loss": 0.1697, - "short_answer_loss": NaN, - "step": 358, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1671, - "grad_norm": 1.8203125, - "learning_rate": 2.082671312932651e-05, - "long_answer_loss": 0.1671, - "loss": 0.1679, - "short_answer_loss": NaN, - "step": 359, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1741, - "grad_norm": 1.8359375, - "learning_rate": 2.0802236571578e-05, - "long_answer_loss": 0.1741, - "loss": 0.1639, - "short_answer_loss": NaN, - "step": 360, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.179, - "grad_norm": 1.8046875, - "learning_rate": 2.0777702921079163e-05, - "long_answer_loss": 0.179, - "loss": 0.1703, - "short_answer_loss": NaN, - "step": 361, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1563, - "grad_norm": 1.90625, - "learning_rate": 2.075311234654279e-05, - "long_answer_loss": 0.1563, - "loss": 0.1645, - "short_answer_loss": NaN, - "step": 362, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1723, - "grad_norm": 1.7890625, - "learning_rate": 2.072846501707314e-05, - "long_answer_loss": 0.1723, - "loss": 0.1693, - "short_answer_loss": NaN, - "step": 363, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1928, - "grad_norm": 1.6796875, - "learning_rate": 2.0703761102164764e-05, - "long_answer_loss": 0.1928, - "loss": 0.164, - "short_answer_loss": NaN, - "step": 364, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1731, - "grad_norm": 1.7890625, - "learning_rate": 2.0679000771701326e-05, - "long_answer_loss": 0.1731, - "loss": 0.1676, - "short_answer_loss": NaN, - "step": 365, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1856, - "grad_norm": 1.8515625, - "learning_rate": 2.0654184195954465e-05, - "long_answer_loss": 0.1856, - "loss": 0.1661, - "short_answer_loss": NaN, - "step": 366, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1614, - "grad_norm": 1.6875, - "learning_rate": 2.0629311545582598e-05, - "long_answer_loss": 0.1614, - "loss": 0.1805, - "short_answer_loss": NaN, - "step": 367, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.172, - "grad_norm": 1.71875, - "learning_rate": 2.0604382991629755e-05, - "long_answer_loss": 0.172, - "loss": 0.1736, - "short_answer_loss": NaN, - "step": 368, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1353, - "grad_norm": 1.875, - "learning_rate": 2.057939870552441e-05, - "long_answer_loss": 0.1353, - "loss": 0.1733, - "short_answer_loss": NaN, - "step": 369, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1488, - "grad_norm": 1.75, - "learning_rate": 2.0554358859078284e-05, - "long_answer_loss": 0.1488, - "loss": 0.1699, - "short_answer_loss": NaN, - "step": 370, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1754, - "grad_norm": 1.6484375, - "learning_rate": 2.0529263624485183e-05, - "long_answer_loss": 0.1754, - "loss": 0.1711, - "short_answer_loss": NaN, - "step": 371, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1752, - "grad_norm": 1.8984375, - "learning_rate": 2.0504113174319812e-05, - "long_answer_loss": 0.1752, - "loss": 0.1788, - "short_answer_loss": NaN, - "step": 372, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1742, - "grad_norm": 1.75, - "learning_rate": 2.0478907681536564e-05, - "long_answer_loss": 0.1742, - "loss": 0.1839, - "short_answer_loss": NaN, - "step": 373, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1508, - "grad_norm": 1.71875, - "learning_rate": 2.0453647319468368e-05, - "long_answer_loss": 0.1508, - "loss": 0.1696, - "short_answer_loss": NaN, - "step": 374, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1675, - "grad_norm": 1.7265625, - "learning_rate": 2.0428332261825456e-05, - "long_answer_loss": 0.1675, - "loss": 0.1737, - "short_answer_loss": NaN, - "step": 375, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1773, - "grad_norm": 1.8984375, - "learning_rate": 2.0402962682694214e-05, - "long_answer_loss": 0.1773, - "loss": 0.1843, - "short_answer_loss": NaN, - "step": 376, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.19, - "grad_norm": 1.9375, - "learning_rate": 2.0377538756535947e-05, - "long_answer_loss": 0.19, - "loss": 0.1772, - "short_answer_loss": NaN, - "step": 377, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1942, - "grad_norm": 1.9140625, - "learning_rate": 2.0352060658185696e-05, - "long_answer_loss": 0.1942, - "loss": 0.1824, - "short_answer_loss": NaN, - "step": 378, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1905, - "grad_norm": 1.9140625, - "learning_rate": 2.0326528562851028e-05, - "long_answer_loss": 0.1905, - "loss": 0.1717, - "short_answer_loss": NaN, - "step": 379, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1579, - "grad_norm": 1.84375, - "learning_rate": 2.030094264611084e-05, - "long_answer_loss": 0.1579, - "loss": 0.1621, - "short_answer_loss": NaN, - "step": 380, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1883, - "grad_norm": 1.796875, - "learning_rate": 2.027530308391416e-05, - "long_answer_loss": 0.1883, - "loss": 0.1653, - "short_answer_loss": NaN, - "step": 381, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1601, - "grad_norm": 1.7890625, - "learning_rate": 2.02496100525789e-05, - "long_answer_loss": 0.1601, - "loss": 0.1722, - "short_answer_loss": NaN, - "step": 382, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1761, - "grad_norm": 1.828125, - "learning_rate": 2.0223863728790682e-05, - "long_answer_loss": 0.1761, - "loss": 0.1696, - "short_answer_loss": NaN, - "step": 383, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.192, - "grad_norm": 1.78125, - "learning_rate": 2.0198064289601615e-05, - "long_answer_loss": 0.192, - "loss": 0.166, - "short_answer_loss": NaN, - "step": 384, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.2026, - "grad_norm": 1.890625, - "learning_rate": 2.017221191242906e-05, - "long_answer_loss": 0.2026, - "loss": 0.1795, - "short_answer_loss": NaN, - "step": 385, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1869, - "grad_norm": 1.8359375, - "learning_rate": 2.014630677505443e-05, - "long_answer_loss": 0.1869, - "loss": 0.1652, - "short_answer_loss": NaN, - "step": 386, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1679, - "grad_norm": 1.625, - "learning_rate": 2.0120349055621952e-05, - "long_answer_loss": 0.1679, - "loss": 0.173, - "short_answer_loss": NaN, - "step": 387, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1758, - "grad_norm": 1.671875, - "learning_rate": 2.0094338932637447e-05, - "long_answer_loss": 0.1758, - "loss": 0.1693, - "short_answer_loss": NaN, - "step": 388, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1879, - "grad_norm": 1.671875, - "learning_rate": 2.0068276584967113e-05, - "long_answer_loss": 0.1879, - "loss": 0.1621, - "short_answer_loss": NaN, - "step": 389, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1635, - "grad_norm": 1.765625, - "learning_rate": 2.0042162191836285e-05, - "long_answer_loss": 0.1635, - "loss": 0.1748, - "short_answer_loss": NaN, - "step": 390, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1437, - "grad_norm": 1.7890625, - "learning_rate": 2.00159959328282e-05, - "long_answer_loss": 0.1437, - "loss": 0.1693, - "short_answer_loss": NaN, - "step": 391, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1727, - "grad_norm": 1.71875, - "learning_rate": 1.9989777987882763e-05, - "long_answer_loss": 0.1727, - "loss": 0.1669, - "short_answer_loss": NaN, - "step": 392, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1738, - "grad_norm": 1.75, - "learning_rate": 1.996350853729532e-05, - "long_answer_loss": 0.1738, - "loss": 0.1747, - "short_answer_loss": NaN, - "step": 393, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1698, - "grad_norm": 1.796875, - "learning_rate": 1.993718776171541e-05, - "long_answer_loss": 0.1698, - "loss": 0.1653, - "short_answer_loss": NaN, - "step": 394, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1624, - "grad_norm": 1.796875, - "learning_rate": 1.9910815842145513e-05, - "long_answer_loss": 0.1624, - "loss": 0.1777, - "short_answer_loss": NaN, - "step": 395, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1493, - "grad_norm": 1.8203125, - "learning_rate": 1.9884392959939824e-05, - "long_answer_loss": 0.1493, - "loss": 0.1654, - "short_answer_loss": NaN, - "step": 396, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.166, - "grad_norm": 1.7109375, - "learning_rate": 1.9857919296803e-05, - "long_answer_loss": 0.166, - "loss": 0.1705, - "short_answer_loss": NaN, - "step": 397, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1814, - "grad_norm": 1.890625, - "learning_rate": 1.9831395034788904e-05, - "long_answer_loss": 0.1814, - "loss": 0.1733, - "short_answer_loss": NaN, - "step": 398, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.161, - "grad_norm": 1.90625, - "learning_rate": 1.9804820356299356e-05, - "long_answer_loss": 0.161, - "loss": 0.1688, - "short_answer_loss": NaN, - "step": 399, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1599, - "grad_norm": 1.703125, - "learning_rate": 1.9778195444082877e-05, - "long_answer_loss": 0.1599, - "loss": 0.1657, - "short_answer_loss": NaN, - "step": 400, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1777, - "grad_norm": 1.8828125, - "learning_rate": 1.9751520481233445e-05, - "long_answer_loss": 0.1777, - "loss": 0.1851, - "short_answer_loss": NaN, - "step": 401, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1591, - "grad_norm": 1.6796875, - "learning_rate": 1.9724795651189214e-05, - "long_answer_loss": 0.1591, - "loss": 0.1582, - "short_answer_loss": NaN, - "step": 402, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1798, - "grad_norm": 1.6875, - "learning_rate": 1.969802113773127e-05, - "long_answer_loss": 0.1798, - "loss": 0.1707, - "short_answer_loss": NaN, - "step": 403, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1611, - "grad_norm": 1.75, - "learning_rate": 1.967119712498236e-05, - "long_answer_loss": 0.1611, - "loss": 0.1632, - "short_answer_loss": NaN, - "step": 404, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1505, - "grad_norm": 1.6953125, - "learning_rate": 1.9644323797405633e-05, - "long_answer_loss": 0.1505, - "loss": 0.1616, - "short_answer_loss": NaN, - "step": 405, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.2129, - "grad_norm": 1.625, - "learning_rate": 1.961740133980336e-05, - "long_answer_loss": 0.2129, - "loss": 0.1712, - "short_answer_loss": NaN, - "step": 406, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1631, - "grad_norm": 1.65625, - "learning_rate": 1.959042993731567e-05, - "long_answer_loss": 0.1631, - "loss": 0.1586, - "short_answer_loss": NaN, - "step": 407, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1718, - "grad_norm": 1.671875, - "learning_rate": 1.956340977541927e-05, - "long_answer_loss": 0.1718, - "loss": 0.1687, - "short_answer_loss": NaN, - "step": 408, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1629, - "grad_norm": 1.6875, - "learning_rate": 1.9536341039926186e-05, - "long_answer_loss": 0.1629, - "loss": 0.1702, - "short_answer_loss": NaN, - "step": 409, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1599, - "grad_norm": 1.6953125, - "learning_rate": 1.9509223916982472e-05, - "long_answer_loss": 0.1599, - "loss": 0.1605, - "short_answer_loss": NaN, - "step": 410, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1586, - "grad_norm": 1.703125, - "learning_rate": 1.9482058593066923e-05, - "long_answer_loss": 0.1586, - "loss": 0.171, - "short_answer_loss": NaN, - "step": 411, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1573, - "grad_norm": 1.7421875, - "learning_rate": 1.9454845254989818e-05, - "long_answer_loss": 0.1573, - "loss": 0.1675, - "short_answer_loss": NaN, - "step": 412, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.177, - "grad_norm": 1.8125, - "learning_rate": 1.9427584089891598e-05, - "long_answer_loss": 0.177, - "loss": 0.1752, - "short_answer_loss": NaN, - "step": 413, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1527, - "grad_norm": 1.5625, - "learning_rate": 1.9400275285241624e-05, - "long_answer_loss": 0.1527, - "loss": 0.1598, - "short_answer_loss": NaN, - "step": 414, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1577, - "grad_norm": 1.7890625, - "learning_rate": 1.9372919028836855e-05, - "long_answer_loss": 0.1577, - "loss": 0.1734, - "short_answer_loss": NaN, - "step": 415, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.183, - "grad_norm": 1.703125, - "learning_rate": 1.9345515508800556e-05, - "long_answer_loss": 0.183, - "loss": 0.1676, - "short_answer_loss": NaN, - "step": 416, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1698, - "grad_norm": 1.7578125, - "learning_rate": 1.931806491358102e-05, - "long_answer_loss": 0.1698, - "loss": 0.1672, - "short_answer_loss": NaN, - "step": 417, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1583, - "grad_norm": 1.8828125, - "learning_rate": 1.929056743195028e-05, - "long_answer_loss": 0.1583, - "loss": 0.172, - "short_answer_loss": NaN, - "step": 418, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1488, - "grad_norm": 1.8125, - "learning_rate": 1.9263023253002773e-05, - "long_answer_loss": 0.1488, - "loss": 0.1674, - "short_answer_loss": NaN, - "step": 419, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1781, - "grad_norm": 1.7734375, - "learning_rate": 1.9235432566154084e-05, - "long_answer_loss": 0.1781, - "loss": 0.1593, - "short_answer_loss": NaN, - "step": 420, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1786, - "grad_norm": 1.6953125, - "learning_rate": 1.9207795561139614e-05, - "long_answer_loss": 0.1786, - "loss": 0.1674, - "short_answer_loss": NaN, - "step": 421, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1826, - "grad_norm": 1.734375, - "learning_rate": 1.9180112428013286e-05, - "long_answer_loss": 0.1826, - "loss": 0.1615, - "short_answer_loss": NaN, - "step": 422, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1832, - "grad_norm": 1.765625, - "learning_rate": 1.915238335714623e-05, - "long_answer_loss": 0.1832, - "loss": 0.1728, - "short_answer_loss": NaN, - "step": 423, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1843, - "grad_norm": 1.8046875, - "learning_rate": 1.9124608539225496e-05, - "long_answer_loss": 0.1843, - "loss": 0.1677, - "short_answer_loss": NaN, - "step": 424, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.169, - "grad_norm": 1.71875, - "learning_rate": 1.909678816525271e-05, - "long_answer_loss": 0.169, - "loss": 0.1727, - "short_answer_loss": NaN, - "step": 425, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1327, - "grad_norm": 1.625, - "learning_rate": 1.9068922426542783e-05, - "long_answer_loss": 0.1327, - "loss": 0.1565, - "short_answer_loss": NaN, - "step": 426, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1685, - "grad_norm": 1.71875, - "learning_rate": 1.9041011514722602e-05, - "long_answer_loss": 0.1685, - "loss": 0.1745, - "short_answer_loss": NaN, - "step": 427, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1876, - "grad_norm": 1.7578125, - "learning_rate": 1.901305562172968e-05, - "long_answer_loss": 0.1876, - "loss": 0.1722, - "short_answer_loss": NaN, - "step": 428, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1595, - "grad_norm": 1.84375, - "learning_rate": 1.898505493981087e-05, - "long_answer_loss": 0.1595, - "loss": 0.1667, - "short_answer_loss": NaN, - "step": 429, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1656, - "grad_norm": 1.609375, - "learning_rate": 1.895700966152103e-05, - "long_answer_loss": 0.1656, - "loss": 0.1568, - "short_answer_loss": NaN, - "step": 430, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1723, - "grad_norm": 1.7265625, - "learning_rate": 1.8928919979721678e-05, - "long_answer_loss": 0.1723, - "loss": 0.1661, - "short_answer_loss": NaN, - "step": 431, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1612, - "grad_norm": 1.703125, - "learning_rate": 1.8900786087579712e-05, - "long_answer_loss": 0.1612, - "loss": 0.1571, - "short_answer_loss": NaN, - "step": 432, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1496, - "grad_norm": 1.765625, - "learning_rate": 1.8872608178566043e-05, - "long_answer_loss": 0.1496, - "loss": 0.1661, - "short_answer_loss": NaN, - "step": 433, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1647, - "grad_norm": 1.8125, - "learning_rate": 1.8844386446454275e-05, - "long_answer_loss": 0.1647, - "loss": 0.1686, - "short_answer_loss": NaN, - "step": 434, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1544, - "grad_norm": 1.890625, - "learning_rate": 1.881612108531938e-05, - "long_answer_loss": 0.1544, - "loss": 0.1683, - "short_answer_loss": NaN, - "step": 435, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1494, - "grad_norm": 1.6875, - "learning_rate": 1.878781228953635e-05, - "long_answer_loss": 0.1494, - "loss": 0.1597, - "short_answer_loss": NaN, - "step": 436, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.171, - "grad_norm": 1.71875, - "learning_rate": 1.8759460253778877e-05, - "long_answer_loss": 0.171, - "loss": 0.1691, - "short_answer_loss": NaN, - "step": 437, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1657, - "grad_norm": 1.5625, - "learning_rate": 1.8731065173018e-05, - "long_answer_loss": 0.1657, - "loss": 0.1576, - "short_answer_loss": NaN, - "step": 438, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.17, - "grad_norm": 1.8515625, - "learning_rate": 1.870262724252077e-05, - "long_answer_loss": 0.17, - "loss": 0.1781, - "short_answer_loss": NaN, - "step": 439, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1849, - "grad_norm": 1.8359375, - "learning_rate": 1.8674146657848908e-05, - "long_answer_loss": 0.1849, - "loss": 0.1736, - "short_answer_loss": NaN, - "step": 440, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1792, - "grad_norm": 1.6875, - "learning_rate": 1.8645623614857455e-05, - "long_answer_loss": 0.1792, - "loss": 0.1701, - "short_answer_loss": NaN, - "step": 441, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.2047, - "grad_norm": 1.7890625, - "learning_rate": 1.8617058309693437e-05, - "long_answer_loss": 0.2047, - "loss": 0.1788, - "short_answer_loss": NaN, - "step": 442, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.163, - "grad_norm": 1.71875, - "learning_rate": 1.85884509387945e-05, - "long_answer_loss": 0.163, - "loss": 0.1705, - "short_answer_loss": NaN, - "step": 443, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1636, - "grad_norm": 1.7421875, - "learning_rate": 1.855980169888757e-05, - "long_answer_loss": 0.1636, - "loss": 0.1819, - "short_answer_loss": NaN, - "step": 444, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1581, - "grad_norm": 1.6953125, - "learning_rate": 1.85311107869875e-05, - "long_answer_loss": 0.1581, - "loss": 0.1666, - "short_answer_loss": NaN, - "step": 445, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1779, - "grad_norm": 1.8125, - "learning_rate": 1.850237840039571e-05, - "long_answer_loss": 0.1779, - "loss": 0.1726, - "short_answer_loss": NaN, - "step": 446, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1499, - "grad_norm": 1.7109375, - "learning_rate": 1.8473604736698835e-05, - "long_answer_loss": 0.1499, - "loss": 0.1626, - "short_answer_loss": NaN, - "step": 447, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.171, - "grad_norm": 1.65625, - "learning_rate": 1.844478999376736e-05, - "long_answer_loss": 0.171, - "loss": 0.1553, - "short_answer_loss": NaN, - "step": 448, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1674, - "grad_norm": 1.6953125, - "learning_rate": 1.841593436975427e-05, - "long_answer_loss": 0.1674, - "loss": 0.1701, - "short_answer_loss": NaN, - "step": 449, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1619, - "grad_norm": 1.7265625, - "learning_rate": 1.838703806309367e-05, - "long_answer_loss": 0.1619, - "loss": 0.163, - "short_answer_loss": NaN, - "step": 450, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1672, - "grad_norm": 1.734375, - "learning_rate": 1.8358101272499443e-05, - "long_answer_loss": 0.1672, - "loss": 0.1597, - "short_answer_loss": NaN, - "step": 451, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1467, - "grad_norm": 1.7109375, - "learning_rate": 1.8329124196963864e-05, - "long_answer_loss": 0.1467, - "loss": 0.1748, - "short_answer_loss": NaN, - "step": 452, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1644, - "grad_norm": 1.734375, - "learning_rate": 1.830010703575624e-05, - "long_answer_loss": 0.1644, - "loss": 0.1643, - "short_answer_loss": NaN, - "step": 453, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1672, - "grad_norm": 1.6640625, - "learning_rate": 1.827104998842154e-05, - "long_answer_loss": 0.1672, - "loss": 0.1646, - "short_answer_loss": NaN, - "step": 454, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1646, - "grad_norm": 1.8359375, - "learning_rate": 1.8241953254779027e-05, - "long_answer_loss": 0.1646, - "loss": 0.1643, - "short_answer_loss": NaN, - "step": 455, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1869, - "grad_norm": 1.78125, - "learning_rate": 1.8212817034920864e-05, - "long_answer_loss": 0.1869, - "loss": 0.1684, - "short_answer_loss": NaN, - "step": 456, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1618, - "grad_norm": 1.8046875, - "learning_rate": 1.818364152921077e-05, - "long_answer_loss": 0.1618, - "loss": 0.1577, - "short_answer_loss": NaN, - "step": 457, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1658, - "grad_norm": 1.8046875, - "learning_rate": 1.8154426938282615e-05, - "long_answer_loss": 0.1658, - "loss": 0.1688, - "short_answer_loss": NaN, - "step": 458, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1681, - "grad_norm": 1.6640625, - "learning_rate": 1.8125173463039048e-05, - "long_answer_loss": 0.1681, - "loss": 0.1663, - "short_answer_loss": NaN, - "step": 459, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1868, - "grad_norm": 1.734375, - "learning_rate": 1.8095881304650123e-05, - "long_answer_loss": 0.1868, - "loss": 0.1678, - "short_answer_loss": NaN, - "step": 460, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1519, - "grad_norm": 1.703125, - "learning_rate": 1.8066550664551904e-05, - "long_answer_loss": 0.1519, - "loss": 0.1604, - "short_answer_loss": NaN, - "step": 461, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1849, - "grad_norm": 1.734375, - "learning_rate": 1.8037181744445093e-05, - "long_answer_loss": 0.1849, - "loss": 0.1701, - "short_answer_loss": NaN, - "step": 462, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1386, - "grad_norm": 1.625, - "learning_rate": 1.8007774746293628e-05, - "long_answer_loss": 0.1386, - "loss": 0.1604, - "short_answer_loss": NaN, - "step": 463, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1884, - "grad_norm": 1.7734375, - "learning_rate": 1.7978329872323308e-05, - "long_answer_loss": 0.1884, - "loss": 0.16, - "short_answer_loss": NaN, - "step": 464, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1752, - "grad_norm": 1.8671875, - "learning_rate": 1.7948847325020394e-05, - "long_answer_loss": 0.1752, - "loss": 0.1646, - "short_answer_loss": NaN, - "step": 465, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1523, - "grad_norm": 1.7109375, - "learning_rate": 1.7919327307130217e-05, - "long_answer_loss": 0.1523, - "loss": 0.1599, - "short_answer_loss": NaN, - "step": 466, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1735, - "grad_norm": 1.8125, - "learning_rate": 1.7889770021655787e-05, - "long_answer_loss": 0.1735, - "loss": 0.1826, - "short_answer_loss": NaN, - "step": 467, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.16, - "grad_norm": 1.828125, - "learning_rate": 1.78601756718564e-05, - "long_answer_loss": 0.16, - "loss": 0.1615, - "short_answer_loss": NaN, - "step": 468, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1622, - "grad_norm": 1.6953125, - "learning_rate": 1.783054446124622e-05, - "long_answer_loss": 0.1622, - "loss": 0.1562, - "short_answer_loss": NaN, - "step": 469, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1735, - "grad_norm": 1.796875, - "learning_rate": 1.7800876593592912e-05, - "long_answer_loss": 0.1735, - "loss": 0.1663, - "short_answer_loss": NaN, - "step": 470, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.148, - "grad_norm": 1.7109375, - "learning_rate": 1.777117227291622e-05, - "long_answer_loss": 0.148, - "loss": 0.167, - "short_answer_loss": NaN, - "step": 471, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1756, - "grad_norm": 1.7578125, - "learning_rate": 1.7741431703486562e-05, - "long_answer_loss": 0.1756, - "loss": 0.1616, - "short_answer_loss": NaN, - "step": 472, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1404, - "grad_norm": 1.75, - "learning_rate": 1.7711655089823638e-05, - "long_answer_loss": 0.1404, - "loss": 0.1633, - "short_answer_loss": NaN, - "step": 473, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1554, - "grad_norm": 1.859375, - "learning_rate": 1.7681842636695007e-05, - "long_answer_loss": 0.1554, - "loss": 0.1644, - "short_answer_loss": NaN, - "step": 474, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1773, - "grad_norm": 1.859375, - "learning_rate": 1.7651994549114702e-05, - "long_answer_loss": 0.1773, - "loss": 0.1651, - "short_answer_loss": NaN, - "step": 475, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1536, - "grad_norm": 1.5390625, - "learning_rate": 1.7622111032341797e-05, - "long_answer_loss": 0.1536, - "loss": 0.1628, - "short_answer_loss": NaN, - "step": 476, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1713, - "grad_norm": 1.8359375, - "learning_rate": 1.7592192291879008e-05, - "long_answer_loss": 0.1713, - "loss": 0.1678, - "short_answer_loss": NaN, - "step": 477, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1917, - "grad_norm": 1.65625, - "learning_rate": 1.756223853347127e-05, - "long_answer_loss": 0.1917, - "loss": 0.1679, - "short_answer_loss": NaN, - "step": 478, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1724, - "grad_norm": 1.6875, - "learning_rate": 1.7532249963104344e-05, - "long_answer_loss": 0.1724, - "loss": 0.167, - "short_answer_loss": NaN, - "step": 479, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1845, - "grad_norm": 1.765625, - "learning_rate": 1.7502226787003378e-05, - "long_answer_loss": 0.1845, - "loss": 0.1716, - "short_answer_loss": NaN, - "step": 480, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1741, - "grad_norm": 1.78125, - "learning_rate": 1.747216921163149e-05, - "long_answer_loss": 0.1741, - "loss": 0.1599, - "short_answer_loss": NaN, - "step": 481, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1524, - "grad_norm": 1.7265625, - "learning_rate": 1.7442077443688364e-05, - "long_answer_loss": 0.1524, - "loss": 0.1597, - "short_answer_loss": NaN, - "step": 482, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1621, - "grad_norm": 1.5859375, - "learning_rate": 1.741195169010882e-05, - "long_answer_loss": 0.1621, - "loss": 0.1614, - "short_answer_loss": NaN, - "step": 483, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1449, - "grad_norm": 1.7421875, - "learning_rate": 1.7381792158061378e-05, - "long_answer_loss": 0.1449, - "loss": 0.1676, - "short_answer_loss": NaN, - "step": 484, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1858, - "grad_norm": 1.671875, - "learning_rate": 1.7351599054946853e-05, - "long_answer_loss": 0.1858, - "loss": 0.1664, - "short_answer_loss": NaN, - "step": 485, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1459, - "grad_norm": 1.625, - "learning_rate": 1.732137258839693e-05, - "long_answer_loss": 0.1459, - "loss": 0.1635, - "short_answer_loss": NaN, - "step": 486, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1591, - "grad_norm": 1.6484375, - "learning_rate": 1.7291112966272707e-05, - "long_answer_loss": 0.1591, - "loss": 0.1481, - "short_answer_loss": NaN, - "step": 487, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1824, - "grad_norm": 1.6953125, - "learning_rate": 1.7260820396663307e-05, - "long_answer_loss": 0.1824, - "loss": 0.1691, - "short_answer_loss": NaN, - "step": 488, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.2103, - "grad_norm": 1.7265625, - "learning_rate": 1.723049508788442e-05, - "long_answer_loss": 0.2103, - "loss": 0.1607, - "short_answer_loss": NaN, - "step": 489, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1865, - "grad_norm": 1.6796875, - "learning_rate": 1.720013724847686e-05, - "long_answer_loss": 0.1865, - "loss": 0.1544, - "short_answer_loss": NaN, - "step": 490, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1951, - "grad_norm": 1.78125, - "learning_rate": 1.716974708720517e-05, - "long_answer_loss": 0.1951, - "loss": 0.1694, - "short_answer_loss": NaN, - "step": 491, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1457, - "grad_norm": 1.734375, - "learning_rate": 1.7139324813056155e-05, - "long_answer_loss": 0.1457, - "loss": 0.1615, - "short_answer_loss": NaN, - "step": 492, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1946, - "grad_norm": 1.6015625, - "learning_rate": 1.7108870635237444e-05, - "long_answer_loss": 0.1946, - "loss": 0.1607, - "short_answer_loss": NaN, - "step": 493, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1513, - "grad_norm": 1.7578125, - "learning_rate": 1.7078384763176084e-05, - "long_answer_loss": 0.1513, - "loss": 0.1649, - "short_answer_loss": NaN, - "step": 494, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1478, - "grad_norm": 1.7265625, - "learning_rate": 1.7047867406517047e-05, - "long_answer_loss": 0.1478, - "loss": 0.1599, - "short_answer_loss": NaN, - "step": 495, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1656, - "grad_norm": 1.7890625, - "learning_rate": 1.7017318775121845e-05, - "long_answer_loss": 0.1656, - "loss": 0.1581, - "short_answer_loss": NaN, - "step": 496, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1545, - "grad_norm": 1.6484375, - "learning_rate": 1.6986739079067047e-05, - "long_answer_loss": 0.1545, - "loss": 0.1649, - "short_answer_loss": NaN, - "step": 497, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.155, - "grad_norm": 1.7421875, - "learning_rate": 1.6956128528642842e-05, - "long_answer_loss": 0.155, - "loss": 0.1563, - "short_answer_loss": NaN, - "step": 498, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1534, - "grad_norm": 1.703125, - "learning_rate": 1.6925487334351613e-05, - "long_answer_loss": 0.1534, - "loss": 0.1636, - "short_answer_loss": NaN, - "step": 499, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.169, - "grad_norm": 1.7890625, - "learning_rate": 1.6894815706906458e-05, - "long_answer_loss": 0.169, - "loss": 0.1567, - "short_answer_loss": NaN, - "step": 500, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.168, - "grad_norm": 1.7734375, - "learning_rate": 1.686411385722977e-05, - "long_answer_loss": 0.168, - "loss": 0.16, - "short_answer_loss": NaN, - "step": 501, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1441, - "grad_norm": 1.7890625, - "learning_rate": 1.683338199645177e-05, - "long_answer_loss": 0.1441, - "loss": 0.1669, - "short_answer_loss": NaN, - "step": 502, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1572, - "grad_norm": 1.7265625, - "learning_rate": 1.6802620335909054e-05, - "long_answer_loss": 0.1572, - "loss": 0.1643, - "short_answer_loss": NaN, - "step": 503, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1503, - "grad_norm": 1.7578125, - "learning_rate": 1.6771829087143156e-05, - "long_answer_loss": 0.1503, - "loss": 0.1601, - "short_answer_loss": NaN, - "step": 504, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1735, - "grad_norm": 1.6875, - "learning_rate": 1.6741008461899073e-05, - "long_answer_loss": 0.1735, - "loss": 0.1561, - "short_answer_loss": NaN, - "step": 505, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.183, - "grad_norm": 1.7109375, - "learning_rate": 1.6710158672123818e-05, - "long_answer_loss": 0.183, - "loss": 0.1685, - "short_answer_loss": NaN, - "step": 506, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.16, - "grad_norm": 1.7734375, - "learning_rate": 1.6679279929964968e-05, - "long_answer_loss": 0.16, - "loss": 0.1582, - "short_answer_loss": NaN, - "step": 507, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1683, - "grad_norm": 1.7578125, - "learning_rate": 1.6648372447769197e-05, - "long_answer_loss": 0.1683, - "loss": 0.167, - "short_answer_loss": NaN, - "step": 508, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1724, - "grad_norm": 1.6640625, - "learning_rate": 1.6617436438080812e-05, - "long_answer_loss": 0.1724, - "loss": 0.16, - "short_answer_loss": NaN, - "step": 509, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1824, - "grad_norm": 1.8203125, - "learning_rate": 1.6586472113640306e-05, - "long_answer_loss": 0.1824, - "loss": 0.1644, - "short_answer_loss": NaN, - "step": 510, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1504, - "grad_norm": 1.828125, - "learning_rate": 1.6555479687382887e-05, - "long_answer_loss": 0.1504, - "loss": 0.1576, - "short_answer_loss": NaN, - "step": 511, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1488, - "grad_norm": 1.7421875, - "learning_rate": 1.6524459372437004e-05, - "long_answer_loss": 0.1488, - "loss": 0.1612, - "short_answer_loss": NaN, - "step": 512, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1622, - "grad_norm": 1.6953125, - "learning_rate": 1.64934113821229e-05, - "long_answer_loss": 0.1622, - "loss": 0.1553, - "short_answer_loss": NaN, - "step": 513, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1501, - "grad_norm": 1.7890625, - "learning_rate": 1.6462335929951133e-05, - "long_answer_loss": 0.1501, - "loss": 0.1545, - "short_answer_loss": NaN, - "step": 514, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1467, - "grad_norm": 1.7421875, - "learning_rate": 1.643123322962111e-05, - "long_answer_loss": 0.1467, - "loss": 0.154, - "short_answer_loss": NaN, - "step": 515, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1546, - "grad_norm": 1.6953125, - "learning_rate": 1.6400103495019618e-05, - "long_answer_loss": 0.1546, - "loss": 0.1612, - "short_answer_loss": NaN, - "step": 516, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1756, - "grad_norm": 1.6953125, - "learning_rate": 1.6368946940219352e-05, - "long_answer_loss": 0.1756, - "loss": 0.1622, - "short_answer_loss": NaN, - "step": 517, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1914, - "grad_norm": 1.75, - "learning_rate": 1.633776377947745e-05, - "long_answer_loss": 0.1914, - "loss": 0.1644, - "short_answer_loss": NaN, - "step": 518, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1482, - "grad_norm": 1.671875, - "learning_rate": 1.6306554227233994e-05, - "long_answer_loss": 0.1482, - "loss": 0.1614, - "short_answer_loss": NaN, - "step": 519, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.147, - "grad_norm": 1.7890625, - "learning_rate": 1.6275318498110585e-05, - "long_answer_loss": 0.147, - "loss": 0.1617, - "short_answer_loss": NaN, - "step": 520, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1683, - "grad_norm": 1.8125, - "learning_rate": 1.6244056806908816e-05, - "long_answer_loss": 0.1683, - "loss": 0.1655, - "short_answer_loss": NaN, - "step": 521, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1623, - "grad_norm": 1.6875, - "learning_rate": 1.621276936860882e-05, - "long_answer_loss": 0.1623, - "loss": 0.1619, - "short_answer_loss": NaN, - "step": 522, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1877, - "grad_norm": 1.6953125, - "learning_rate": 1.6181456398367788e-05, - "long_answer_loss": 0.1877, - "loss": 0.1621, - "short_answer_loss": NaN, - "step": 523, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1704, - "grad_norm": 1.828125, - "learning_rate": 1.6150118111518493e-05, - "long_answer_loss": 0.1704, - "loss": 0.1613, - "short_answer_loss": NaN, - "step": 524, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1437, - "grad_norm": 1.6640625, - "learning_rate": 1.6118754723567798e-05, - "long_answer_loss": 0.1437, - "loss": 0.1453, - "short_answer_loss": NaN, - "step": 525, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1142, - "grad_norm": 1.640625, - "learning_rate": 1.608736645019518e-05, - "long_answer_loss": 0.1142, - "loss": 0.1564, - "short_answer_loss": NaN, - "step": 526, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1551, - "grad_norm": 1.6328125, - "learning_rate": 1.605595350725126e-05, - "long_answer_loss": 0.1551, - "loss": 0.1566, - "short_answer_loss": NaN, - "step": 527, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.2099, - "grad_norm": 1.765625, - "learning_rate": 1.6024516110756296e-05, - "long_answer_loss": 0.2099, - "loss": 0.1686, - "short_answer_loss": NaN, - "step": 528, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1506, - "grad_norm": 1.7421875, - "learning_rate": 1.5993054476898708e-05, - "long_answer_loss": 0.1506, - "loss": 0.1566, - "short_answer_loss": NaN, - "step": 529, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1487, - "grad_norm": 1.640625, - "learning_rate": 1.59615688220336e-05, - "long_answer_loss": 0.1487, - "loss": 0.1565, - "short_answer_loss": NaN, - "step": 530, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1496, - "grad_norm": 1.71875, - "learning_rate": 1.593005936268125e-05, - "long_answer_loss": 0.1496, - "loss": 0.1567, - "short_answer_loss": NaN, - "step": 531, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1571, - "grad_norm": 1.78125, - "learning_rate": 1.5898526315525646e-05, - "long_answer_loss": 0.1571, - "loss": 0.1597, - "short_answer_loss": NaN, - "step": 532, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1515, - "grad_norm": 1.8515625, - "learning_rate": 1.5866969897412984e-05, - "long_answer_loss": 0.1515, - "loss": 0.1731, - "short_answer_loss": NaN, - "step": 533, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1773, - "grad_norm": 1.7421875, - "learning_rate": 1.583539032535017e-05, - "long_answer_loss": 0.1773, - "loss": 0.1649, - "short_answer_loss": NaN, - "step": 534, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.16, - "grad_norm": 1.8125, - "learning_rate": 1.5803787816503336e-05, - "long_answer_loss": 0.16, - "loss": 0.1577, - "short_answer_loss": NaN, - "step": 535, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1701, - "grad_norm": 1.875, - "learning_rate": 1.577216258819635e-05, - "long_answer_loss": 0.1701, - "loss": 0.1614, - "short_answer_loss": NaN, - "step": 536, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.137, - "grad_norm": 1.7890625, - "learning_rate": 1.5740514857909312e-05, - "long_answer_loss": 0.137, - "loss": 0.1698, - "short_answer_loss": NaN, - "step": 537, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1651, - "grad_norm": 1.71875, - "learning_rate": 1.570884484327707e-05, - "long_answer_loss": 0.1651, - "loss": 0.1594, - "short_answer_loss": NaN, - "step": 538, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.132, - "grad_norm": 1.828125, - "learning_rate": 1.5677152762087714e-05, - "long_answer_loss": 0.132, - "loss": 0.1594, - "short_answer_loss": NaN, - "step": 539, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1649, - "grad_norm": 1.7109375, - "learning_rate": 1.5645438832281077e-05, - "long_answer_loss": 0.1649, - "loss": 0.1537, - "short_answer_loss": NaN, - "step": 540, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1636, - "grad_norm": 1.6328125, - "learning_rate": 1.561370327194725e-05, - "long_answer_loss": 0.1636, - "loss": 0.1579, - "short_answer_loss": NaN, - "step": 541, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1688, - "grad_norm": 1.75, - "learning_rate": 1.558194629932506e-05, - "long_answer_loss": 0.1688, - "loss": 0.1685, - "short_answer_loss": NaN, - "step": 542, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1594, - "grad_norm": 1.7109375, - "learning_rate": 1.5550168132800585e-05, - "long_answer_loss": 0.1594, - "loss": 0.1544, - "short_answer_loss": NaN, - "step": 543, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1528, - "grad_norm": 1.6875, - "learning_rate": 1.5518368990905664e-05, - "long_answer_loss": 0.1528, - "loss": 0.1653, - "short_answer_loss": NaN, - "step": 544, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1369, - "grad_norm": 1.6015625, - "learning_rate": 1.5486549092316355e-05, - "long_answer_loss": 0.1369, - "loss": 0.1556, - "short_answer_loss": NaN, - "step": 545, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1606, - "grad_norm": 1.71875, - "learning_rate": 1.545470865585147e-05, - "long_answer_loss": 0.1606, - "loss": 0.1528, - "short_answer_loss": NaN, - "step": 546, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.171, - "grad_norm": 1.65625, - "learning_rate": 1.5422847900471063e-05, - "long_answer_loss": 0.171, - "loss": 0.1659, - "short_answer_loss": NaN, - "step": 547, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1376, - "grad_norm": 1.6875, - "learning_rate": 1.53909670452749e-05, - "long_answer_loss": 0.1376, - "loss": 0.1562, - "short_answer_loss": NaN, - "step": 548, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.161, - "grad_norm": 1.703125, - "learning_rate": 1.5359066309500974e-05, - "long_answer_loss": 0.161, - "loss": 0.1572, - "short_answer_loss": NaN, - "step": 549, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1466, - "grad_norm": 1.65625, - "learning_rate": 1.5327145912524e-05, - "long_answer_loss": 0.1466, - "loss": 0.1558, - "short_answer_loss": NaN, - "step": 550, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1694, - "grad_norm": 1.84375, - "learning_rate": 1.5295206073853896e-05, - "long_answer_loss": 0.1694, - "loss": 0.1675, - "short_answer_loss": NaN, - "step": 551, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1796, - "grad_norm": 1.84375, - "learning_rate": 1.526324701313427e-05, - "long_answer_loss": 0.1796, - "loss": 0.16, - "short_answer_loss": NaN, - "step": 552, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1545, - "grad_norm": 1.703125, - "learning_rate": 1.5231268950140926e-05, - "long_answer_loss": 0.1545, - "loss": 0.1545, - "short_answer_loss": NaN, - "step": 553, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.147, - "grad_norm": 1.6796875, - "learning_rate": 1.5199272104780332e-05, - "long_answer_loss": 0.147, - "loss": 0.1467, - "short_answer_loss": NaN, - "step": 554, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.149, - "grad_norm": 1.703125, - "learning_rate": 1.5167256697088128e-05, - "long_answer_loss": 0.149, - "loss": 0.1609, - "short_answer_loss": NaN, - "step": 555, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.165, - "grad_norm": 1.6484375, - "learning_rate": 1.5135222947227598e-05, - "long_answer_loss": 0.165, - "loss": 0.158, - "short_answer_loss": NaN, - "step": 556, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1821, - "grad_norm": 1.7578125, - "learning_rate": 1.510317107548816e-05, - "long_answer_loss": 0.1821, - "loss": 0.1679, - "short_answer_loss": NaN, - "step": 557, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1337, - "grad_norm": 1.6953125, - "learning_rate": 1.507110130228386e-05, - "long_answer_loss": 0.1337, - "loss": 0.1562, - "short_answer_loss": NaN, - "step": 558, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1507, - "grad_norm": 1.6484375, - "learning_rate": 1.5039013848151839e-05, - "long_answer_loss": 0.1507, - "loss": 0.1464, - "short_answer_loss": NaN, - "step": 559, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1295, - "grad_norm": 1.640625, - "learning_rate": 1.5006908933750829e-05, - "long_answer_loss": 0.1295, - "loss": 0.1484, - "short_answer_loss": NaN, - "step": 560, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1581, - "grad_norm": 1.65625, - "learning_rate": 1.4974786779859642e-05, - "long_answer_loss": 0.1581, - "loss": 0.1511, - "short_answer_loss": NaN, - "step": 561, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1504, - "grad_norm": 1.6640625, - "learning_rate": 1.4942647607375629e-05, - "long_answer_loss": 0.1504, - "loss": 0.1664, - "short_answer_loss": NaN, - "step": 562, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1607, - "grad_norm": 1.8046875, - "learning_rate": 1.4910491637313176e-05, - "long_answer_loss": 0.1607, - "loss": 0.1653, - "short_answer_loss": NaN, - "step": 563, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.168, - "grad_norm": 1.6875, - "learning_rate": 1.4878319090802196e-05, - "long_answer_loss": 0.168, - "loss": 0.1615, - "short_answer_loss": NaN, - "step": 564, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1271, - "grad_norm": 1.6015625, - "learning_rate": 1.4846130189086577e-05, - "long_answer_loss": 0.1271, - "loss": 0.1421, - "short_answer_loss": NaN, - "step": 565, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1694, - "grad_norm": 1.6875, - "learning_rate": 1.4813925153522693e-05, - "long_answer_loss": 0.1694, - "loss": 0.1608, - "short_answer_loss": NaN, - "step": 566, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1439, - "grad_norm": 1.640625, - "learning_rate": 1.4781704205577856e-05, - "long_answer_loss": 0.1439, - "loss": 0.1462, - "short_answer_loss": NaN, - "step": 567, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1338, - "grad_norm": 1.828125, - "learning_rate": 1.4749467566828808e-05, - "long_answer_loss": 0.1338, - "loss": 0.153, - "short_answer_loss": NaN, - "step": 568, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.169, - "grad_norm": 1.7890625, - "learning_rate": 1.4717215458960198e-05, - "long_answer_loss": 0.169, - "loss": 0.1626, - "short_answer_loss": NaN, - "step": 569, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1542, - "grad_norm": 1.7265625, - "learning_rate": 1.4684948103763046e-05, - "long_answer_loss": 0.1542, - "loss": 0.1596, - "short_answer_loss": NaN, - "step": 570, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1458, - "grad_norm": 1.7578125, - "learning_rate": 1.465266572313323e-05, - "long_answer_loss": 0.1458, - "loss": 0.1556, - "short_answer_loss": NaN, - "step": 571, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1489, - "grad_norm": 1.828125, - "learning_rate": 1.462036853906995e-05, - "long_answer_loss": 0.1489, - "loss": 0.1652, - "short_answer_loss": NaN, - "step": 572, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1884, - "grad_norm": 1.78125, - "learning_rate": 1.4588056773674209e-05, - "long_answer_loss": 0.1884, - "loss": 0.164, - "short_answer_loss": NaN, - "step": 573, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1575, - "grad_norm": 1.734375, - "learning_rate": 1.4555730649147285e-05, - "long_answer_loss": 0.1575, - "loss": 0.1548, - "short_answer_loss": NaN, - "step": 574, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1655, - "grad_norm": 1.7734375, - "learning_rate": 1.4523390387789193e-05, - "long_answer_loss": 0.1655, - "loss": 0.1563, - "short_answer_loss": NaN, - "step": 575, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1531, - "grad_norm": 1.859375, - "learning_rate": 1.4491036211997175e-05, - "long_answer_loss": 0.1531, - "loss": 0.1585, - "short_answer_loss": NaN, - "step": 576, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1501, - "grad_norm": 1.875, - "learning_rate": 1.4458668344264151e-05, - "long_answer_loss": 0.1501, - "loss": 0.1459, - "short_answer_loss": NaN, - "step": 577, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1541, - "grad_norm": 1.75, - "learning_rate": 1.4426287007177197e-05, - "long_answer_loss": 0.1541, - "loss": 0.1511, - "short_answer_loss": NaN, - "step": 578, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1538, - "grad_norm": 1.578125, - "learning_rate": 1.4393892423416025e-05, - "long_answer_loss": 0.1538, - "loss": 0.1514, - "short_answer_loss": NaN, - "step": 579, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1665, - "grad_norm": 1.5703125, - "learning_rate": 1.4361484815751434e-05, - "long_answer_loss": 0.1665, - "loss": 0.1508, - "short_answer_loss": NaN, - "step": 580, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1698, - "grad_norm": 1.6796875, - "learning_rate": 1.432906440704378e-05, - "long_answer_loss": 0.1698, - "loss": 0.155, - "short_answer_loss": NaN, - "step": 581, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1477, - "grad_norm": 1.6796875, - "learning_rate": 1.4296631420241463e-05, - "long_answer_loss": 0.1477, - "loss": 0.1553, - "short_answer_loss": NaN, - "step": 582, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1408, - "grad_norm": 1.703125, - "learning_rate": 1.4264186078379369e-05, - "long_answer_loss": 0.1408, - "loss": 0.1527, - "short_answer_loss": NaN, - "step": 583, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1557, - "grad_norm": 1.7109375, - "learning_rate": 1.4231728604577352e-05, - "long_answer_loss": 0.1557, - "loss": 0.1566, - "short_answer_loss": NaN, - "step": 584, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1524, - "grad_norm": 1.8203125, - "learning_rate": 1.4199259222038694e-05, - "long_answer_loss": 0.1524, - "loss": 0.1529, - "short_answer_loss": NaN, - "step": 585, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1409, - "grad_norm": 1.640625, - "learning_rate": 1.416677815404857e-05, - "long_answer_loss": 0.1409, - "loss": 0.1479, - "short_answer_loss": NaN, - "step": 586, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1292, - "grad_norm": 1.78125, - "learning_rate": 1.4134285623972514e-05, - "long_answer_loss": 0.1292, - "loss": 0.1496, - "short_answer_loss": NaN, - "step": 587, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1612, - "grad_norm": 1.6875, - "learning_rate": 1.4101781855254883e-05, - "long_answer_loss": 0.1612, - "loss": 0.1509, - "short_answer_loss": NaN, - "step": 588, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1768, - "grad_norm": 1.78125, - "learning_rate": 1.406926707141732e-05, - "long_answer_loss": 0.1768, - "loss": 0.1594, - "short_answer_loss": NaN, - "step": 589, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1408, - "grad_norm": 1.6953125, - "learning_rate": 1.4036741496057213e-05, - "long_answer_loss": 0.1408, - "loss": 0.1453, - "short_answer_loss": NaN, - "step": 590, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1442, - "grad_norm": 1.7734375, - "learning_rate": 1.4004205352846164e-05, - "long_answer_loss": 0.1442, - "loss": 0.1534, - "short_answer_loss": NaN, - "step": 591, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1261, - "grad_norm": 1.78125, - "learning_rate": 1.3971658865528451e-05, - "long_answer_loss": 0.1261, - "loss": 0.1595, - "short_answer_loss": NaN, - "step": 592, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.182, - "grad_norm": 1.734375, - "learning_rate": 1.3939102257919481e-05, - "long_answer_loss": 0.182, - "loss": 0.1481, - "short_answer_loss": NaN, - "step": 593, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1522, - "grad_norm": 1.75, - "learning_rate": 1.390653575390426e-05, - "long_answer_loss": 0.1522, - "loss": 0.1629, - "short_answer_loss": NaN, - "step": 594, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1329, - "grad_norm": 1.8125, - "learning_rate": 1.3873959577435847e-05, - "long_answer_loss": 0.1329, - "loss": 0.1608, - "short_answer_loss": NaN, - "step": 595, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1544, - "grad_norm": 1.7265625, - "learning_rate": 1.3841373952533812e-05, - "long_answer_loss": 0.1544, - "loss": 0.1516, - "short_answer_loss": NaN, - "step": 596, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1549, - "grad_norm": 1.671875, - "learning_rate": 1.3808779103282712e-05, - "long_answer_loss": 0.1549, - "loss": 0.1498, - "short_answer_loss": NaN, - "step": 597, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1468, - "grad_norm": 1.625, - "learning_rate": 1.3776175253830531e-05, - "long_answer_loss": 0.1468, - "loss": 0.1496, - "short_answer_loss": NaN, - "step": 598, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1515, - "grad_norm": 1.703125, - "learning_rate": 1.3743562628387141e-05, - "long_answer_loss": 0.1515, - "loss": 0.1535, - "short_answer_loss": NaN, - "step": 599, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1576, - "grad_norm": 1.734375, - "learning_rate": 1.3710941451222776e-05, - "long_answer_loss": 0.1576, - "loss": 0.1506, - "short_answer_loss": NaN, - "step": 600, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1565, - "grad_norm": 1.6015625, - "learning_rate": 1.367831194666646e-05, - "long_answer_loss": 0.1565, - "loss": 0.1484, - "short_answer_loss": NaN, - "step": 601, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1385, - "grad_norm": 1.765625, - "learning_rate": 1.3645674339104508e-05, - "long_answer_loss": 0.1385, - "loss": 0.1581, - "short_answer_loss": NaN, - "step": 602, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1386, - "grad_norm": 1.7265625, - "learning_rate": 1.3613028852978934e-05, - "long_answer_loss": 0.1386, - "loss": 0.1505, - "short_answer_loss": NaN, - "step": 603, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1686, - "grad_norm": 1.625, - "learning_rate": 1.3580375712785945e-05, - "long_answer_loss": 0.1686, - "loss": 0.1552, - "short_answer_loss": NaN, - "step": 604, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1538, - "grad_norm": 1.71875, - "learning_rate": 1.354771514307438e-05, - "long_answer_loss": 0.1538, - "loss": 0.1548, - "short_answer_loss": NaN, - "step": 605, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1412, - "grad_norm": 1.65625, - "learning_rate": 1.3515047368444169e-05, - "long_answer_loss": 0.1412, - "loss": 0.1488, - "short_answer_loss": NaN, - "step": 606, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1467, - "grad_norm": 1.6171875, - "learning_rate": 1.3482372613544788e-05, - "long_answer_loss": 0.1467, - "loss": 0.15, - "short_answer_loss": NaN, - "step": 607, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.191, - "grad_norm": 1.671875, - "learning_rate": 1.3449691103073714e-05, - "long_answer_loss": 0.191, - "loss": 0.1575, - "short_answer_loss": NaN, - "step": 608, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1561, - "grad_norm": 1.6171875, - "learning_rate": 1.3417003061774886e-05, - "long_answer_loss": 0.1561, - "loss": 0.1569, - "short_answer_loss": NaN, - "step": 609, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1616, - "grad_norm": 1.6875, - "learning_rate": 1.3384308714437146e-05, - "long_answer_loss": 0.1616, - "loss": 0.1667, - "short_answer_loss": NaN, - "step": 610, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1367, - "grad_norm": 1.6484375, - "learning_rate": 1.3351608285892708e-05, - "long_answer_loss": 0.1367, - "loss": 0.1531, - "short_answer_loss": NaN, - "step": 611, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1453, - "grad_norm": 1.71875, - "learning_rate": 1.3318902001015602e-05, - "long_answer_loss": 0.1453, - "loss": 0.1489, - "short_answer_loss": NaN, - "step": 612, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1351, - "grad_norm": 1.7265625, - "learning_rate": 1.328619008472013e-05, - "long_answer_loss": 0.1351, - "loss": 0.1525, - "short_answer_loss": NaN, - "step": 613, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1462, - "grad_norm": 1.734375, - "learning_rate": 1.3253472761959326e-05, - "long_answer_loss": 0.1462, - "loss": 0.1559, - "short_answer_loss": NaN, - "step": 614, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1419, - "grad_norm": 1.7578125, - "learning_rate": 1.3220750257723397e-05, - "long_answer_loss": 0.1419, - "loss": 0.159, - "short_answer_loss": NaN, - "step": 615, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1622, - "grad_norm": 1.796875, - "learning_rate": 1.3188022797038183e-05, - "long_answer_loss": 0.1622, - "loss": 0.1514, - "short_answer_loss": NaN, - "step": 616, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1706, - "grad_norm": 1.75, - "learning_rate": 1.3155290604963613e-05, - "long_answer_loss": 0.1706, - "loss": 0.1585, - "short_answer_loss": NaN, - "step": 617, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1639, - "grad_norm": 1.765625, - "learning_rate": 1.3122553906592142e-05, - "long_answer_loss": 0.1639, - "loss": 0.1556, - "short_answer_loss": NaN, - "step": 618, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.114, - "grad_norm": 1.5703125, - "learning_rate": 1.3089812927047224e-05, - "long_answer_loss": 0.114, - "loss": 0.1224, - "short_answer_loss": NaN, - "step": 619, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.0845, - "grad_norm": 1.4140625, - "learning_rate": 1.3057067891481752e-05, - "long_answer_loss": 0.0845, - "loss": 0.0856, - "short_answer_loss": NaN, - "step": 620, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.0776, - "grad_norm": 1.4609375, - "learning_rate": 1.3024319025076509e-05, - "long_answer_loss": 0.0776, - "loss": 0.0818, - "short_answer_loss": NaN, - "step": 621, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0675, - "grad_norm": 1.421875, - "learning_rate": 1.2991566553038623e-05, - "long_answer_loss": 0.0675, - "loss": 0.0814, - "short_answer_loss": NaN, - "step": 622, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0837, - "grad_norm": 1.609375, - "learning_rate": 1.2958810700600017e-05, - "long_answer_loss": 0.0837, - "loss": 0.0857, - "short_answer_loss": NaN, - "step": 623, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0696, - "grad_norm": 1.6328125, - "learning_rate": 1.2926051693015858e-05, - "long_answer_loss": 0.0696, - "loss": 0.0811, - "short_answer_loss": NaN, - "step": 624, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0957, - "grad_norm": 1.796875, - "learning_rate": 1.2893289755563017e-05, - "long_answer_loss": 0.0957, - "loss": 0.0848, - "short_answer_loss": NaN, - "step": 625, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0815, - "grad_norm": 1.5546875, - "learning_rate": 1.2860525113538505e-05, - "long_answer_loss": 0.0815, - "loss": 0.0819, - "short_answer_loss": NaN, - "step": 626, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0937, - "grad_norm": 1.8203125, - "learning_rate": 1.2827757992257939e-05, - "long_answer_loss": 0.0937, - "loss": 0.0895, - "short_answer_loss": NaN, - "step": 627, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0934, - "grad_norm": 1.8359375, - "learning_rate": 1.2794988617053979e-05, - "long_answer_loss": 0.0934, - "loss": 0.082, - "short_answer_loss": NaN, - "step": 628, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0839, - "grad_norm": 1.8515625, - "learning_rate": 1.2762217213274788e-05, - "long_answer_loss": 0.0839, - "loss": 0.0818, - "short_answer_loss": NaN, - "step": 629, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0908, - "grad_norm": 1.6328125, - "learning_rate": 1.2729444006282481e-05, - "long_answer_loss": 0.0908, - "loss": 0.0785, - "short_answer_loss": NaN, - "step": 630, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0839, - "grad_norm": 1.7109375, - "learning_rate": 1.269666922145157e-05, - "long_answer_loss": 0.0839, - "loss": 0.0829, - "short_answer_loss": NaN, - "step": 631, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0815, - "grad_norm": 1.9140625, - "learning_rate": 1.266389308416742e-05, - "long_answer_loss": 0.0815, - "loss": 0.0868, - "short_answer_loss": NaN, - "step": 632, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0666, - "grad_norm": 1.6484375, - "learning_rate": 1.2631115819824688e-05, - "long_answer_loss": 0.0666, - "loss": 0.0788, - "short_answer_loss": NaN, - "step": 633, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0738, - "grad_norm": 1.6328125, - "learning_rate": 1.2598337653825798e-05, - "long_answer_loss": 0.0738, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 634, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.08, - "grad_norm": 1.7109375, - "learning_rate": 1.2565558811579359e-05, - "long_answer_loss": 0.08, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 635, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0682, - "grad_norm": 1.5234375, - "learning_rate": 1.2532779518498639e-05, - "long_answer_loss": 0.0682, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 636, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0862, - "grad_norm": 1.5390625, - "learning_rate": 1.25e-05, - "long_answer_loss": 0.0862, - "loss": 0.08, - "short_answer_loss": NaN, - "step": 637, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0853, - "grad_norm": 1.5234375, - "learning_rate": 1.2467220481501365e-05, - "long_answer_loss": 0.0853, - "loss": 0.0784, - "short_answer_loss": NaN, - "step": 638, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0645, - "grad_norm": 1.4453125, - "learning_rate": 1.2434441188420644e-05, - "long_answer_loss": 0.0645, - "loss": 0.0767, - "short_answer_loss": NaN, - "step": 639, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.1023, - "grad_norm": 1.578125, - "learning_rate": 1.2401662346174206e-05, - "long_answer_loss": 0.1023, - "loss": 0.0801, - "short_answer_loss": NaN, - "step": 640, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0786, - "grad_norm": 1.484375, - "learning_rate": 1.2368884180175313e-05, - "long_answer_loss": 0.0786, - "loss": 0.0799, - "short_answer_loss": NaN, - "step": 641, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0839, - "grad_norm": 1.4921875, - "learning_rate": 1.2336106915832585e-05, - "long_answer_loss": 0.0839, - "loss": 0.0813, - "short_answer_loss": NaN, - "step": 642, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0719, - "grad_norm": 1.46875, - "learning_rate": 1.2303330778548433e-05, - "long_answer_loss": 0.0719, - "loss": 0.0821, - "short_answer_loss": NaN, - "step": 643, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0633, - "grad_norm": 1.484375, - "learning_rate": 1.2270555993717521e-05, - "long_answer_loss": 0.0633, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 644, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0928, - "grad_norm": 1.546875, - "learning_rate": 1.2237782786725215e-05, - "long_answer_loss": 0.0928, - "loss": 0.0798, - "short_answer_loss": NaN, - "step": 645, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0697, - "grad_norm": 1.484375, - "learning_rate": 1.2205011382946024e-05, - "long_answer_loss": 0.0697, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 646, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0761, - "grad_norm": 1.46875, - "learning_rate": 1.2172242007742066e-05, - "long_answer_loss": 0.0761, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 647, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0855, - "grad_norm": 1.640625, - "learning_rate": 1.21394748864615e-05, - "long_answer_loss": 0.0855, - "loss": 0.0834, - "short_answer_loss": NaN, - "step": 648, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0786, - "grad_norm": 1.625, - "learning_rate": 1.210671024443699e-05, - "long_answer_loss": 0.0786, - "loss": 0.0752, - "short_answer_loss": NaN, - "step": 649, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0761, - "grad_norm": 1.640625, - "learning_rate": 1.2073948306984148e-05, - "long_answer_loss": 0.0761, - "loss": 0.0794, - "short_answer_loss": NaN, - "step": 650, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.093, - "grad_norm": 1.5390625, - "learning_rate": 1.2041189299399991e-05, - "long_answer_loss": 0.093, - "loss": 0.0855, - "short_answer_loss": NaN, - "step": 651, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0824, - "grad_norm": 1.5234375, - "learning_rate": 1.2008433446961384e-05, - "long_answer_loss": 0.0824, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 652, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0785, - "grad_norm": 1.53125, - "learning_rate": 1.1975680974923497e-05, - "long_answer_loss": 0.0785, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 653, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0799, - "grad_norm": 1.734375, - "learning_rate": 1.194293210851825e-05, - "long_answer_loss": 0.0799, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 654, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0642, - "grad_norm": 1.5078125, - "learning_rate": 1.1910187072952779e-05, - "long_answer_loss": 0.0642, - "loss": 0.0764, - "short_answer_loss": NaN, - "step": 655, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0772, - "grad_norm": 1.53125, - "learning_rate": 1.1877446093407861e-05, - "long_answer_loss": 0.0772, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 656, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0799, - "grad_norm": 1.578125, - "learning_rate": 1.184470939503639e-05, - "long_answer_loss": 0.0799, - "loss": 0.0781, - "short_answer_loss": NaN, - "step": 657, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0733, - "grad_norm": 1.734375, - "learning_rate": 1.1811977202961817e-05, - "long_answer_loss": 0.0733, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 658, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.089, - "grad_norm": 1.5390625, - "learning_rate": 1.1779249742276603e-05, - "long_answer_loss": 0.089, - "loss": 0.0743, - "short_answer_loss": NaN, - "step": 659, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0853, - "grad_norm": 1.59375, - "learning_rate": 1.1746527238040674e-05, - "long_answer_loss": 0.0853, - "loss": 0.0819, - "short_answer_loss": NaN, - "step": 660, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.069, - "grad_norm": 1.5703125, - "learning_rate": 1.171380991527987e-05, - "long_answer_loss": 0.069, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 661, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0942, - "grad_norm": 1.671875, - "learning_rate": 1.1681097998984401e-05, - "long_answer_loss": 0.0942, - "loss": 0.0812, - "short_answer_loss": NaN, - "step": 662, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0842, - "grad_norm": 1.5703125, - "learning_rate": 1.1648391714107295e-05, - "long_answer_loss": 0.0842, - "loss": 0.0781, - "short_answer_loss": NaN, - "step": 663, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0594, - "grad_norm": 1.5703125, - "learning_rate": 1.1615691285562857e-05, - "long_answer_loss": 0.0594, - "loss": 0.076, - "short_answer_loss": NaN, - "step": 664, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0882, - "grad_norm": 1.6640625, - "learning_rate": 1.1582996938225119e-05, - "long_answer_loss": 0.0882, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 665, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0722, - "grad_norm": 1.4921875, - "learning_rate": 1.1550308896926288e-05, - "long_answer_loss": 0.0722, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 666, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0663, - "grad_norm": 1.5546875, - "learning_rate": 1.1517627386455215e-05, - "long_answer_loss": 0.0663, - "loss": 0.0778, - "short_answer_loss": NaN, - "step": 667, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0864, - "grad_norm": 1.609375, - "learning_rate": 1.1484952631555834e-05, - "long_answer_loss": 0.0864, - "loss": 0.0764, - "short_answer_loss": NaN, - "step": 668, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0743, - "grad_norm": 1.484375, - "learning_rate": 1.1452284856925621e-05, - "long_answer_loss": 0.0743, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 669, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0793, - "grad_norm": 1.609375, - "learning_rate": 1.1419624287214057e-05, - "long_answer_loss": 0.0793, - "loss": 0.0777, - "short_answer_loss": NaN, - "step": 670, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0805, - "grad_norm": 1.6796875, - "learning_rate": 1.1386971147021067e-05, - "long_answer_loss": 0.0805, - "loss": 0.0801, - "short_answer_loss": NaN, - "step": 671, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0593, - "grad_norm": 1.5078125, - "learning_rate": 1.1354325660895496e-05, - "long_answer_loss": 0.0593, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 672, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0708, - "grad_norm": 1.609375, - "learning_rate": 1.132168805333354e-05, - "long_answer_loss": 0.0708, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 673, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0748, - "grad_norm": 1.5546875, - "learning_rate": 1.1289058548777229e-05, - "long_answer_loss": 0.0748, - "loss": 0.0804, - "short_answer_loss": NaN, - "step": 674, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.089, - "grad_norm": 1.46875, - "learning_rate": 1.125643737161286e-05, - "long_answer_loss": 0.089, - "loss": 0.0764, - "short_answer_loss": NaN, - "step": 675, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0728, - "grad_norm": 1.578125, - "learning_rate": 1.1223824746169472e-05, - "long_answer_loss": 0.0728, - "loss": 0.0801, - "short_answer_loss": NaN, - "step": 676, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0789, - "grad_norm": 1.4453125, - "learning_rate": 1.119122089671729e-05, - "long_answer_loss": 0.0789, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 677, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0764, - "grad_norm": 1.6484375, - "learning_rate": 1.1158626047466191e-05, - "long_answer_loss": 0.0764, - "loss": 0.0802, - "short_answer_loss": NaN, - "step": 678, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.085, - "grad_norm": 1.5, - "learning_rate": 1.112604042256416e-05, - "long_answer_loss": 0.085, - "loss": 0.0775, - "short_answer_loss": NaN, - "step": 679, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0571, - "grad_norm": 1.5234375, - "learning_rate": 1.1093464246095746e-05, - "long_answer_loss": 0.0571, - "loss": 0.0771, - "short_answer_loss": NaN, - "step": 680, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0718, - "grad_norm": 1.515625, - "learning_rate": 1.1060897742080525e-05, - "long_answer_loss": 0.0718, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 681, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0846, - "grad_norm": 1.65625, - "learning_rate": 1.1028341134471554e-05, - "long_answer_loss": 0.0846, - "loss": 0.0819, - "short_answer_loss": NaN, - "step": 682, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0797, - "grad_norm": 1.5625, - "learning_rate": 1.0995794647153842e-05, - "long_answer_loss": 0.0797, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 683, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.078, - "grad_norm": 1.6953125, - "learning_rate": 1.0963258503942795e-05, - "long_answer_loss": 0.078, - "loss": 0.0824, - "short_answer_loss": NaN, - "step": 684, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.1139, - "grad_norm": 1.4375, - "learning_rate": 1.0930732928582687e-05, - "long_answer_loss": 0.1139, - "loss": 0.0814, - "short_answer_loss": NaN, - "step": 685, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0878, - "grad_norm": 1.578125, - "learning_rate": 1.0898218144745123e-05, - "long_answer_loss": 0.0878, - "loss": 0.076, - "short_answer_loss": NaN, - "step": 686, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0792, - "grad_norm": 1.53125, - "learning_rate": 1.0865714376027488e-05, - "long_answer_loss": 0.0792, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 687, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0748, - "grad_norm": 1.640625, - "learning_rate": 1.0833221845951433e-05, - "long_answer_loss": 0.0748, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 688, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0648, - "grad_norm": 1.453125, - "learning_rate": 1.080074077796131e-05, - "long_answer_loss": 0.0648, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 689, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0664, - "grad_norm": 1.5234375, - "learning_rate": 1.0768271395422651e-05, - "long_answer_loss": 0.0664, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 690, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0841, - "grad_norm": 1.484375, - "learning_rate": 1.0735813921620634e-05, - "long_answer_loss": 0.0841, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 691, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.079, - "grad_norm": 1.6796875, - "learning_rate": 1.070336857975854e-05, - "long_answer_loss": 0.079, - "loss": 0.0814, - "short_answer_loss": NaN, - "step": 692, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0902, - "grad_norm": 1.65625, - "learning_rate": 1.0670935592956223e-05, - "long_answer_loss": 0.0902, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 693, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0867, - "grad_norm": 1.4609375, - "learning_rate": 1.0638515184248571e-05, - "long_answer_loss": 0.0867, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 694, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0694, - "grad_norm": 1.5, - "learning_rate": 1.0606107576583976e-05, - "long_answer_loss": 0.0694, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 695, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.1009, - "grad_norm": 1.59375, - "learning_rate": 1.0573712992822804e-05, - "long_answer_loss": 0.1009, - "loss": 0.0839, - "short_answer_loss": NaN, - "step": 696, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0949, - "grad_norm": 1.484375, - "learning_rate": 1.0541331655735853e-05, - "long_answer_loss": 0.0949, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 697, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0804, - "grad_norm": 1.5703125, - "learning_rate": 1.0508963788002827e-05, - "long_answer_loss": 0.0804, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 698, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0741, - "grad_norm": 1.5703125, - "learning_rate": 1.0476609612210808e-05, - "long_answer_loss": 0.0741, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 699, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0814, - "grad_norm": 1.65625, - "learning_rate": 1.0444269350852718e-05, - "long_answer_loss": 0.0814, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 700, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0724, - "grad_norm": 1.578125, - "learning_rate": 1.0411943226325793e-05, - "long_answer_loss": 0.0724, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 701, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.084, - "grad_norm": 1.546875, - "learning_rate": 1.0379631460930054e-05, - "long_answer_loss": 0.084, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 702, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0656, - "grad_norm": 1.671875, - "learning_rate": 1.0347334276866772e-05, - "long_answer_loss": 0.0656, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 703, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0907, - "grad_norm": 1.6953125, - "learning_rate": 1.0315051896236955e-05, - "long_answer_loss": 0.0907, - "loss": 0.0867, - "short_answer_loss": NaN, - "step": 704, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.073, - "grad_norm": 1.625, - "learning_rate": 1.0282784541039804e-05, - "long_answer_loss": 0.073, - "loss": 0.0808, - "short_answer_loss": NaN, - "step": 705, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0825, - "grad_norm": 1.6328125, - "learning_rate": 1.0250532433171194e-05, - "long_answer_loss": 0.0825, - "loss": 0.0803, - "short_answer_loss": NaN, - "step": 706, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0669, - "grad_norm": 1.53125, - "learning_rate": 1.0218295794422147e-05, - "long_answer_loss": 0.0669, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 707, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.08, - "grad_norm": 1.65625, - "learning_rate": 1.018607484647731e-05, - "long_answer_loss": 0.08, - "loss": 0.0771, - "short_answer_loss": NaN, - "step": 708, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0723, - "grad_norm": 1.5703125, - "learning_rate": 1.0153869810913424e-05, - "long_answer_loss": 0.0723, - "loss": 0.0799, - "short_answer_loss": NaN, - "step": 709, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0776, - "grad_norm": 1.6171875, - "learning_rate": 1.0121680909197809e-05, - "long_answer_loss": 0.0776, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 710, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0681, - "grad_norm": 1.546875, - "learning_rate": 1.0089508362686827e-05, - "long_answer_loss": 0.0681, - "loss": 0.0792, - "short_answer_loss": NaN, - "step": 711, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0799, - "grad_norm": 1.546875, - "learning_rate": 1.0057352392624377e-05, - "long_answer_loss": 0.0799, - "loss": 0.079, - "short_answer_loss": NaN, - "step": 712, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.074, - "grad_norm": 1.6015625, - "learning_rate": 1.0025213220140364e-05, - "long_answer_loss": 0.074, - "loss": 0.0752, - "short_answer_loss": NaN, - "step": 713, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0629, - "grad_norm": 1.578125, - "learning_rate": 9.993091066249174e-06, - "long_answer_loss": 0.0629, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 714, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0891, - "grad_norm": 1.5546875, - "learning_rate": 9.960986151848167e-06, - "long_answer_loss": 0.0891, - "loss": 0.0742, - "short_answer_loss": NaN, - "step": 715, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0686, - "grad_norm": 1.5, - "learning_rate": 9.928898697716147e-06, - "long_answer_loss": 0.0686, - "loss": 0.0675, - "short_answer_loss": NaN, - "step": 716, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0818, - "grad_norm": 1.453125, - "learning_rate": 9.896828924511845e-06, - "long_answer_loss": 0.0818, - "loss": 0.0782, - "short_answer_loss": NaN, - "step": 717, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0696, - "grad_norm": 1.53125, - "learning_rate": 9.864777052772407e-06, - "long_answer_loss": 0.0696, - "loss": 0.077, - "short_answer_loss": NaN, - "step": 718, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0777, - "grad_norm": 1.5546875, - "learning_rate": 9.832743302911876e-06, - "long_answer_loss": 0.0777, - "loss": 0.0785, - "short_answer_loss": NaN, - "step": 719, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0767, - "grad_norm": 1.5703125, - "learning_rate": 9.800727895219672e-06, - "long_answer_loss": 0.0767, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 720, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0671, - "grad_norm": 1.5, - "learning_rate": 9.768731049859073e-06, - "long_answer_loss": 0.0671, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 721, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.072, - "grad_norm": 1.546875, - "learning_rate": 9.736752986865727e-06, - "long_answer_loss": 0.072, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 722, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0713, - "grad_norm": 1.5078125, - "learning_rate": 9.704793926146102e-06, - "long_answer_loss": 0.0713, - "loss": 0.0699, - "short_answer_loss": NaN, - "step": 723, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0717, - "grad_norm": 1.484375, - "learning_rate": 9.672854087475997e-06, - "long_answer_loss": 0.0717, - "loss": 0.0706, - "short_answer_loss": NaN, - "step": 724, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.072, - "grad_norm": 1.5078125, - "learning_rate": 9.640933690499027e-06, - "long_answer_loss": 0.072, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 725, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0798, - "grad_norm": 1.53125, - "learning_rate": 9.609032954725104e-06, - "long_answer_loss": 0.0798, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 726, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0731, - "grad_norm": 1.4765625, - "learning_rate": 9.57715209952894e-06, - "long_answer_loss": 0.0731, - "loss": 0.069, - "short_answer_loss": NaN, - "step": 727, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0856, - "grad_norm": 1.4453125, - "learning_rate": 9.54529134414853e-06, - "long_answer_loss": 0.0856, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 728, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0758, - "grad_norm": 1.6875, - "learning_rate": 9.51345090768365e-06, - "long_answer_loss": 0.0758, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 729, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0776, - "grad_norm": 1.5234375, - "learning_rate": 9.481631009094341e-06, - "long_answer_loss": 0.0776, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 730, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0583, - "grad_norm": 1.5078125, - "learning_rate": 9.449831867199416e-06, - "long_answer_loss": 0.0583, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 731, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0661, - "grad_norm": 1.4921875, - "learning_rate": 9.418053700674944e-06, - "long_answer_loss": 0.0661, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 732, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0722, - "grad_norm": 1.6015625, - "learning_rate": 9.386296728052753e-06, - "long_answer_loss": 0.0722, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 733, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0594, - "grad_norm": 1.6328125, - "learning_rate": 9.354561167718922e-06, - "long_answer_loss": 0.0594, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 734, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0713, - "grad_norm": 1.65625, - "learning_rate": 9.322847237912288e-06, - "long_answer_loss": 0.0713, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 735, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.1066, - "grad_norm": 1.5859375, - "learning_rate": 9.29115515672293e-06, - "long_answer_loss": 0.1066, - "loss": 0.0806, - "short_answer_loss": NaN, - "step": 736, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0749, - "grad_norm": 1.609375, - "learning_rate": 9.25948514209069e-06, - "long_answer_loss": 0.0749, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 737, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0544, - "grad_norm": 1.5703125, - "learning_rate": 9.227837411803656e-06, - "long_answer_loss": 0.0544, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 738, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0895, - "grad_norm": 1.546875, - "learning_rate": 9.196212183496669e-06, - "long_answer_loss": 0.0895, - "loss": 0.0714, - "short_answer_loss": NaN, - "step": 739, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.1009, - "grad_norm": 1.546875, - "learning_rate": 9.164609674649835e-06, - "long_answer_loss": 0.1009, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 740, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0684, - "grad_norm": 1.6171875, - "learning_rate": 9.133030102587019e-06, - "long_answer_loss": 0.0684, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 741, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0647, - "grad_norm": 1.5703125, - "learning_rate": 9.101473684474354e-06, - "long_answer_loss": 0.0647, - "loss": 0.0785, - "short_answer_loss": NaN, - "step": 742, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0748, - "grad_norm": 1.4609375, - "learning_rate": 9.069940637318752e-06, - "long_answer_loss": 0.0748, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 743, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0662, - "grad_norm": 1.46875, - "learning_rate": 9.038431177966406e-06, - "long_answer_loss": 0.0662, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 744, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0857, - "grad_norm": 1.4453125, - "learning_rate": 9.006945523101295e-06, - "long_answer_loss": 0.0857, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 745, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0737, - "grad_norm": 1.484375, - "learning_rate": 8.975483889243709e-06, - "long_answer_loss": 0.0737, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 746, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0732, - "grad_norm": 1.4609375, - "learning_rate": 8.944046492748746e-06, - "long_answer_loss": 0.0732, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 747, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0683, - "grad_norm": 1.546875, - "learning_rate": 8.912633549804824e-06, - "long_answer_loss": 0.0683, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 748, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0797, - "grad_norm": 1.484375, - "learning_rate": 8.88124527643221e-06, - "long_answer_loss": 0.0797, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 749, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0627, - "grad_norm": 1.5, - "learning_rate": 8.849881888481513e-06, - "long_answer_loss": 0.0627, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 750, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0746, - "grad_norm": 1.5546875, - "learning_rate": 8.818543601632215e-06, - "long_answer_loss": 0.0746, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 751, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0773, - "grad_norm": 1.4765625, - "learning_rate": 8.787230631391185e-06, - "long_answer_loss": 0.0773, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 752, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0852, - "grad_norm": 1.5703125, - "learning_rate": 8.755943193091187e-06, - "long_answer_loss": 0.0852, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 753, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0669, - "grad_norm": 1.4765625, - "learning_rate": 8.724681501889413e-06, - "long_answer_loss": 0.0669, - "loss": 0.0706, - "short_answer_loss": NaN, - "step": 754, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0729, - "grad_norm": 1.609375, - "learning_rate": 8.693445772766003e-06, - "long_answer_loss": 0.0729, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 755, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0827, - "grad_norm": 1.453125, - "learning_rate": 8.662236220522554e-06, - "long_answer_loss": 0.0827, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 756, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0792, - "grad_norm": 1.5390625, - "learning_rate": 8.631053059780647e-06, - "long_answer_loss": 0.0792, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 757, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0687, - "grad_norm": 1.53125, - "learning_rate": 8.599896504980384e-06, - "long_answer_loss": 0.0687, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 758, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0661, - "grad_norm": 1.4296875, - "learning_rate": 8.568766770378892e-06, - "long_answer_loss": 0.0661, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 759, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0701, - "grad_norm": 1.5390625, - "learning_rate": 8.537664070048867e-06, - "long_answer_loss": 0.0701, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 760, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0782, - "grad_norm": 1.5078125, - "learning_rate": 8.506588617877102e-06, - "long_answer_loss": 0.0782, - "loss": 0.0786, - "short_answer_loss": NaN, - "step": 761, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0597, - "grad_norm": 1.5390625, - "learning_rate": 8.475540627563e-06, - "long_answer_loss": 0.0597, - "loss": 0.0724, - "short_answer_loss": NaN, - "step": 762, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0623, - "grad_norm": 1.5, - "learning_rate": 8.444520312617118e-06, - "long_answer_loss": 0.0623, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 763, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0693, - "grad_norm": 1.4921875, - "learning_rate": 8.413527886359695e-06, - "long_answer_loss": 0.0693, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 764, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0785, - "grad_norm": 1.4296875, - "learning_rate": 8.382563561919191e-06, - "long_answer_loss": 0.0785, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 765, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0677, - "grad_norm": 1.6171875, - "learning_rate": 8.351627552230806e-06, - "long_answer_loss": 0.0677, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 766, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0751, - "grad_norm": 1.609375, - "learning_rate": 8.320720070035035e-06, - "long_answer_loss": 0.0751, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 767, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0693, - "grad_norm": 1.53125, - "learning_rate": 8.289841327876183e-06, - "long_answer_loss": 0.0693, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 768, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0721, - "grad_norm": 1.5234375, - "learning_rate": 8.25899153810093e-06, - "long_answer_loss": 0.0721, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 769, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0587, - "grad_norm": 1.515625, - "learning_rate": 8.228170912856845e-06, - "long_answer_loss": 0.0587, - "loss": 0.0699, - "short_answer_loss": NaN, - "step": 770, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0852, - "grad_norm": 1.5078125, - "learning_rate": 8.197379664090947e-06, - "long_answer_loss": 0.0852, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 771, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0758, - "grad_norm": 1.6015625, - "learning_rate": 8.166618003548235e-06, - "long_answer_loss": 0.0758, - "loss": 0.0774, - "short_answer_loss": NaN, - "step": 772, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0672, - "grad_norm": 1.40625, - "learning_rate": 8.135886142770232e-06, - "long_answer_loss": 0.0672, - "loss": 0.0679, - "short_answer_loss": NaN, - "step": 773, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0588, - "grad_norm": 1.453125, - "learning_rate": 8.105184293093545e-06, - "long_answer_loss": 0.0588, - "loss": 0.0678, - "short_answer_loss": NaN, - "step": 774, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0543, - "grad_norm": 1.4765625, - "learning_rate": 8.074512665648392e-06, - "long_answer_loss": 0.0543, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 775, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.076, - "grad_norm": 1.515625, - "learning_rate": 8.04387147135716e-06, - "long_answer_loss": 0.076, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 776, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0697, - "grad_norm": 1.4765625, - "learning_rate": 8.013260920932957e-06, - "long_answer_loss": 0.0697, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 777, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.085, - "grad_norm": 1.46875, - "learning_rate": 7.982681224878157e-06, - "long_answer_loss": 0.085, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 778, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0754, - "grad_norm": 1.5546875, - "learning_rate": 7.952132593482956e-06, - "long_answer_loss": 0.0754, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 779, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0818, - "grad_norm": 1.484375, - "learning_rate": 7.921615236823924e-06, - "long_answer_loss": 0.0818, - "loss": 0.0676, - "short_answer_loss": NaN, - "step": 780, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0902, - "grad_norm": 1.53125, - "learning_rate": 7.891129364762559e-06, - "long_answer_loss": 0.0902, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 781, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0507, - "grad_norm": 1.484375, - "learning_rate": 7.860675186943853e-06, - "long_answer_loss": 0.0507, - "loss": 0.0654, - "short_answer_loss": NaN, - "step": 782, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0574, - "grad_norm": 1.5, - "learning_rate": 7.830252912794836e-06, - "long_answer_loss": 0.0574, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 783, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0782, - "grad_norm": 1.59375, - "learning_rate": 7.799862751523146e-06, - "long_answer_loss": 0.0782, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 784, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0597, - "grad_norm": 1.4765625, - "learning_rate": 7.769504912115588e-06, - "long_answer_loss": 0.0597, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 785, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0651, - "grad_norm": 1.46875, - "learning_rate": 7.739179603336696e-06, - "long_answer_loss": 0.0651, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 786, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0657, - "grad_norm": 1.4296875, - "learning_rate": 7.708887033727291e-06, - "long_answer_loss": 0.0657, - "loss": 0.0688, - "short_answer_loss": NaN, - "step": 787, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0599, - "grad_norm": 1.40625, - "learning_rate": 7.678627411603074e-06, - "long_answer_loss": 0.0599, - "loss": 0.0687, - "short_answer_loss": NaN, - "step": 788, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0741, - "grad_norm": 1.4921875, - "learning_rate": 7.648400945053146e-06, - "long_answer_loss": 0.0741, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 789, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0807, - "grad_norm": 1.5546875, - "learning_rate": 7.618207841938624e-06, - "long_answer_loss": 0.0807, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 790, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0706, - "grad_norm": 1.53125, - "learning_rate": 7.588048309891181e-06, - "long_answer_loss": 0.0706, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 791, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0788, - "grad_norm": 1.4609375, - "learning_rate": 7.557922556311634e-06, - "long_answer_loss": 0.0788, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 792, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0771, - "grad_norm": 1.4609375, - "learning_rate": 7.527830788368509e-06, - "long_answer_loss": 0.0771, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 793, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0605, - "grad_norm": 1.6484375, - "learning_rate": 7.497773212996623e-06, - "long_answer_loss": 0.0605, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 794, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0751, - "grad_norm": 1.59375, - "learning_rate": 7.467750036895657e-06, - "long_answer_loss": 0.0751, - "loss": 0.0742, - "short_answer_loss": NaN, - "step": 795, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0719, - "grad_norm": 1.4921875, - "learning_rate": 7.437761466528731e-06, - "long_answer_loss": 0.0719, - "loss": 0.0717, - "short_answer_loss": NaN, - "step": 796, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0624, - "grad_norm": 1.5234375, - "learning_rate": 7.407807708120998e-06, - "long_answer_loss": 0.0624, - "loss": 0.0686, - "short_answer_loss": NaN, - "step": 797, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0642, - "grad_norm": 1.4140625, - "learning_rate": 7.377888967658206e-06, - "long_answer_loss": 0.0642, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 798, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0696, - "grad_norm": 1.5390625, - "learning_rate": 7.348005450885301e-06, - "long_answer_loss": 0.0696, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 799, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0766, - "grad_norm": 1.5234375, - "learning_rate": 7.318157363304995e-06, - "long_answer_loss": 0.0766, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 800, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0667, - "grad_norm": 1.5390625, - "learning_rate": 7.288344910176365e-06, - "long_answer_loss": 0.0667, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 801, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0701, - "grad_norm": 1.5625, - "learning_rate": 7.258568296513439e-06, - "long_answer_loss": 0.0701, - "loss": 0.0686, - "short_answer_loss": NaN, - "step": 802, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0903, - "grad_norm": 1.6171875, - "learning_rate": 7.228827727083781e-06, - "long_answer_loss": 0.0903, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 803, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0726, - "grad_norm": 1.53125, - "learning_rate": 7.199123406407089e-06, - "long_answer_loss": 0.0726, - "loss": 0.0688, - "short_answer_loss": NaN, - "step": 804, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0762, - "grad_norm": 1.5, - "learning_rate": 7.169455538753783e-06, - "long_answer_loss": 0.0762, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 805, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0821, - "grad_norm": 1.453125, - "learning_rate": 7.139824328143604e-06, - "long_answer_loss": 0.0821, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 806, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0701, - "grad_norm": 1.4765625, - "learning_rate": 7.110229978344212e-06, - "long_answer_loss": 0.0701, - "loss": 0.0714, - "short_answer_loss": NaN, - "step": 807, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0829, - "grad_norm": 1.5625, - "learning_rate": 7.080672692869783e-06, - "long_answer_loss": 0.0829, - "loss": 0.0677, - "short_answer_loss": NaN, - "step": 808, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0703, - "grad_norm": 1.40625, - "learning_rate": 7.051152674979608e-06, - "long_answer_loss": 0.0703, - "loss": 0.0672, - "short_answer_loss": NaN, - "step": 809, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0737, - "grad_norm": 1.5078125, - "learning_rate": 7.0216701276766936e-06, - "long_answer_loss": 0.0737, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 810, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0751, - "grad_norm": 1.5546875, - "learning_rate": 6.992225253706374e-06, - "long_answer_loss": 0.0751, - "loss": 0.0699, - "short_answer_loss": NaN, - "step": 811, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0662, - "grad_norm": 1.625, - "learning_rate": 6.962818255554911e-06, - "long_answer_loss": 0.0662, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 812, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0629, - "grad_norm": 1.4765625, - "learning_rate": 6.9334493354480985e-06, - "long_answer_loss": 0.0629, - "loss": 0.0684, - "short_answer_loss": NaN, - "step": 813, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0756, - "grad_norm": 1.5234375, - "learning_rate": 6.904118695349882e-06, - "long_answer_loss": 0.0756, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 814, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0698, - "grad_norm": 1.5390625, - "learning_rate": 6.874826536960954e-06, - "long_answer_loss": 0.0698, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 815, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0696, - "grad_norm": 1.5078125, - "learning_rate": 6.845573061717387e-06, - "long_answer_loss": 0.0696, - "loss": 0.0777, - "short_answer_loss": NaN, - "step": 816, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0788, - "grad_norm": 1.5546875, - "learning_rate": 6.8163584707892306e-06, - "long_answer_loss": 0.0788, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 817, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0925, - "grad_norm": 1.6171875, - "learning_rate": 6.7871829650791365e-06, - "long_answer_loss": 0.0925, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 818, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0659, - "grad_norm": 1.5703125, - "learning_rate": 6.758046745220978e-06, - "long_answer_loss": 0.0659, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 819, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.063, - "grad_norm": 1.59375, - "learning_rate": 6.728950011578462e-06, - "long_answer_loss": 0.063, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 820, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0639, - "grad_norm": 1.578125, - "learning_rate": 6.6998929642437645e-06, - "long_answer_loss": 0.0639, - "loss": 0.0731, - "short_answer_loss": NaN, - "step": 821, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0546, - "grad_norm": 1.421875, - "learning_rate": 6.670875803036141e-06, - "long_answer_loss": 0.0546, - "loss": 0.07, - "short_answer_loss": NaN, - "step": 822, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0727, - "grad_norm": 1.5625, - "learning_rate": 6.64189872750056e-06, - "long_answer_loss": 0.0727, - "loss": 0.0686, - "short_answer_loss": NaN, - "step": 823, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0826, - "grad_norm": 1.4765625, - "learning_rate": 6.612961936906333e-06, - "long_answer_loss": 0.0826, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 824, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0692, - "grad_norm": 1.4453125, - "learning_rate": 6.584065630245734e-06, - "long_answer_loss": 0.0692, - "loss": 0.0688, - "short_answer_loss": NaN, - "step": 825, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0577, - "grad_norm": 1.4921875, - "learning_rate": 6.55521000623264e-06, - "long_answer_loss": 0.0577, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 826, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0838, - "grad_norm": 1.515625, - "learning_rate": 6.526395263301166e-06, - "long_answer_loss": 0.0838, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 827, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0701, - "grad_norm": 1.5078125, - "learning_rate": 6.497621599604292e-06, - "long_answer_loss": 0.0701, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 828, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.082, - "grad_norm": 1.5078125, - "learning_rate": 6.468889213012502e-06, - "long_answer_loss": 0.082, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 829, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0891, - "grad_norm": 1.4375, - "learning_rate": 6.440198301112434e-06, - "long_answer_loss": 0.0891, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 830, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0972, - "grad_norm": 1.5859375, - "learning_rate": 6.411549061205505e-06, - "long_answer_loss": 0.0972, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 831, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0513, - "grad_norm": 1.5703125, - "learning_rate": 6.382941690306568e-06, - "long_answer_loss": 0.0513, - "loss": 0.0653, - "short_answer_loss": NaN, - "step": 832, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0743, - "grad_norm": 1.4765625, - "learning_rate": 6.35437638514255e-06, - "long_answer_loss": 0.0743, - "loss": 0.0673, - "short_answer_loss": NaN, - "step": 833, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0729, - "grad_norm": 1.390625, - "learning_rate": 6.325853342151097e-06, - "long_answer_loss": 0.0729, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 834, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0751, - "grad_norm": 1.6484375, - "learning_rate": 6.2973727574792345e-06, - "long_answer_loss": 0.0751, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 835, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0665, - "grad_norm": 1.6640625, - "learning_rate": 6.2689348269820036e-06, - "long_answer_loss": 0.0665, - "loss": 0.0742, - "short_answer_loss": NaN, - "step": 836, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0658, - "grad_norm": 1.5078125, - "learning_rate": 6.240539746221127e-06, - "long_answer_loss": 0.0658, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 837, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0639, - "grad_norm": 1.4375, - "learning_rate": 6.212187710463654e-06, - "long_answer_loss": 0.0639, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 838, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0708, - "grad_norm": 1.5703125, - "learning_rate": 6.1838789146806254e-06, - "long_answer_loss": 0.0708, - "loss": 0.0681, - "short_answer_loss": NaN, - "step": 839, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0712, - "grad_norm": 1.5859375, - "learning_rate": 6.155613553545729e-06, - "long_answer_loss": 0.0712, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 840, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0751, - "grad_norm": 1.40625, - "learning_rate": 6.127391821433961e-06, - "long_answer_loss": 0.0751, - "loss": 0.0717, - "short_answer_loss": NaN, - "step": 841, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0764, - "grad_norm": 1.5546875, - "learning_rate": 6.0992139124202914e-06, - "long_answer_loss": 0.0764, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 842, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0701, - "grad_norm": 1.5703125, - "learning_rate": 6.071080020278326e-06, - "long_answer_loss": 0.0701, - "loss": 0.0706, - "short_answer_loss": NaN, - "step": 843, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0805, - "grad_norm": 1.609375, - "learning_rate": 6.0429903384789775e-06, - "long_answer_loss": 0.0805, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 844, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0748, - "grad_norm": 1.6640625, - "learning_rate": 6.0149450601891325e-06, - "long_answer_loss": 0.0748, - "loss": 0.0817, - "short_answer_loss": NaN, - "step": 845, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0626, - "grad_norm": 1.5390625, - "learning_rate": 5.986944378270323e-06, - "long_answer_loss": 0.0626, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 846, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0844, - "grad_norm": 1.578125, - "learning_rate": 5.958988485277401e-06, - "long_answer_loss": 0.0844, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 847, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0873, - "grad_norm": 1.71875, - "learning_rate": 5.93107757345722e-06, - "long_answer_loss": 0.0873, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 848, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0692, - "grad_norm": 1.453125, - "learning_rate": 5.9032118347472965e-06, - "long_answer_loss": 0.0692, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 849, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0654, - "grad_norm": 1.453125, - "learning_rate": 5.87539146077451e-06, - "long_answer_loss": 0.0654, - "loss": 0.0667, - "short_answer_loss": NaN, - "step": 850, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0774, - "grad_norm": 1.4765625, - "learning_rate": 5.847616642853773e-06, - "long_answer_loss": 0.0774, - "loss": 0.069, - "short_answer_loss": NaN, - "step": 851, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0641, - "grad_norm": 1.5234375, - "learning_rate": 5.81988757198672e-06, - "long_answer_loss": 0.0641, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 852, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0771, - "grad_norm": 1.5390625, - "learning_rate": 5.792204438860391e-06, - "long_answer_loss": 0.0771, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 853, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0661, - "grad_norm": 1.515625, - "learning_rate": 5.764567433845915e-06, - "long_answer_loss": 0.0661, - "loss": 0.07, - "short_answer_loss": NaN, - "step": 854, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0721, - "grad_norm": 1.5, - "learning_rate": 5.736976746997226e-06, - "long_answer_loss": 0.0721, - "loss": 0.069, - "short_answer_loss": NaN, - "step": 855, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0943, - "grad_norm": 1.453125, - "learning_rate": 5.709432568049722e-06, - "long_answer_loss": 0.0943, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 856, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0672, - "grad_norm": 1.515625, - "learning_rate": 5.681935086418978e-06, - "long_answer_loss": 0.0672, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 857, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0625, - "grad_norm": 1.6328125, - "learning_rate": 5.654484491199446e-06, - "long_answer_loss": 0.0625, - "loss": 0.0724, - "short_answer_loss": NaN, - "step": 858, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0597, - "grad_norm": 1.5, - "learning_rate": 5.627080971163146e-06, - "long_answer_loss": 0.0597, - "loss": 0.0674, - "short_answer_loss": NaN, - "step": 859, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0597, - "grad_norm": 1.4921875, - "learning_rate": 5.599724714758374e-06, - "long_answer_loss": 0.0597, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 860, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0699, - "grad_norm": 1.515625, - "learning_rate": 5.572415910108401e-06, - "long_answer_loss": 0.0699, - "loss": 0.0742, - "short_answer_loss": NaN, - "step": 861, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0782, - "grad_norm": 1.515625, - "learning_rate": 5.545154745010187e-06, - "long_answer_loss": 0.0782, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 862, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0752, - "grad_norm": 1.4375, - "learning_rate": 5.5179414069330786e-06, - "long_answer_loss": 0.0752, - "loss": 0.0714, - "short_answer_loss": NaN, - "step": 863, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.06, - "grad_norm": 1.375, - "learning_rate": 5.490776083017532e-06, - "long_answer_loss": 0.06, - "loss": 0.0691, - "short_answer_loss": NaN, - "step": 864, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0703, - "grad_norm": 1.5078125, - "learning_rate": 5.463658960073816e-06, - "long_answer_loss": 0.0703, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 865, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.063, - "grad_norm": 1.4140625, - "learning_rate": 5.436590224580733e-06, - "long_answer_loss": 0.063, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 866, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0656, - "grad_norm": 1.4296875, - "learning_rate": 5.409570062684334e-06, - "long_answer_loss": 0.0656, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 867, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0677, - "grad_norm": 1.4921875, - "learning_rate": 5.382598660196642e-06, - "long_answer_loss": 0.0677, - "loss": 0.0706, - "short_answer_loss": NaN, - "step": 868, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0569, - "grad_norm": 1.4375, - "learning_rate": 5.355676202594367e-06, - "long_answer_loss": 0.0569, - "loss": 0.0679, - "short_answer_loss": NaN, - "step": 869, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0708, - "grad_norm": 1.5390625, - "learning_rate": 5.3288028750176395e-06, - "long_answer_loss": 0.0708, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 870, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0767, - "grad_norm": 1.4609375, - "learning_rate": 5.301978862268733e-06, - "long_answer_loss": 0.0767, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 871, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0593, - "grad_norm": 1.4375, - "learning_rate": 5.275204348810789e-06, - "long_answer_loss": 0.0593, - "loss": 0.0681, - "short_answer_loss": NaN, - "step": 872, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0584, - "grad_norm": 1.3984375, - "learning_rate": 5.248479518766558e-06, - "long_answer_loss": 0.0584, - "loss": 0.0679, - "short_answer_loss": NaN, - "step": 873, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0605, - "grad_norm": 1.390625, - "learning_rate": 5.221804555917123e-06, - "long_answer_loss": 0.0605, - "loss": 0.0648, - "short_answer_loss": NaN, - "step": 874, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0637, - "grad_norm": 1.4140625, - "learning_rate": 5.195179643700646e-06, - "long_answer_loss": 0.0637, - "loss": 0.0661, - "short_answer_loss": NaN, - "step": 875, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0591, - "grad_norm": 1.5390625, - "learning_rate": 5.168604965211096e-06, - "long_answer_loss": 0.0591, - "loss": 0.0699, - "short_answer_loss": NaN, - "step": 876, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0751, - "grad_norm": 1.5234375, - "learning_rate": 5.142080703197e-06, - "long_answer_loss": 0.0751, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 877, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0624, - "grad_norm": 1.4765625, - "learning_rate": 5.115607040060177e-06, - "long_answer_loss": 0.0624, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 878, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0758, - "grad_norm": 1.4375, - "learning_rate": 5.089184157854491e-06, - "long_answer_loss": 0.0758, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 879, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.06, - "grad_norm": 1.578125, - "learning_rate": 5.0628122382845935e-06, - "long_answer_loss": 0.06, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 880, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.069, - "grad_norm": 1.4765625, - "learning_rate": 5.036491462704682e-06, - "long_answer_loss": 0.069, - "loss": 0.077, - "short_answer_loss": NaN, - "step": 881, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0749, - "grad_norm": 1.5625, - "learning_rate": 5.010222012117238e-06, - "long_answer_loss": 0.0749, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 882, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0579, - "grad_norm": 1.5234375, - "learning_rate": 4.984004067171803e-06, - "long_answer_loss": 0.0579, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 883, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0791, - "grad_norm": 1.4140625, - "learning_rate": 4.957837808163718e-06, - "long_answer_loss": 0.0791, - "loss": 0.0688, - "short_answer_loss": NaN, - "step": 884, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0586, - "grad_norm": 1.546875, - "learning_rate": 4.931723415032889e-06, - "long_answer_loss": 0.0586, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 885, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0948, - "grad_norm": 1.46875, - "learning_rate": 4.905661067362558e-06, - "long_answer_loss": 0.0948, - "loss": 0.0772, - "short_answer_loss": NaN, - "step": 886, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.072, - "grad_norm": 1.4140625, - "learning_rate": 4.87965094437805e-06, - "long_answer_loss": 0.072, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 887, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0702, - "grad_norm": 1.53125, - "learning_rate": 4.853693224945569e-06, - "long_answer_loss": 0.0702, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 888, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0709, - "grad_norm": 1.3671875, - "learning_rate": 4.827788087570936e-06, - "long_answer_loss": 0.0709, - "loss": 0.0689, - "short_answer_loss": NaN, - "step": 889, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0619, - "grad_norm": 1.4609375, - "learning_rate": 4.801935710398382e-06, - "long_answer_loss": 0.0619, - "loss": 0.0666, - "short_answer_loss": NaN, - "step": 890, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0726, - "grad_norm": 1.4375, - "learning_rate": 4.776136271209315e-06, - "long_answer_loss": 0.0726, - "loss": 0.0689, - "short_answer_loss": NaN, - "step": 891, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.074, - "grad_norm": 1.5390625, - "learning_rate": 4.750389947421101e-06, - "long_answer_loss": 0.074, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 892, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0644, - "grad_norm": 1.4375, - "learning_rate": 4.724696916085841e-06, - "long_answer_loss": 0.0644, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 893, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0673, - "grad_norm": 1.4296875, - "learning_rate": 4.699057353889157e-06, - "long_answer_loss": 0.0673, - "loss": 0.0686, - "short_answer_loss": NaN, - "step": 894, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0895, - "grad_norm": 1.5078125, - "learning_rate": 4.673471437148973e-06, - "long_answer_loss": 0.0895, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 895, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0645, - "grad_norm": 1.3828125, - "learning_rate": 4.64793934181431e-06, - "long_answer_loss": 0.0645, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 896, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0683, - "grad_norm": 1.34375, - "learning_rate": 4.6224612434640575e-06, - "long_answer_loss": 0.0683, - "loss": 0.0634, - "short_answer_loss": NaN, - "step": 897, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0634, - "grad_norm": 1.4453125, - "learning_rate": 4.597037317305788e-06, - "long_answer_loss": 0.0634, - "loss": 0.0654, - "short_answer_loss": NaN, - "step": 898, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0683, - "grad_norm": 1.453125, - "learning_rate": 4.571667738174547e-06, - "long_answer_loss": 0.0683, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 899, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0723, - "grad_norm": 1.5625, - "learning_rate": 4.546352680531639e-06, - "long_answer_loss": 0.0723, - "loss": 0.0689, - "short_answer_loss": NaN, - "step": 900, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0885, - "grad_norm": 1.5078125, - "learning_rate": 4.521092318463439e-06, - "long_answer_loss": 0.0885, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 901, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0772, - "grad_norm": 1.421875, - "learning_rate": 4.495886825680192e-06, - "long_answer_loss": 0.0772, - "loss": 0.0681, - "short_answer_loss": NaN, - "step": 902, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0694, - "grad_norm": 1.6171875, - "learning_rate": 4.470736375514818e-06, - "long_answer_loss": 0.0694, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 903, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0899, - "grad_norm": 1.5, - "learning_rate": 4.445641140921721e-06, - "long_answer_loss": 0.0899, - "loss": 0.068, - "short_answer_loss": NaN, - "step": 904, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0637, - "grad_norm": 1.5, - "learning_rate": 4.420601294475595e-06, - "long_answer_loss": 0.0637, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 905, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0602, - "grad_norm": 1.375, - "learning_rate": 4.395617008370248e-06, - "long_answer_loss": 0.0602, - "loss": 0.0677, - "short_answer_loss": NaN, - "step": 906, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0693, - "grad_norm": 1.4921875, - "learning_rate": 4.370688454417405e-06, - "long_answer_loss": 0.0693, - "loss": 0.0706, - "short_answer_loss": NaN, - "step": 907, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0855, - "grad_norm": 1.4921875, - "learning_rate": 4.345815804045539e-06, - "long_answer_loss": 0.0855, - "loss": 0.0667, - "short_answer_loss": NaN, - "step": 908, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0583, - "grad_norm": 1.3828125, - "learning_rate": 4.320999228298678e-06, - "long_answer_loss": 0.0583, - "loss": 0.064, - "short_answer_loss": NaN, - "step": 909, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0899, - "grad_norm": 1.4765625, - "learning_rate": 4.2962388978352435e-06, - "long_answer_loss": 0.0899, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 910, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0773, - "grad_norm": 1.515625, - "learning_rate": 4.271534982926864e-06, - "long_answer_loss": 0.0773, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 911, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0494, - "grad_norm": 1.421875, - "learning_rate": 4.246887653457216e-06, - "long_answer_loss": 0.0494, - "loss": 0.0607, - "short_answer_loss": NaN, - "step": 912, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0653, - "grad_norm": 1.4296875, - "learning_rate": 4.222297078920845e-06, - "long_answer_loss": 0.0653, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 913, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0765, - "grad_norm": 1.453125, - "learning_rate": 4.197763428422005e-06, - "long_answer_loss": 0.0765, - "loss": 0.0679, - "short_answer_loss": NaN, - "step": 914, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0643, - "grad_norm": 1.4453125, - "learning_rate": 4.173286870673498e-06, - "long_answer_loss": 0.0643, - "loss": 0.067, - "short_answer_loss": NaN, - "step": 915, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0814, - "grad_norm": 1.5078125, - "learning_rate": 4.148867573995511e-06, - "long_answer_loss": 0.0814, - "loss": 0.0765, - "short_answer_loss": NaN, - "step": 916, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0628, - "grad_norm": 1.4453125, - "learning_rate": 4.124505706314455e-06, - "long_answer_loss": 0.0628, - "loss": 0.0687, - "short_answer_loss": NaN, - "step": 917, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0651, - "grad_norm": 1.625, - "learning_rate": 4.100201435161817e-06, - "long_answer_loss": 0.0651, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 918, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.089, - "grad_norm": 1.375, - "learning_rate": 4.0759549276730025e-06, - "long_answer_loss": 0.089, - "loss": 0.0727, - "short_answer_loss": NaN, - "step": 919, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0845, - "grad_norm": 1.453125, - "learning_rate": 4.051766350586187e-06, - "long_answer_loss": 0.0845, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 920, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0693, - "grad_norm": 1.421875, - "learning_rate": 4.027635870241178e-06, - "long_answer_loss": 0.0693, - "loss": 0.0664, - "short_answer_loss": NaN, - "step": 921, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0697, - "grad_norm": 1.484375, - "learning_rate": 4.003563652578258e-06, - "long_answer_loss": 0.0697, - "loss": 0.07, - "short_answer_loss": NaN, - "step": 922, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0664, - "grad_norm": 1.453125, - "learning_rate": 3.9795498631370515e-06, - "long_answer_loss": 0.0664, - "loss": 0.0751, - "short_answer_loss": NaN, - "step": 923, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0788, - "grad_norm": 1.4921875, - "learning_rate": 3.9555946670553774e-06, - "long_answer_loss": 0.0788, - "loss": 0.0727, - "short_answer_loss": NaN, - "step": 924, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0861, - "grad_norm": 1.5, - "learning_rate": 3.931698229068131e-06, - "long_answer_loss": 0.0861, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 925, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0731, - "grad_norm": 1.4609375, - "learning_rate": 3.907860713506132e-06, - "long_answer_loss": 0.0731, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 926, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0647, - "grad_norm": 1.453125, - "learning_rate": 3.884082284295008e-06, - "long_answer_loss": 0.0647, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 927, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0623, - "grad_norm": 1.4296875, - "learning_rate": 3.860363104954059e-06, - "long_answer_loss": 0.0623, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 928, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0813, - "grad_norm": 1.4140625, - "learning_rate": 3.836703338595138e-06, - "long_answer_loss": 0.0813, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 929, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0863, - "grad_norm": 1.5625, - "learning_rate": 3.813103147921526e-06, - "long_answer_loss": 0.0863, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 930, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0466, - "grad_norm": 1.421875, - "learning_rate": 3.7895626952268155e-06, - "long_answer_loss": 0.0466, - "loss": 0.0638, - "short_answer_loss": NaN, - "step": 931, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0831, - "grad_norm": 1.484375, - "learning_rate": 3.766082142393791e-06, - "long_answer_loss": 0.0831, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 932, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0468, - "grad_norm": 1.484375, - "learning_rate": 3.7426616508933214e-06, - "long_answer_loss": 0.0468, - "loss": 0.0678, - "short_answer_loss": NaN, - "step": 933, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0818, - "grad_norm": 1.40625, - "learning_rate": 3.7193013817832454e-06, - "long_answer_loss": 0.0818, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 934, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0733, - "grad_norm": 1.484375, - "learning_rate": 3.696001495707263e-06, - "long_answer_loss": 0.0733, - "loss": 0.0689, - "short_answer_loss": NaN, - "step": 935, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.067, - "grad_norm": 1.40625, - "learning_rate": 3.672762152893834e-06, - "long_answer_loss": 0.067, - "loss": 0.0668, - "short_answer_loss": NaN, - "step": 936, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0583, - "grad_norm": 1.53125, - "learning_rate": 3.6495835131550748e-06, - "long_answer_loss": 0.0583, - "loss": 0.0686, - "short_answer_loss": NaN, - "step": 937, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0655, - "grad_norm": 1.4609375, - "learning_rate": 3.6264657358856604e-06, - "long_answer_loss": 0.0655, - "loss": 0.067, - "short_answer_loss": NaN, - "step": 938, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0497, - "grad_norm": 1.4296875, - "learning_rate": 3.603408980061726e-06, - "long_answer_loss": 0.0497, - "loss": 0.0665, - "short_answer_loss": NaN, - "step": 939, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0841, - "grad_norm": 1.3984375, - "learning_rate": 3.5804134042397743e-06, - "long_answer_loss": 0.0841, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 940, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0685, - "grad_norm": 1.3984375, - "learning_rate": 3.5574791665555882e-06, - "long_answer_loss": 0.0685, - "loss": 0.0672, - "short_answer_loss": NaN, - "step": 941, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.071, - "grad_norm": 1.484375, - "learning_rate": 3.5346064247231387e-06, - "long_answer_loss": 0.071, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 942, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.059, - "grad_norm": 1.4609375, - "learning_rate": 3.511795336033505e-06, - "long_answer_loss": 0.059, - "loss": 0.0691, - "short_answer_loss": NaN, - "step": 943, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0613, - "grad_norm": 1.375, - "learning_rate": 3.489046057353787e-06, - "long_answer_loss": 0.0613, - "loss": 0.0661, - "short_answer_loss": NaN, - "step": 944, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0751, - "grad_norm": 1.59375, - "learning_rate": 3.466358745126033e-06, - "long_answer_loss": 0.0751, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 945, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0732, - "grad_norm": 1.5546875, - "learning_rate": 3.4437335553661605e-06, - "long_answer_loss": 0.0732, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 946, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0718, - "grad_norm": 1.4765625, - "learning_rate": 3.421170643662884e-06, - "long_answer_loss": 0.0718, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 947, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0718, - "grad_norm": 1.421875, - "learning_rate": 3.3986701651766426e-06, - "long_answer_loss": 0.0718, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 948, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.071, - "grad_norm": 1.453125, - "learning_rate": 3.3762322746385383e-06, - "long_answer_loss": 0.071, - "loss": 0.0695, - "short_answer_loss": NaN, - "step": 949, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0776, - "grad_norm": 1.53125, - "learning_rate": 3.353857126349265e-06, - "long_answer_loss": 0.0776, - "loss": 0.0751, - "short_answer_loss": NaN, - "step": 950, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0624, - "grad_norm": 1.5625, - "learning_rate": 3.3315448741780566e-06, - "long_answer_loss": 0.0624, - "loss": 0.0659, - "short_answer_loss": NaN, - "step": 951, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0799, - "grad_norm": 1.359375, - "learning_rate": 3.309295671561617e-06, - "long_answer_loss": 0.0799, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 952, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.079, - "grad_norm": 1.390625, - "learning_rate": 3.287109671503079e-06, - "long_answer_loss": 0.079, - "loss": 0.0684, - "short_answer_loss": NaN, - "step": 953, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0607, - "grad_norm": 1.390625, - "learning_rate": 3.2649870265709314e-06, - "long_answer_loss": 0.0607, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 954, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0714, - "grad_norm": 1.4296875, - "learning_rate": 3.2429278888980034e-06, - "long_answer_loss": 0.0714, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 955, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0825, - "grad_norm": 1.515625, - "learning_rate": 3.220932410180383e-06, - "long_answer_loss": 0.0825, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 956, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.088, - "grad_norm": 1.5, - "learning_rate": 3.1990007416763904e-06, - "long_answer_loss": 0.088, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 957, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0535, - "grad_norm": 1.375, - "learning_rate": 3.1771330342055387e-06, - "long_answer_loss": 0.0535, - "loss": 0.0677, - "short_answer_loss": NaN, - "step": 958, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0861, - "grad_norm": 1.46875, - "learning_rate": 3.1553294381474946e-06, - "long_answer_loss": 0.0861, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 959, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0687, - "grad_norm": 1.3984375, - "learning_rate": 3.133590103441042e-06, - "long_answer_loss": 0.0687, - "loss": 0.069, - "short_answer_loss": NaN, - "step": 960, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.066, - "grad_norm": 1.3515625, - "learning_rate": 3.1119151795830567e-06, - "long_answer_loss": 0.066, - "loss": 0.0673, - "short_answer_loss": NaN, - "step": 961, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0574, - "grad_norm": 1.4453125, - "learning_rate": 3.0903048156274707e-06, - "long_answer_loss": 0.0574, - "loss": 0.0683, - "short_answer_loss": NaN, - "step": 962, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.068, - "grad_norm": 1.4609375, - "learning_rate": 3.0687591601842524e-06, - "long_answer_loss": 0.068, - "loss": 0.0683, - "short_answer_loss": NaN, - "step": 963, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.056, - "grad_norm": 1.375, - "learning_rate": 3.047278361418382e-06, - "long_answer_loss": 0.056, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 964, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0583, - "grad_norm": 1.4453125, - "learning_rate": 3.0258625670488373e-06, - "long_answer_loss": 0.0583, - "loss": 0.0677, - "short_answer_loss": NaN, - "step": 965, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0657, - "grad_norm": 1.515625, - "learning_rate": 3.0045119243475696e-06, - "long_answer_loss": 0.0657, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 966, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0676, - "grad_norm": 1.4140625, - "learning_rate": 2.9832265801385e-06, - "long_answer_loss": 0.0676, - "loss": 0.0658, - "short_answer_loss": NaN, - "step": 967, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0581, - "grad_norm": 1.640625, - "learning_rate": 2.962006680796503e-06, - "long_answer_loss": 0.0581, - "loss": 0.0727, - "short_answer_loss": NaN, - "step": 968, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.093, - "grad_norm": 1.375, - "learning_rate": 2.940852372246404e-06, - "long_answer_loss": 0.093, - "loss": 0.062, - "short_answer_loss": NaN, - "step": 969, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0712, - "grad_norm": 1.546875, - "learning_rate": 2.9197637999619733e-06, - "long_answer_loss": 0.0712, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 970, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0704, - "grad_norm": 1.4765625, - "learning_rate": 2.898741108964925e-06, - "long_answer_loss": 0.0704, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 971, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0738, - "grad_norm": 1.3984375, - "learning_rate": 2.877784443823926e-06, - "long_answer_loss": 0.0738, - "loss": 0.068, - "short_answer_loss": NaN, - "step": 972, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0691, - "grad_norm": 1.5625, - "learning_rate": 2.856893948653591e-06, - "long_answer_loss": 0.0691, - "loss": 0.0727, - "short_answer_loss": NaN, - "step": 973, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0732, - "grad_norm": 1.421875, - "learning_rate": 2.836069767113503e-06, - "long_answer_loss": 0.0732, - "loss": 0.0667, - "short_answer_loss": NaN, - "step": 974, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0636, - "grad_norm": 1.4765625, - "learning_rate": 2.8153120424072156e-06, - "long_answer_loss": 0.0636, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 975, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.087, - "grad_norm": 1.453125, - "learning_rate": 2.794620917281278e-06, - "long_answer_loss": 0.087, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 976, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0716, - "grad_norm": 1.3046875, - "learning_rate": 2.773996534024241e-06, - "long_answer_loss": 0.0716, - "loss": 0.0605, - "short_answer_loss": NaN, - "step": 977, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0541, - "grad_norm": 1.421875, - "learning_rate": 2.753439034465695e-06, - "long_answer_loss": 0.0541, - "loss": 0.0657, - "short_answer_loss": NaN, - "step": 978, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0486, - "grad_norm": 1.3671875, - "learning_rate": 2.732948559975271e-06, - "long_answer_loss": 0.0486, - "loss": 0.062, - "short_answer_loss": NaN, - "step": 979, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0765, - "grad_norm": 1.4296875, - "learning_rate": 2.7125252514616966e-06, - "long_answer_loss": 0.0765, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 980, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0601, - "grad_norm": 1.3984375, - "learning_rate": 2.692169249371804e-06, - "long_answer_loss": 0.0601, - "loss": 0.0653, - "short_answer_loss": NaN, - "step": 981, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0817, - "grad_norm": 1.375, - "learning_rate": 2.6718806936895796e-06, - "long_answer_loss": 0.0817, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 982, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.066, - "grad_norm": 1.4453125, - "learning_rate": 2.651659723935189e-06, - "long_answer_loss": 0.066, - "loss": 0.0659, - "short_answer_loss": NaN, - "step": 983, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0578, - "grad_norm": 1.4609375, - "learning_rate": 2.6315064791640296e-06, - "long_answer_loss": 0.0578, - "loss": 0.0668, - "short_answer_loss": NaN, - "step": 984, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0679, - "grad_norm": 1.6796875, - "learning_rate": 2.6114210979657606e-06, - "long_answer_loss": 0.0679, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 985, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.064, - "grad_norm": 1.4296875, - "learning_rate": 2.5914037184633656e-06, - "long_answer_loss": 0.064, - "loss": 0.064, - "short_answer_loss": NaN, - "step": 986, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0693, - "grad_norm": 1.453125, - "learning_rate": 2.571454478312185e-06, - "long_answer_loss": 0.0693, - "loss": 0.0659, - "short_answer_loss": NaN, - "step": 987, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0723, - "grad_norm": 1.46875, - "learning_rate": 2.5515735146989933e-06, - "long_answer_loss": 0.0723, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 988, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0792, - "grad_norm": 1.5703125, - "learning_rate": 2.531760964341029e-06, - "long_answer_loss": 0.0792, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 989, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0577, - "grad_norm": 1.5234375, - "learning_rate": 2.5120169634850713e-06, - "long_answer_loss": 0.0577, - "loss": 0.0652, - "short_answer_loss": NaN, - "step": 990, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0653, - "grad_norm": 1.3984375, - "learning_rate": 2.4923416479064987e-06, - "long_answer_loss": 0.0653, - "loss": 0.064, - "short_answer_loss": NaN, - "step": 991, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0761, - "grad_norm": 1.421875, - "learning_rate": 2.4727351529083536e-06, - "long_answer_loss": 0.0761, - "loss": 0.0687, - "short_answer_loss": NaN, - "step": 992, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0722, - "grad_norm": 1.34375, - "learning_rate": 2.4531976133204184e-06, - "long_answer_loss": 0.0722, - "loss": 0.0642, - "short_answer_loss": NaN, - "step": 993, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0715, - "grad_norm": 1.34375, - "learning_rate": 2.4337291634982757e-06, - "long_answer_loss": 0.0715, - "loss": 0.0659, - "short_answer_loss": NaN, - "step": 994, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0635, - "grad_norm": 1.421875, - "learning_rate": 2.4143299373224015e-06, - "long_answer_loss": 0.0635, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 995, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0864, - "grad_norm": 1.4609375, - "learning_rate": 2.3950000681972284e-06, - "long_answer_loss": 0.0864, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 996, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0716, - "grad_norm": 1.4375, - "learning_rate": 2.3757396890502382e-06, - "long_answer_loss": 0.0716, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 997, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0498, - "grad_norm": 1.3828125, - "learning_rate": 2.3565489323310402e-06, - "long_answer_loss": 0.0498, - "loss": 0.0667, - "short_answer_loss": NaN, - "step": 998, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0565, - "grad_norm": 1.4296875, - "learning_rate": 2.3374279300104733e-06, - "long_answer_loss": 0.0565, - "loss": 0.0625, - "short_answer_loss": NaN, - "step": 999, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0556, - "grad_norm": 1.4375, - "learning_rate": 2.31837681357968e-06, - "long_answer_loss": 0.0556, - "loss": 0.0653, - "short_answer_loss": NaN, - "step": 1000, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0831, - "grad_norm": 1.5078125, - "learning_rate": 2.2993957140492197e-06, - "long_answer_loss": 0.0831, - "loss": 0.067, - "short_answer_loss": NaN, - "step": 1001, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0598, - "grad_norm": 1.4140625, - "learning_rate": 2.2804847619481552e-06, - "long_answer_loss": 0.0598, - "loss": 0.0646, - "short_answer_loss": NaN, - "step": 1002, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0641, - "grad_norm": 1.6171875, - "learning_rate": 2.2616440873231655e-06, - "long_answer_loss": 0.0641, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 1003, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0806, - "grad_norm": 1.5078125, - "learning_rate": 2.2428738197376397e-06, - "long_answer_loss": 0.0806, - "loss": 0.067, - "short_answer_loss": NaN, - "step": 1004, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0572, - "grad_norm": 1.3984375, - "learning_rate": 2.224174088270796e-06, - "long_answer_loss": 0.0572, - "loss": 0.0665, - "short_answer_loss": NaN, - "step": 1005, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0618, - "grad_norm": 1.53125, - "learning_rate": 2.20554502151679e-06, - "long_answer_loss": 0.0618, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 1006, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0638, - "grad_norm": 1.4375, - "learning_rate": 2.1869867475838317e-06, - "long_answer_loss": 0.0638, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 1007, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0662, - "grad_norm": 1.390625, - "learning_rate": 2.1684993940933013e-06, - "long_answer_loss": 0.0662, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 1008, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0761, - "grad_norm": 1.4765625, - "learning_rate": 2.150083088178875e-06, - "long_answer_loss": 0.0761, - "loss": 0.0658, - "short_answer_loss": NaN, - "step": 1009, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0672, - "grad_norm": 1.546875, - "learning_rate": 2.131737956485652e-06, - "long_answer_loss": 0.0672, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 1010, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0695, - "grad_norm": 1.4140625, - "learning_rate": 2.113464125169276e-06, - "long_answer_loss": 0.0695, - "loss": 0.0657, - "short_answer_loss": NaN, - "step": 1011, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0669, - "grad_norm": 1.4296875, - "learning_rate": 2.0952617198950765e-06, - "long_answer_loss": 0.0669, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 1012, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0679, - "grad_norm": 1.4921875, - "learning_rate": 2.0771308658372015e-06, - "long_answer_loss": 0.0679, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 1013, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0656, - "grad_norm": 1.5, - "learning_rate": 2.059071687677755e-06, - "long_answer_loss": 0.0656, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 1014, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0591, - "grad_norm": 1.515625, - "learning_rate": 2.0410843096059394e-06, - "long_answer_loss": 0.0591, - "loss": 0.068, - "short_answer_loss": NaN, - "step": 1015, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0654, - "grad_norm": 1.5625, - "learning_rate": 2.0231688553172064e-06, - "long_answer_loss": 0.0654, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 1016, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0745, - "grad_norm": 1.5234375, - "learning_rate": 2.0053254480123977e-06, - "long_answer_loss": 0.0745, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 1017, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0657, - "grad_norm": 1.4453125, - "learning_rate": 1.9875542103969094e-06, - "long_answer_loss": 0.0657, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 1018, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0704, - "grad_norm": 1.46875, - "learning_rate": 1.969855264679836e-06, - "long_answer_loss": 0.0704, - "loss": 0.0684, - "short_answer_loss": NaN, - "step": 1019, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.07, - "grad_norm": 1.4375, - "learning_rate": 1.9522287325731357e-06, - "long_answer_loss": 0.07, - "loss": 0.0666, - "short_answer_loss": NaN, - "step": 1020, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.061, - "grad_norm": 1.3984375, - "learning_rate": 1.934674735290802e-06, - "long_answer_loss": 0.061, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 1021, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0564, - "grad_norm": 1.4765625, - "learning_rate": 1.91719339354801e-06, - "long_answer_loss": 0.0564, - "loss": 0.0674, - "short_answer_loss": NaN, - "step": 1022, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0543, - "grad_norm": 1.453125, - "learning_rate": 1.8997848275603067e-06, - "long_answer_loss": 0.0543, - "loss": 0.066, - "short_answer_loss": NaN, - "step": 1023, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0813, - "grad_norm": 1.4921875, - "learning_rate": 1.8824491570427676e-06, - "long_answer_loss": 0.0813, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 1024, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0644, - "grad_norm": 1.390625, - "learning_rate": 1.8651865012091888e-06, - "long_answer_loss": 0.0644, - "loss": 0.0631, - "short_answer_loss": NaN, - "step": 1025, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0845, - "grad_norm": 1.3671875, - "learning_rate": 1.8479969787712533e-06, - "long_answer_loss": 0.0845, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 1026, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0604, - "grad_norm": 1.6953125, - "learning_rate": 1.830880707937725e-06, - "long_answer_loss": 0.0604, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 1027, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0618, - "grad_norm": 1.5078125, - "learning_rate": 1.8138378064136318e-06, - "long_answer_loss": 0.0618, - "loss": 0.068, - "short_answer_loss": NaN, - "step": 1028, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0637, - "grad_norm": 1.4140625, - "learning_rate": 1.796868391399452e-06, - "long_answer_loss": 0.0637, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 1029, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0725, - "grad_norm": 1.5625, - "learning_rate": 1.7799725795903193e-06, - "long_answer_loss": 0.0725, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 1030, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0685, - "grad_norm": 1.390625, - "learning_rate": 1.7631504871752066e-06, - "long_answer_loss": 0.0685, - "loss": 0.0648, - "short_answer_loss": NaN, - "step": 1031, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0777, - "grad_norm": 1.4375, - "learning_rate": 1.7464022298361374e-06, - "long_answer_loss": 0.0777, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 1032, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0717, - "grad_norm": 1.4375, - "learning_rate": 1.7297279227473874e-06, - "long_answer_loss": 0.0717, - "loss": 0.0673, - "short_answer_loss": NaN, - "step": 1033, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0736, - "grad_norm": 1.515625, - "learning_rate": 1.7131276805746902e-06, - "long_answer_loss": 0.0736, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 1034, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0637, - "grad_norm": 1.421875, - "learning_rate": 1.6966016174744499e-06, - "long_answer_loss": 0.0637, - "loss": 0.0688, - "short_answer_loss": NaN, - "step": 1035, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0729, - "grad_norm": 1.453125, - "learning_rate": 1.68014984709296e-06, - "long_answer_loss": 0.0729, - "loss": 0.0668, - "short_answer_loss": NaN, - "step": 1036, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0574, - "grad_norm": 1.3828125, - "learning_rate": 1.6637724825656147e-06, - "long_answer_loss": 0.0574, - "loss": 0.0669, - "short_answer_loss": NaN, - "step": 1037, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0873, - "grad_norm": 1.40625, - "learning_rate": 1.6474696365161358e-06, - "long_answer_loss": 0.0873, - "loss": 0.0667, - "short_answer_loss": NaN, - "step": 1038, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.064, - "grad_norm": 1.53125, - "learning_rate": 1.6312414210557972e-06, - "long_answer_loss": 0.064, - "loss": 0.0688, - "short_answer_loss": NaN, - "step": 1039, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0635, - "grad_norm": 1.4609375, - "learning_rate": 1.615087947782655e-06, - "long_answer_loss": 0.0635, - "loss": 0.0644, - "short_answer_loss": NaN, - "step": 1040, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.073, - "grad_norm": 1.4296875, - "learning_rate": 1.5990093277807775e-06, - "long_answer_loss": 0.073, - "loss": 0.0657, - "short_answer_loss": NaN, - "step": 1041, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0586, - "grad_norm": 1.40625, - "learning_rate": 1.583005671619482e-06, - "long_answer_loss": 0.0586, - "loss": 0.0662, - "short_answer_loss": NaN, - "step": 1042, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0633, - "grad_norm": 1.4375, - "learning_rate": 1.5670770893525768e-06, - "long_answer_loss": 0.0633, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 1043, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0764, - "grad_norm": 1.4375, - "learning_rate": 1.5512236905176018e-06, - "long_answer_loss": 0.0764, - "loss": 0.0657, - "short_answer_loss": NaN, - "step": 1044, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0818, - "grad_norm": 1.515625, - "learning_rate": 1.5354455841350756e-06, - "long_answer_loss": 0.0818, - "loss": 0.0619, - "short_answer_loss": NaN, - "step": 1045, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0697, - "grad_norm": 1.4453125, - "learning_rate": 1.5197428787077472e-06, - "long_answer_loss": 0.0697, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 1046, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0705, - "grad_norm": 1.421875, - "learning_rate": 1.5041156822198492e-06, - "long_answer_loss": 0.0705, - "loss": 0.0626, - "short_answer_loss": NaN, - "step": 1047, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0695, - "grad_norm": 1.5, - "learning_rate": 1.4885641021363541e-06, - "long_answer_loss": 0.0695, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 1048, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0676, - "grad_norm": 1.453125, - "learning_rate": 1.4730882454022362e-06, - "long_answer_loss": 0.0676, - "loss": 0.066, - "short_answer_loss": NaN, - "step": 1049, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0787, - "grad_norm": 1.5546875, - "learning_rate": 1.457688218441737e-06, - "long_answer_loss": 0.0787, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 1050, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0595, - "grad_norm": 1.515625, - "learning_rate": 1.442364127157632e-06, - "long_answer_loss": 0.0595, - "loss": 0.0766, - "short_answer_loss": NaN, - "step": 1051, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0641, - "grad_norm": 1.453125, - "learning_rate": 1.4271160769305014e-06, - "long_answer_loss": 0.0641, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 1052, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0668, - "grad_norm": 1.359375, - "learning_rate": 1.4119441726180085e-06, - "long_answer_loss": 0.0668, - "loss": 0.064, - "short_answer_loss": NaN, - "step": 1053, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0754, - "grad_norm": 1.3203125, - "learning_rate": 1.396848518554178e-06, - "long_answer_loss": 0.0754, - "loss": 0.0658, - "short_answer_loss": NaN, - "step": 1054, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0611, - "grad_norm": 1.4609375, - "learning_rate": 1.3818292185486749e-06, - "long_answer_loss": 0.0611, - "loss": 0.066, - "short_answer_loss": NaN, - "step": 1055, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0653, - "grad_norm": 1.453125, - "learning_rate": 1.366886375886095e-06, - "long_answer_loss": 0.0653, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 1056, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0558, - "grad_norm": 1.5546875, - "learning_rate": 1.3520200933252542e-06, - "long_answer_loss": 0.0558, - "loss": 0.0654, - "short_answer_loss": NaN, - "step": 1057, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0661, - "grad_norm": 1.5703125, - "learning_rate": 1.337230473098476e-06, - "long_answer_loss": 0.0661, - "loss": 0.07, - "short_answer_loss": NaN, - "step": 1058, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0689, - "grad_norm": 1.609375, - "learning_rate": 1.322517616910897e-06, - "long_answer_loss": 0.0689, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 1059, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0685, - "grad_norm": 1.4296875, - "learning_rate": 1.3078816259397635e-06, - "long_answer_loss": 0.0685, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 1060, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0617, - "grad_norm": 1.4453125, - "learning_rate": 1.2933226008337324e-06, - "long_answer_loss": 0.0617, - "loss": 0.066, - "short_answer_loss": NaN, - "step": 1061, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0637, - "grad_norm": 1.4140625, - "learning_rate": 1.2788406417121867e-06, - "long_answer_loss": 0.0637, - "loss": 0.0643, - "short_answer_loss": NaN, - "step": 1062, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0675, - "grad_norm": 1.5, - "learning_rate": 1.2644358481645399e-06, - "long_answer_loss": 0.0675, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 1063, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0769, - "grad_norm": 1.3515625, - "learning_rate": 1.2501083192495544e-06, - "long_answer_loss": 0.0769, - "loss": 0.0676, - "short_answer_loss": NaN, - "step": 1064, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0555, - "grad_norm": 1.46875, - "learning_rate": 1.2358581534946594e-06, - "long_answer_loss": 0.0555, - "loss": 0.0682, - "short_answer_loss": NaN, - "step": 1065, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0875, - "grad_norm": 1.484375, - "learning_rate": 1.2216854488952753e-06, - "long_answer_loss": 0.0875, - "loss": 0.0691, - "short_answer_loss": NaN, - "step": 1066, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0669, - "grad_norm": 1.453125, - "learning_rate": 1.2075903029141384e-06, - "long_answer_loss": 0.0669, - "loss": 0.065, - "short_answer_loss": NaN, - "step": 1067, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0809, - "grad_norm": 1.4375, - "learning_rate": 1.193572812480627e-06, - "long_answer_loss": 0.0809, - "loss": 0.0714, - "short_answer_loss": NaN, - "step": 1068, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0661, - "grad_norm": 1.3515625, - "learning_rate": 1.1796330739901024e-06, - "long_answer_loss": 0.0661, - "loss": 0.0676, - "short_answer_loss": NaN, - "step": 1069, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0572, - "grad_norm": 1.484375, - "learning_rate": 1.1657711833032394e-06, - "long_answer_loss": 0.0572, - "loss": 0.0651, - "short_answer_loss": NaN, - "step": 1070, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0708, - "grad_norm": 1.5078125, - "learning_rate": 1.1519872357453734e-06, - "long_answer_loss": 0.0708, - "loss": 0.0682, - "short_answer_loss": NaN, - "step": 1071, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0714, - "grad_norm": 1.5390625, - "learning_rate": 1.1382813261058349e-06, - "long_answer_loss": 0.0714, - "loss": 0.0686, - "short_answer_loss": NaN, - "step": 1072, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0728, - "grad_norm": 1.375, - "learning_rate": 1.124653548637311e-06, - "long_answer_loss": 0.0728, - "loss": 0.0677, - "short_answer_loss": NaN, - "step": 1073, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0528, - "grad_norm": 1.4765625, - "learning_rate": 1.111103997055185e-06, - "long_answer_loss": 0.0528, - "loss": 0.0657, - "short_answer_loss": NaN, - "step": 1074, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0611, - "grad_norm": 1.375, - "learning_rate": 1.0976327645368975e-06, - "long_answer_loss": 0.0611, - "loss": 0.0662, - "short_answer_loss": NaN, - "step": 1075, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0783, - "grad_norm": 1.515625, - "learning_rate": 1.0842399437213103e-06, - "long_answer_loss": 0.0783, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 1076, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0896, - "grad_norm": 1.5859375, - "learning_rate": 1.0709256267080566e-06, - "long_answer_loss": 0.0896, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 1077, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0789, - "grad_norm": 1.4765625, - "learning_rate": 1.0576899050569204e-06, - "long_answer_loss": 0.0789, - "loss": 0.0686, - "short_answer_loss": NaN, - "step": 1078, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0569, - "grad_norm": 1.5, - "learning_rate": 1.0445328697872015e-06, - "long_answer_loss": 0.0569, - "loss": 0.0667, - "short_answer_loss": NaN, - "step": 1079, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0676, - "grad_norm": 1.5234375, - "learning_rate": 1.0314546113770876e-06, - "long_answer_loss": 0.0676, - "loss": 0.0672, - "short_answer_loss": NaN, - "step": 1080, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0694, - "grad_norm": 1.5546875, - "learning_rate": 1.018455219763037e-06, - "long_answer_loss": 0.0694, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 1081, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0741, - "grad_norm": 1.453125, - "learning_rate": 1.0055347843391557e-06, - "long_answer_loss": 0.0741, - "loss": 0.066, - "short_answer_loss": NaN, - "step": 1082, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0622, - "grad_norm": 1.4765625, - "learning_rate": 9.92693393956584e-07, - "long_answer_loss": 0.0622, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 1083, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0568, - "grad_norm": 1.4140625, - "learning_rate": 9.79931136922889e-07, - "long_answer_loss": 0.0568, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 1084, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0558, - "grad_norm": 1.3671875, - "learning_rate": 9.672481010014486e-07, - "long_answer_loss": 0.0558, - "loss": 0.0639, - "short_answer_loss": NaN, - "step": 1085, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0688, - "grad_norm": 1.4609375, - "learning_rate": 9.54644373410861e-07, - "long_answer_loss": 0.0688, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 1086, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.1051, - "grad_norm": 1.546875, - "learning_rate": 9.421200408243277e-07, - "long_answer_loss": 0.1051, - "loss": 0.0781, - "short_answer_loss": NaN, - "step": 1087, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0607, - "grad_norm": 1.421875, - "learning_rate": 9.296751893690808e-07, - "long_answer_loss": 0.0607, - "loss": 0.0668, - "short_answer_loss": NaN, - "step": 1088, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0595, - "grad_norm": 1.4765625, - "learning_rate": 9.173099046257655e-07, - "long_answer_loss": 0.0595, - "loss": 0.067, - "short_answer_loss": NaN, - "step": 1089, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0611, - "grad_norm": 1.53125, - "learning_rate": 9.050242716278676e-07, - "long_answer_loss": 0.0611, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 1090, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0559, - "grad_norm": 1.4765625, - "learning_rate": 8.928183748611263e-07, - "long_answer_loss": 0.0559, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 1091, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0727, - "grad_norm": 1.3984375, - "learning_rate": 8.806922982629473e-07, - "long_answer_loss": 0.0727, - "loss": 0.0642, - "short_answer_loss": NaN, - "step": 1092, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0722, - "grad_norm": 1.5390625, - "learning_rate": 8.686461252218323e-07, - "long_answer_loss": 0.0722, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 1093, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0591, - "grad_norm": 1.3671875, - "learning_rate": 8.566799385768015e-07, - "long_answer_loss": 0.0591, - "loss": 0.0672, - "short_answer_loss": NaN, - "step": 1094, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0665, - "grad_norm": 1.3828125, - "learning_rate": 8.447938206168279e-07, - "long_answer_loss": 0.0665, - "loss": 0.0645, - "short_answer_loss": NaN, - "step": 1095, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0767, - "grad_norm": 1.4296875, - "learning_rate": 8.329878530802665e-07, - "long_answer_loss": 0.0767, - "loss": 0.0677, - "short_answer_loss": NaN, - "step": 1096, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.058, - "grad_norm": 1.375, - "learning_rate": 8.21262117154295e-07, - "long_answer_loss": 0.058, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 1097, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0668, - "grad_norm": 1.4140625, - "learning_rate": 8.096166934743549e-07, - "long_answer_loss": 0.0668, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 1098, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0738, - "grad_norm": 1.625, - "learning_rate": 7.980516621235973e-07, - "long_answer_loss": 0.0738, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 1099, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0578, - "grad_norm": 1.421875, - "learning_rate": 7.865671026323323e-07, - "long_answer_loss": 0.0578, - "loss": 0.0638, - "short_answer_loss": NaN, - "step": 1100, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0657, - "grad_norm": 1.40625, - "learning_rate": 7.751630939774823e-07, - "long_answer_loss": 0.0657, - "loss": 0.0627, - "short_answer_loss": NaN, - "step": 1101, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0713, - "grad_norm": 1.453125, - "learning_rate": 7.638397145820361e-07, - "long_answer_loss": 0.0713, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 1102, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0662, - "grad_norm": 1.4921875, - "learning_rate": 7.525970423145165e-07, - "long_answer_loss": 0.0662, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 1103, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0626, - "grad_norm": 1.515625, - "learning_rate": 7.414351544884332e-07, - "long_answer_loss": 0.0626, - "loss": 0.0699, - "short_answer_loss": NaN, - "step": 1104, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0612, - "grad_norm": 1.5546875, - "learning_rate": 7.303541278617654e-07, - "long_answer_loss": 0.0612, - "loss": 0.0668, - "short_answer_loss": NaN, - "step": 1105, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0803, - "grad_norm": 1.46875, - "learning_rate": 7.193540386364203e-07, - "long_answer_loss": 0.0803, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 1106, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.1016, - "grad_norm": 1.4765625, - "learning_rate": 7.084349624577213e-07, - "long_answer_loss": 0.1016, - "loss": 0.0779, - "short_answer_loss": NaN, - "step": 1107, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0583, - "grad_norm": 1.3828125, - "learning_rate": 6.975969744138791e-07, - "long_answer_loss": 0.0583, - "loss": 0.0642, - "short_answer_loss": NaN, - "step": 1108, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0548, - "grad_norm": 1.4921875, - "learning_rate": 6.868401490354767e-07, - "long_answer_loss": 0.0548, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 1109, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0653, - "grad_norm": 1.53125, - "learning_rate": 6.761645602949618e-07, - "long_answer_loss": 0.0653, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 1110, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0604, - "grad_norm": 1.4140625, - "learning_rate": 6.655702816061316e-07, - "long_answer_loss": 0.0604, - "loss": 0.0645, - "short_answer_loss": NaN, - "step": 1111, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0587, - "grad_norm": 1.46875, - "learning_rate": 6.55057385823632e-07, - "long_answer_loss": 0.0587, - "loss": 0.0689, - "short_answer_loss": NaN, - "step": 1112, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0708, - "grad_norm": 1.3984375, - "learning_rate": 6.446259452424566e-07, - "long_answer_loss": 0.0708, - "loss": 0.0687, - "short_answer_loss": NaN, - "step": 1113, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0642, - "grad_norm": 1.5546875, - "learning_rate": 6.342760315974485e-07, - "long_answer_loss": 0.0642, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 1114, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0649, - "grad_norm": 1.46875, - "learning_rate": 6.240077160628063e-07, - "long_answer_loss": 0.0649, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 1115, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0659, - "grad_norm": 1.5859375, - "learning_rate": 6.138210692515939e-07, - "long_answer_loss": 0.0659, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 1116, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0616, - "grad_norm": 1.3984375, - "learning_rate": 6.037161612152606e-07, - "long_answer_loss": 0.0616, - "loss": 0.0654, - "short_answer_loss": NaN, - "step": 1117, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0791, - "grad_norm": 1.4140625, - "learning_rate": 5.936930614431499e-07, - "long_answer_loss": 0.0791, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 1118, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0768, - "grad_norm": 1.6328125, - "learning_rate": 5.837518388620317e-07, - "long_answer_loss": 0.0768, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 1119, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0762, - "grad_norm": 1.3515625, - "learning_rate": 5.738925618356206e-07, - "long_answer_loss": 0.0762, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 1120, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0957, - "grad_norm": 1.4140625, - "learning_rate": 5.641152981641084e-07, - "long_answer_loss": 0.0957, - "loss": 0.0706, - "short_answer_loss": NaN, - "step": 1121, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0673, - "grad_norm": 1.4296875, - "learning_rate": 5.544201150837023e-07, - "long_answer_loss": 0.0673, - "loss": 0.0661, - "short_answer_loss": NaN, - "step": 1122, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0617, - "grad_norm": 1.4609375, - "learning_rate": 5.448070792661533e-07, - "long_answer_loss": 0.0617, - "loss": 0.067, - "short_answer_loss": NaN, - "step": 1123, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.072, - "grad_norm": 1.4140625, - "learning_rate": 5.352762568183067e-07, - "long_answer_loss": 0.072, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 1124, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0719, - "grad_norm": 1.53125, - "learning_rate": 5.258277132816388e-07, - "long_answer_loss": 0.0719, - "loss": 0.0691, - "short_answer_loss": NaN, - "step": 1125, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0673, - "grad_norm": 1.4375, - "learning_rate": 5.164615136318163e-07, - "long_answer_loss": 0.0673, - "loss": 0.0663, - "short_answer_loss": NaN, - "step": 1126, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0696, - "grad_norm": 1.515625, - "learning_rate": 5.071777222782417e-07, - "long_answer_loss": 0.0696, - "loss": 0.0656, - "short_answer_loss": NaN, - "step": 1127, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0713, - "grad_norm": 1.328125, - "learning_rate": 4.979764030636116e-07, - "long_answer_loss": 0.0713, - "loss": 0.0634, - "short_answer_loss": NaN, - "step": 1128, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0627, - "grad_norm": 1.421875, - "learning_rate": 4.888576192634817e-07, - "long_answer_loss": 0.0627, - "loss": 0.0661, - "short_answer_loss": NaN, - "step": 1129, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0769, - "grad_norm": 1.515625, - "learning_rate": 4.798214335858267e-07, - "long_answer_loss": 0.0769, - "loss": 0.066, - "short_answer_loss": NaN, - "step": 1130, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0609, - "grad_norm": 1.71875, - "learning_rate": 4.708679081706136e-07, - "long_answer_loss": 0.0609, - "loss": 0.0662, - "short_answer_loss": NaN, - "step": 1131, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0563, - "grad_norm": 1.5234375, - "learning_rate": 4.6199710458936644e-07, - "long_answer_loss": 0.0563, - "loss": 0.0627, - "short_answer_loss": NaN, - "step": 1132, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0636, - "grad_norm": 1.4375, - "learning_rate": 4.532090838447564e-07, - "long_answer_loss": 0.0636, - "loss": 0.0637, - "short_answer_loss": NaN, - "step": 1133, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0653, - "grad_norm": 1.53125, - "learning_rate": 4.4450390637016946e-07, - "long_answer_loss": 0.0653, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 1134, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0794, - "grad_norm": 1.4375, - "learning_rate": 4.358816320292947e-07, - "long_answer_loss": 0.0794, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 1135, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0536, - "grad_norm": 1.453125, - "learning_rate": 4.273423201157159e-07, - "long_answer_loss": 0.0536, - "loss": 0.0651, - "short_answer_loss": NaN, - "step": 1136, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0628, - "grad_norm": 1.46875, - "learning_rate": 4.1888602935250267e-07, - "long_answer_loss": 0.0628, - "loss": 0.0664, - "short_answer_loss": NaN, - "step": 1137, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0555, - "grad_norm": 1.421875, - "learning_rate": 4.105128178918033e-07, - "long_answer_loss": 0.0555, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 1138, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0888, - "grad_norm": 1.4453125, - "learning_rate": 4.022227433144468e-07, - "long_answer_loss": 0.0888, - "loss": 0.0665, - "short_answer_loss": NaN, - "step": 1139, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0606, - "grad_norm": 1.4453125, - "learning_rate": 3.940158626295501e-07, - "long_answer_loss": 0.0606, - "loss": 0.0662, - "short_answer_loss": NaN, - "step": 1140, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0557, - "grad_norm": 1.4609375, - "learning_rate": 3.858922322741182e-07, - "long_answer_loss": 0.0557, - "loss": 0.0664, - "short_answer_loss": NaN, - "step": 1141, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0565, - "grad_norm": 1.390625, - "learning_rate": 3.778519081126641e-07, - "long_answer_loss": 0.0565, - "loss": 0.0645, - "short_answer_loss": NaN, - "step": 1142, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0817, - "grad_norm": 1.578125, - "learning_rate": 3.698949454368231e-07, - "long_answer_loss": 0.0817, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 1143, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0748, - "grad_norm": 1.421875, - "learning_rate": 3.620213989649679e-07, - "long_answer_loss": 0.0748, - "loss": 0.0667, - "short_answer_loss": NaN, - "step": 1144, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0776, - "grad_norm": 1.5234375, - "learning_rate": 3.542313228418359e-07, - "long_answer_loss": 0.0776, - "loss": 0.0682, - "short_answer_loss": NaN, - "step": 1145, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0691, - "grad_norm": 1.375, - "learning_rate": 3.4652477063815833e-07, - "long_answer_loss": 0.0691, - "loss": 0.0669, - "short_answer_loss": NaN, - "step": 1146, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0666, - "grad_norm": 1.5234375, - "learning_rate": 3.3890179535028544e-07, - "long_answer_loss": 0.0666, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1147, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0666, - "grad_norm": 1.359375, - "learning_rate": 3.313624493998316e-07, - "long_answer_loss": 0.0666, - "loss": 0.0599, - "short_answer_loss": NaN, - "step": 1148, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0711, - "grad_norm": 1.453125, - "learning_rate": 3.2390678463330713e-07, - "long_answer_loss": 0.0711, - "loss": 0.0667, - "short_answer_loss": NaN, - "step": 1149, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0645, - "grad_norm": 1.40625, - "learning_rate": 3.165348523217634e-07, - "long_answer_loss": 0.0645, - "loss": 0.0642, - "short_answer_loss": NaN, - "step": 1150, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0717, - "grad_norm": 1.4296875, - "learning_rate": 3.092467031604443e-07, - "long_answer_loss": 0.0717, - "loss": 0.0687, - "short_answer_loss": NaN, - "step": 1151, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.077, - "grad_norm": 1.421875, - "learning_rate": 3.0204238726842945e-07, - "long_answer_loss": 0.077, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 1152, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0727, - "grad_norm": 1.4765625, - "learning_rate": 2.9492195418829997e-07, - "long_answer_loss": 0.0727, - "loss": 0.0674, - "short_answer_loss": NaN, - "step": 1153, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0613, - "grad_norm": 1.40625, - "learning_rate": 2.878854528857888e-07, - "long_answer_loss": 0.0613, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 1154, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0614, - "grad_norm": 1.375, - "learning_rate": 2.8093293174944883e-07, - "long_answer_loss": 0.0614, - "loss": 0.0649, - "short_answer_loss": NaN, - "step": 1155, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0862, - "grad_norm": 1.515625, - "learning_rate": 2.740644385903199e-07, - "long_answer_loss": 0.0862, - "loss": 0.0664, - "short_answer_loss": NaN, - "step": 1156, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0653, - "grad_norm": 1.4921875, - "learning_rate": 2.672800206415971e-07, - "long_answer_loss": 0.0653, - "loss": 0.0659, - "short_answer_loss": NaN, - "step": 1157, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0598, - "grad_norm": 1.4375, - "learning_rate": 2.605797245583075e-07, - "long_answer_loss": 0.0598, - "loss": 0.0659, - "short_answer_loss": NaN, - "step": 1158, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0685, - "grad_norm": 1.453125, - "learning_rate": 2.5396359641699093e-07, - "long_answer_loss": 0.0685, - "loss": 0.0667, - "short_answer_loss": NaN, - "step": 1159, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0711, - "grad_norm": 1.4765625, - "learning_rate": 2.474316817153821e-07, - "long_answer_loss": 0.0711, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 1160, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0771, - "grad_norm": 1.4296875, - "learning_rate": 2.4098402537209577e-07, - "long_answer_loss": 0.0771, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 1161, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0707, - "grad_norm": 1.4375, - "learning_rate": 2.3462067172632246e-07, - "long_answer_loss": 0.0707, - "loss": 0.065, - "short_answer_loss": NaN, - "step": 1162, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.072, - "grad_norm": 1.4765625, - "learning_rate": 2.2834166453751805e-07, - "long_answer_loss": 0.072, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 1163, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0784, - "grad_norm": 1.46875, - "learning_rate": 2.2214704698510503e-07, - "long_answer_loss": 0.0784, - "loss": 0.0681, - "short_answer_loss": NaN, - "step": 1164, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0633, - "grad_norm": 1.4140625, - "learning_rate": 2.160368616681785e-07, - "long_answer_loss": 0.0633, - "loss": 0.062, - "short_answer_loss": NaN, - "step": 1165, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0567, - "grad_norm": 1.4609375, - "learning_rate": 2.1001115060520772e-07, - "long_answer_loss": 0.0567, - "loss": 0.0656, - "short_answer_loss": NaN, - "step": 1166, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0799, - "grad_norm": 1.359375, - "learning_rate": 2.04069955233753e-07, - "long_answer_loss": 0.0799, - "loss": 0.0628, - "short_answer_loss": NaN, - "step": 1167, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0707, - "grad_norm": 1.5546875, - "learning_rate": 1.9821331641017572e-07, - "long_answer_loss": 0.0707, - "loss": 0.07, - "short_answer_loss": NaN, - "step": 1168, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0638, - "grad_norm": 1.375, - "learning_rate": 1.9244127440936066e-07, - "long_answer_loss": 0.0638, - "loss": 0.0649, - "short_answer_loss": NaN, - "step": 1169, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0837, - "grad_norm": 1.515625, - "learning_rate": 1.8675386892443858e-07, - "long_answer_loss": 0.0837, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 1170, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0755, - "grad_norm": 1.5390625, - "learning_rate": 1.8115113906650856e-07, - "long_answer_loss": 0.0755, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 1171, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0655, - "grad_norm": 1.4453125, - "learning_rate": 1.7563312336437848e-07, - "long_answer_loss": 0.0655, - "loss": 0.0668, - "short_answer_loss": NaN, - "step": 1172, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0614, - "grad_norm": 1.4296875, - "learning_rate": 1.7019985976429174e-07, - "long_answer_loss": 0.0614, - "loss": 0.0653, - "short_answer_loss": NaN, - "step": 1173, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0657, - "grad_norm": 1.4296875, - "learning_rate": 1.6485138562966906e-07, - "long_answer_loss": 0.0657, - "loss": 0.0665, - "short_answer_loss": NaN, - "step": 1174, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0609, - "grad_norm": 1.421875, - "learning_rate": 1.5958773774085166e-07, - "long_answer_loss": 0.0609, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 1175, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0635, - "grad_norm": 1.3515625, - "learning_rate": 1.5440895229485026e-07, - "long_answer_loss": 0.0635, - "loss": 0.0637, - "short_answer_loss": NaN, - "step": 1176, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0547, - "grad_norm": 1.5078125, - "learning_rate": 1.493150649050923e-07, - "long_answer_loss": 0.0547, - "loss": 0.068, - "short_answer_loss": NaN, - "step": 1177, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0559, - "grad_norm": 1.53125, - "learning_rate": 1.4430611060117922e-07, - "long_answer_loss": 0.0559, - "loss": 0.0781, - "short_answer_loss": NaN, - "step": 1178, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0682, - "grad_norm": 1.4921875, - "learning_rate": 1.3938212382864497e-07, - "long_answer_loss": 0.0682, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 1179, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0589, - "grad_norm": 1.453125, - "learning_rate": 1.345431384487214e-07, - "long_answer_loss": 0.0589, - "loss": 0.0646, - "short_answer_loss": NaN, - "step": 1180, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.073, - "grad_norm": 1.4296875, - "learning_rate": 1.2978918773810243e-07, - "long_answer_loss": 0.073, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 1181, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.06, - "grad_norm": 1.453125, - "learning_rate": 1.251203043887164e-07, - "long_answer_loss": 0.06, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 1182, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0635, - "grad_norm": 1.515625, - "learning_rate": 1.2053652050749846e-07, - "long_answer_loss": 0.0635, - "loss": 0.0673, - "short_answer_loss": NaN, - "step": 1183, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0641, - "grad_norm": 1.5, - "learning_rate": 1.160378676161783e-07, - "long_answer_loss": 0.0641, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 1184, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0841, - "grad_norm": 1.453125, - "learning_rate": 1.1162437665105108e-07, - "long_answer_loss": 0.0841, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 1185, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0664, - "grad_norm": 1.390625, - "learning_rate": 1.0729607796277629e-07, - "long_answer_loss": 0.0664, - "loss": 0.0669, - "short_answer_loss": NaN, - "step": 1186, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0783, - "grad_norm": 1.3515625, - "learning_rate": 1.0305300131616125e-07, - "long_answer_loss": 0.0783, - "loss": 0.0656, - "short_answer_loss": NaN, - "step": 1187, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0659, - "grad_norm": 1.3671875, - "learning_rate": 9.889517588995839e-08, - "long_answer_loss": 0.0659, - "loss": 0.0679, - "short_answer_loss": NaN, - "step": 1188, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0664, - "grad_norm": 1.5234375, - "learning_rate": 9.482263027666833e-08, - "long_answer_loss": 0.0664, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 1189, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0586, - "grad_norm": 1.3359375, - "learning_rate": 9.083539248233852e-08, - "long_answer_loss": 0.0586, - "loss": 0.0622, - "short_answer_loss": NaN, - "step": 1190, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0623, - "grad_norm": 1.4453125, - "learning_rate": 8.693348992637046e-08, - "long_answer_loss": 0.0623, - "loss": 0.0672, - "short_answer_loss": NaN, - "step": 1191, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0905, - "grad_norm": 1.4921875, - "learning_rate": 8.311694944133502e-08, - "long_answer_loss": 0.0905, - "loss": 0.0699, - "short_answer_loss": NaN, - "step": 1192, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0673, - "grad_norm": 1.484375, - "learning_rate": 7.938579727278517e-08, - "long_answer_loss": 0.0673, - "loss": 0.068, - "short_answer_loss": NaN, - "step": 1193, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0676, - "grad_norm": 1.5078125, - "learning_rate": 7.574005907907966e-08, - "long_answer_loss": 0.0676, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 1194, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0656, - "grad_norm": 1.3984375, - "learning_rate": 7.217975993119713e-08, - "long_answer_loss": 0.0656, - "loss": 0.0641, - "short_answer_loss": NaN, - "step": 1195, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0809, - "grad_norm": 1.390625, - "learning_rate": 6.87049243125723e-08, - "long_answer_loss": 0.0809, - "loss": 0.0661, - "short_answer_loss": NaN, - "step": 1196, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0564, - "grad_norm": 1.484375, - "learning_rate": 6.531557611892669e-08, - "long_answer_loss": 0.0564, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 1197, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0766, - "grad_norm": 1.4140625, - "learning_rate": 6.201173865810207e-08, - "long_answer_loss": 0.0766, - "loss": 0.0637, - "short_answer_loss": NaN, - "step": 1198, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0868, - "grad_norm": 1.8046875, - "learning_rate": 5.879343464989806e-08, - "long_answer_loss": 0.0868, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 1199, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0726, - "grad_norm": 1.4453125, - "learning_rate": 5.566068622592235e-08, - "long_answer_loss": 0.0726, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 1200, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0771, - "grad_norm": 1.5, - "learning_rate": 5.261351492943101e-08, - "long_answer_loss": 0.0771, - "loss": 0.0651, - "short_answer_loss": NaN, - "step": 1201, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0595, - "grad_norm": 1.53125, - "learning_rate": 4.965194171518833e-08, - "long_answer_loss": 0.0595, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 1202, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0644, - "grad_norm": 1.4609375, - "learning_rate": 4.677598694931285e-08, - "long_answer_loss": 0.0644, - "loss": 0.0731, - "short_answer_loss": NaN, - "step": 1203, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0676, - "grad_norm": 1.453125, - "learning_rate": 4.3985670409148196e-08, - "long_answer_loss": 0.0676, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 1204, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0634, - "grad_norm": 1.390625, - "learning_rate": 4.128101128312023e-08, - "long_answer_loss": 0.0634, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 1205, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0868, - "grad_norm": 1.484375, - "learning_rate": 3.866202817060377e-08, - "long_answer_loss": 0.0868, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 1206, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0788, - "grad_norm": 1.4453125, - "learning_rate": 3.612873908180048e-08, - "long_answer_loss": 0.0788, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 1207, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.073, - "grad_norm": 1.609375, - "learning_rate": 3.3681161437612575e-08, - "long_answer_loss": 0.073, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 1208, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0587, - "grad_norm": 1.421875, - "learning_rate": 3.131931206951933e-08, - "long_answer_loss": 0.0587, - "loss": 0.064, - "short_answer_loss": NaN, - "step": 1209, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.053, - "grad_norm": 1.5390625, - "learning_rate": 2.9043207219468795e-08, - "long_answer_loss": 0.053, - "loss": 0.0679, - "short_answer_loss": NaN, - "step": 1210, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0482, - "grad_norm": 1.5625, - "learning_rate": 2.6852862539757106e-08, - "long_answer_loss": 0.0482, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 1211, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0736, - "grad_norm": 1.40625, - "learning_rate": 2.4748293092931308e-08, - "long_answer_loss": 0.0736, - "loss": 0.067, - "short_answer_loss": NaN, - "step": 1212, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0611, - "grad_norm": 1.46875, - "learning_rate": 2.2729513351672783e-08, - "long_answer_loss": 0.0611, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 1213, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0569, - "grad_norm": 1.4609375, - "learning_rate": 2.0796537198712608e-08, - "long_answer_loss": 0.0569, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 1214, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0782, - "grad_norm": 1.4375, - "learning_rate": 1.894937792672191e-08, - "long_answer_loss": 0.0782, - "loss": 0.0714, - "short_answer_loss": NaN, - "step": 1215, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0704, - "grad_norm": 1.515625, - "learning_rate": 1.7188048238232778e-08, - "long_answer_loss": 0.0704, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 1216, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0611, - "grad_norm": 1.40625, - "learning_rate": 1.5512560245541097e-08, - "long_answer_loss": 0.0611, - "loss": 0.0676, - "short_answer_loss": NaN, - "step": 1217, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.062, - "grad_norm": 1.5, - "learning_rate": 1.3922925470627458e-08, - "long_answer_loss": 0.062, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 1218, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0516, - "grad_norm": 1.4375, - "learning_rate": 1.2419154845079439e-08, - "long_answer_loss": 0.0516, - "loss": 0.0623, - "short_answer_loss": NaN, - "step": 1219, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0778, - "grad_norm": 1.4765625, - "learning_rate": 1.1001258710015283e-08, - "long_answer_loss": 0.0778, - "loss": 0.0637, - "short_answer_loss": NaN, - "step": 1220, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0571, - "grad_norm": 1.484375, - "learning_rate": 9.669246816010335e-09, - "long_answer_loss": 0.0571, - "loss": 0.0676, - "short_answer_loss": NaN, - "step": 1221, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0721, - "grad_norm": 1.4140625, - "learning_rate": 8.423128323033213e-09, - "long_answer_loss": 0.0721, - "loss": 0.0687, - "short_answer_loss": NaN, - "step": 1222, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0665, - "grad_norm": 1.515625, - "learning_rate": 7.262911800379191e-09, - "long_answer_loss": 0.0665, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 1223, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0723, - "grad_norm": 1.5078125, - "learning_rate": 6.188605226618849e-09, - "long_answer_loss": 0.0723, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 1224, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0667, - "grad_norm": 1.390625, - "learning_rate": 5.200215989531465e-09, - "long_answer_loss": 0.0667, - "loss": 0.0653, - "short_answer_loss": NaN, - "step": 1225, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.061, - "grad_norm": 1.421875, - "learning_rate": 4.297750886064766e-09, - "long_answer_loss": 0.061, - "loss": 0.0637, - "short_answer_loss": NaN, - "step": 1226, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0719, - "grad_norm": 1.484375, - "learning_rate": 3.481216122284969e-09, - "long_answer_loss": 0.0719, - "loss": 0.0714, - "short_answer_loss": NaN, - "step": 1227, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0724, - "grad_norm": 1.421875, - "learning_rate": 2.7506173133282075e-09, - "long_answer_loss": 0.0724, - "loss": 0.0656, - "short_answer_loss": NaN, - "step": 1228, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0666, - "grad_norm": 1.515625, - "learning_rate": 2.105959483371389e-09, - "long_answer_loss": 0.0666, - "loss": 0.0666, - "short_answer_loss": NaN, - "step": 1229, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0949, - "grad_norm": 1.4375, - "learning_rate": 1.547247065593338e-09, - "long_answer_loss": 0.0949, - "loss": 0.0695, - "short_answer_loss": NaN, - "step": 1230, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0807, - "grad_norm": 1.4296875, - "learning_rate": 1.0744839021428755e-09, - "long_answer_loss": 0.0807, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 1231, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0711, - "grad_norm": 1.5, - "learning_rate": 6.876732441110645e-10, - "long_answer_loss": 0.0711, - "loss": 0.0677, - "short_answer_loss": NaN, - "step": 1232, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0797, - "grad_norm": 1.46875, - "learning_rate": 3.868177515173321e-10, - "long_answer_loss": 0.0797, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 1233, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "full_loss": 0.0711, - "grad_norm": 1.4140625, - "learning_rate": 1.7191949328032587e-10, - "long_answer_loss": 0.0711, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 1234, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "full_loss": 0.0733, - "grad_norm": 1.375, - "learning_rate": 4.297994721097487e-11, - "long_answer_loss": 0.0733, - "loss": 0.0645, - "short_answer_loss": NaN, - "step": 1235, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "full_loss": 0.0661, - "grad_norm": 1.65625, - "learning_rate": 0.0, - "long_answer_loss": 0.0661, - "loss": 0.0717, - "short_answer_loss": NaN, - "step": 1236, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "step": 1236, - "total_flos": 9.29924110325121e+17, - "train_loss": 0.12603917728482616, - "train_runtime": 5247.2889, - "train_samples_per_second": 30.175, - "train_steps_per_second": 0.236 - } - ], - "logging_steps": 1.0, - "max_steps": 1236, - "num_input_tokens_seen": 0, - "num_train_epochs": 2, - "save_steps": 1.0, - "total_flos": 9.29924110325121e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}