diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100755--- "a/trainer_state.json" +++ /dev/null @@ -1,28806 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.9996178100515958, - "eval_steps": 25.0, - "global_step": 2616, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "full_loss": 0.609, - "grad_norm": 11.4375, - "learning_rate": 3.1645569620253163e-07, - "long_answer_loss": 0.609, - "loss": 0.5548, - "short_answer_loss": NaN, - "step": 1, - "template_loss": 0.0 - }, - { - "epoch": 0.0, - "full_loss": 0.6229, - "grad_norm": 11.5, - "learning_rate": 6.329113924050633e-07, - "long_answer_loss": 0.6229, - "loss": 0.5756, - "short_answer_loss": NaN, - "step": 2, - "template_loss": 0.0 - }, - { - "epoch": 0.0, - "full_loss": 0.5579, - "grad_norm": 11.6875, - "learning_rate": 9.493670886075951e-07, - "long_answer_loss": 0.5579, - "loss": 0.5652, - "short_answer_loss": NaN, - "step": 3, - "template_loss": 0.0 - }, - { - "epoch": 0.0, - "full_loss": 0.6103, - "grad_norm": 11.625, - "learning_rate": 1.2658227848101265e-06, - "long_answer_loss": 0.6103, - "loss": 0.578, - "short_answer_loss": NaN, - "step": 4, - "template_loss": 0.0 - }, - { - "epoch": 0.0, - "full_loss": 0.5432, - "grad_norm": 11.0, - "learning_rate": 1.5822784810126583e-06, - "long_answer_loss": 0.5432, - "loss": 0.555, - "short_answer_loss": NaN, - "step": 5, - "template_loss": 0.0 - }, - { - "epoch": 0.0, - "full_loss": 0.5516, - "grad_norm": 10.5, - "learning_rate": 1.8987341772151901e-06, - "long_answer_loss": 0.5516, - "loss": 0.5425, - "short_answer_loss": NaN, - "step": 6, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.5406, - "grad_norm": 10.1875, - "learning_rate": 2.2151898734177215e-06, - "long_answer_loss": 0.5406, - "loss": 0.5275, - "short_answer_loss": NaN, - "step": 7, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.516, - "grad_norm": 9.375, - "learning_rate": 2.531645569620253e-06, - "long_answer_loss": 0.516, - "loss": 0.5037, - "short_answer_loss": NaN, - "step": 8, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.439, - "grad_norm": 7.9375, - "learning_rate": 2.848101265822785e-06, - "long_answer_loss": 0.439, - "loss": 0.4711, - "short_answer_loss": NaN, - "step": 9, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.3844, - "grad_norm": 6.4375, - "learning_rate": 3.1645569620253167e-06, - "long_answer_loss": 0.3844, - "loss": 0.4277, - "short_answer_loss": NaN, - "step": 10, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.3622, - "grad_norm": 5.875, - "learning_rate": 3.4810126582278482e-06, - "long_answer_loss": 0.3622, - "loss": 0.3967, - "short_answer_loss": NaN, - "step": 11, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.3716, - "grad_norm": 4.96875, - "learning_rate": 3.7974683544303802e-06, - "long_answer_loss": 0.3716, - "loss": 0.3841, - "short_answer_loss": NaN, - "step": 12, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.3691, - "grad_norm": 5.03125, - "learning_rate": 4.113924050632911e-06, - "long_answer_loss": 0.3691, - "loss": 0.3762, - "short_answer_loss": NaN, - "step": 13, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.3616, - "grad_norm": 4.625, - "learning_rate": 4.430379746835443e-06, - "long_answer_loss": 0.3616, - "loss": 0.3497, - "short_answer_loss": NaN, - "step": 14, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.3443, - "grad_norm": 5.0, - "learning_rate": 4.746835443037975e-06, - "long_answer_loss": 0.3443, - "loss": 0.3311, - "short_answer_loss": NaN, - "step": 15, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.2819, - "grad_norm": 4.625, - "learning_rate": 5.063291139240506e-06, - "long_answer_loss": 0.2819, - "loss": 0.3011, - "short_answer_loss": NaN, - "step": 16, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.2899, - "grad_norm": 4.59375, - "learning_rate": 5.379746835443038e-06, - "long_answer_loss": 0.2899, - "loss": 0.2938, - "short_answer_loss": NaN, - "step": 17, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.258, - "grad_norm": 4.0625, - "learning_rate": 5.69620253164557e-06, - "long_answer_loss": 0.258, - "loss": 0.2691, - "short_answer_loss": NaN, - "step": 18, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.2726, - "grad_norm": 3.6875, - "learning_rate": 6.012658227848101e-06, - "long_answer_loss": 0.2726, - "loss": 0.2777, - "short_answer_loss": NaN, - "step": 19, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.284, - "grad_norm": 2.953125, - "learning_rate": 6.329113924050633e-06, - "long_answer_loss": 0.284, - "loss": 0.2524, - "short_answer_loss": NaN, - "step": 20, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2594, - "grad_norm": 2.765625, - "learning_rate": 6.6455696202531645e-06, - "long_answer_loss": 0.2594, - "loss": 0.2483, - "short_answer_loss": NaN, - "step": 21, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2399, - "grad_norm": 2.703125, - "learning_rate": 6.9620253164556965e-06, - "long_answer_loss": 0.2399, - "loss": 0.2333, - "short_answer_loss": NaN, - "step": 22, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2543, - "grad_norm": 2.75, - "learning_rate": 7.2784810126582285e-06, - "long_answer_loss": 0.2543, - "loss": 0.2374, - "short_answer_loss": NaN, - "step": 23, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2499, - "grad_norm": 2.640625, - "learning_rate": 7.5949367088607605e-06, - "long_answer_loss": 0.2499, - "loss": 0.2296, - "short_answer_loss": NaN, - "step": 24, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2313, - "grad_norm": 2.96875, - "learning_rate": 7.911392405063292e-06, - "long_answer_loss": 0.2313, - "loss": 0.2251, - "short_answer_loss": NaN, - "step": 25, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2233, - "grad_norm": 2.75, - "learning_rate": 8.227848101265822e-06, - "long_answer_loss": 0.2233, - "loss": 0.2148, - "short_answer_loss": NaN, - "step": 26, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2203, - "grad_norm": 2.828125, - "learning_rate": 8.544303797468354e-06, - "long_answer_loss": 0.2203, - "loss": 0.2176, - "short_answer_loss": NaN, - "step": 27, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.1993, - "grad_norm": 2.578125, - "learning_rate": 8.860759493670886e-06, - "long_answer_loss": 0.1993, - "loss": 0.2042, - "short_answer_loss": NaN, - "step": 28, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.197, - "grad_norm": 2.578125, - "learning_rate": 9.177215189873418e-06, - "long_answer_loss": 0.197, - "loss": 0.2116, - "short_answer_loss": NaN, - "step": 29, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.186, - "grad_norm": 2.953125, - "learning_rate": 9.49367088607595e-06, - "long_answer_loss": 0.186, - "loss": 0.2126, - "short_answer_loss": NaN, - "step": 30, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2096, - "grad_norm": 3.09375, - "learning_rate": 9.81012658227848e-06, - "long_answer_loss": 0.2096, - "loss": 0.1996, - "short_answer_loss": NaN, - "step": 31, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2196, - "grad_norm": 2.796875, - "learning_rate": 1.0126582278481012e-05, - "long_answer_loss": 0.2196, - "loss": 0.2065, - "short_answer_loss": NaN, - "step": 32, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1981, - "grad_norm": 2.78125, - "learning_rate": 1.0443037974683544e-05, - "long_answer_loss": 0.1981, - "loss": 0.1855, - "short_answer_loss": NaN, - "step": 33, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.2032, - "grad_norm": 2.546875, - "learning_rate": 1.0759493670886076e-05, - "long_answer_loss": 0.2032, - "loss": 0.1861, - "short_answer_loss": NaN, - "step": 34, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1814, - "grad_norm": 2.6875, - "learning_rate": 1.1075949367088608e-05, - "long_answer_loss": 0.1814, - "loss": 0.1831, - "short_answer_loss": NaN, - "step": 35, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1701, - "grad_norm": 2.421875, - "learning_rate": 1.139240506329114e-05, - "long_answer_loss": 0.1701, - "loss": 0.1845, - "short_answer_loss": NaN, - "step": 36, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1923, - "grad_norm": 2.625, - "learning_rate": 1.170886075949367e-05, - "long_answer_loss": 0.1923, - "loss": 0.1795, - "short_answer_loss": NaN, - "step": 37, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1675, - "grad_norm": 2.421875, - "learning_rate": 1.2025316455696203e-05, - "long_answer_loss": 0.1675, - "loss": 0.1782, - "short_answer_loss": NaN, - "step": 38, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1975, - "grad_norm": 2.46875, - "learning_rate": 1.2341772151898735e-05, - "long_answer_loss": 0.1975, - "loss": 0.1831, - "short_answer_loss": NaN, - "step": 39, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.2041, - "grad_norm": 2.90625, - "learning_rate": 1.2658227848101267e-05, - "long_answer_loss": 0.2041, - "loss": 0.1855, - "short_answer_loss": NaN, - "step": 40, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1467, - "grad_norm": 2.578125, - "learning_rate": 1.2974683544303799e-05, - "long_answer_loss": 0.1467, - "loss": 0.1809, - "short_answer_loss": NaN, - "step": 41, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1754, - "grad_norm": 2.34375, - "learning_rate": 1.3291139240506329e-05, - "long_answer_loss": 0.1754, - "loss": 0.1787, - "short_answer_loss": NaN, - "step": 42, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1669, - "grad_norm": 2.6875, - "learning_rate": 1.3607594936708861e-05, - "long_answer_loss": 0.1669, - "loss": 0.1696, - "short_answer_loss": NaN, - "step": 43, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1715, - "grad_norm": 2.875, - "learning_rate": 1.3924050632911393e-05, - "long_answer_loss": 0.1715, - "loss": 0.1773, - "short_answer_loss": NaN, - "step": 44, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.1737, - "grad_norm": 2.640625, - "learning_rate": 1.4240506329113925e-05, - "long_answer_loss": 0.1737, - "loss": 0.1789, - "short_answer_loss": NaN, - "step": 45, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1641, - "grad_norm": 2.796875, - "learning_rate": 1.4556962025316457e-05, - "long_answer_loss": 0.1641, - "loss": 0.1756, - "short_answer_loss": NaN, - "step": 46, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1743, - "grad_norm": 2.515625, - "learning_rate": 1.4873417721518987e-05, - "long_answer_loss": 0.1743, - "loss": 0.1728, - "short_answer_loss": NaN, - "step": 47, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1921, - "grad_norm": 2.515625, - "learning_rate": 1.5189873417721521e-05, - "long_answer_loss": 0.1921, - "loss": 0.1766, - "short_answer_loss": NaN, - "step": 48, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.2013, - "grad_norm": 2.5, - "learning_rate": 1.550632911392405e-05, - "long_answer_loss": 0.2013, - "loss": 0.1804, - "short_answer_loss": NaN, - "step": 49, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1937, - "grad_norm": 2.703125, - "learning_rate": 1.5822784810126583e-05, - "long_answer_loss": 0.1937, - "loss": 0.1868, - "short_answer_loss": NaN, - "step": 50, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1877, - "grad_norm": 2.75, - "learning_rate": 1.6139240506329115e-05, - "long_answer_loss": 0.1877, - "loss": 0.1723, - "short_answer_loss": NaN, - "step": 51, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1664, - "grad_norm": 2.640625, - "learning_rate": 1.6455696202531644e-05, - "long_answer_loss": 0.1664, - "loss": 0.172, - "short_answer_loss": NaN, - "step": 52, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1944, - "grad_norm": 2.625, - "learning_rate": 1.677215189873418e-05, - "long_answer_loss": 0.1944, - "loss": 0.1713, - "short_answer_loss": NaN, - "step": 53, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1679, - "grad_norm": 2.640625, - "learning_rate": 1.7088607594936708e-05, - "long_answer_loss": 0.1679, - "loss": 0.1673, - "short_answer_loss": NaN, - "step": 54, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1657, - "grad_norm": 2.5625, - "learning_rate": 1.7405063291139243e-05, - "long_answer_loss": 0.1657, - "loss": 0.1642, - "short_answer_loss": NaN, - "step": 55, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.177, - "grad_norm": 2.609375, - "learning_rate": 1.7721518987341772e-05, - "long_answer_loss": 0.177, - "loss": 0.1739, - "short_answer_loss": NaN, - "step": 56, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1579, - "grad_norm": 2.703125, - "learning_rate": 1.8037974683544304e-05, - "long_answer_loss": 0.1579, - "loss": 0.1722, - "short_answer_loss": NaN, - "step": 57, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1785, - "grad_norm": 2.46875, - "learning_rate": 1.8354430379746836e-05, - "long_answer_loss": 0.1785, - "loss": 0.1668, - "short_answer_loss": NaN, - "step": 58, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1544, - "grad_norm": 2.734375, - "learning_rate": 1.8670886075949368e-05, - "long_answer_loss": 0.1544, - "loss": 0.173, - "short_answer_loss": NaN, - "step": 59, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1487, - "grad_norm": 2.859375, - "learning_rate": 1.89873417721519e-05, - "long_answer_loss": 0.1487, - "loss": 0.1755, - "short_answer_loss": NaN, - "step": 60, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1978, - "grad_norm": 2.75, - "learning_rate": 1.9303797468354432e-05, - "long_answer_loss": 0.1978, - "loss": 0.1704, - "short_answer_loss": NaN, - "step": 61, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1614, - "grad_norm": 2.75, - "learning_rate": 1.962025316455696e-05, - "long_answer_loss": 0.1614, - "loss": 0.1757, - "short_answer_loss": NaN, - "step": 62, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1567, - "grad_norm": 2.5, - "learning_rate": 1.9936708860759496e-05, - "long_answer_loss": 0.1567, - "loss": 0.1687, - "short_answer_loss": NaN, - "step": 63, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1606, - "grad_norm": 2.453125, - "learning_rate": 2.0253164556962025e-05, - "long_answer_loss": 0.1606, - "loss": 0.1682, - "short_answer_loss": NaN, - "step": 64, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1524, - "grad_norm": 2.734375, - "learning_rate": 2.056962025316456e-05, - "long_answer_loss": 0.1524, - "loss": 0.1682, - "short_answer_loss": NaN, - "step": 65, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1714, - "grad_norm": 2.765625, - "learning_rate": 2.088607594936709e-05, - "long_answer_loss": 0.1714, - "loss": 0.1698, - "short_answer_loss": NaN, - "step": 66, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1756, - "grad_norm": 2.859375, - "learning_rate": 2.120253164556962e-05, - "long_answer_loss": 0.1756, - "loss": 0.1721, - "short_answer_loss": NaN, - "step": 67, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1587, - "grad_norm": 2.671875, - "learning_rate": 2.1518987341772153e-05, - "long_answer_loss": 0.1587, - "loss": 0.1768, - "short_answer_loss": NaN, - "step": 68, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1785, - "grad_norm": 3.3125, - "learning_rate": 2.1835443037974685e-05, - "long_answer_loss": 0.1785, - "loss": 0.1778, - "short_answer_loss": NaN, - "step": 69, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1619, - "grad_norm": 2.71875, - "learning_rate": 2.2151898734177217e-05, - "long_answer_loss": 0.1619, - "loss": 0.1693, - "short_answer_loss": NaN, - "step": 70, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.1524, - "grad_norm": 2.796875, - "learning_rate": 2.246835443037975e-05, - "long_answer_loss": 0.1524, - "loss": 0.1772, - "short_answer_loss": NaN, - "step": 71, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1541, - "grad_norm": 2.59375, - "learning_rate": 2.278481012658228e-05, - "long_answer_loss": 0.1541, - "loss": 0.1693, - "short_answer_loss": NaN, - "step": 72, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1781, - "grad_norm": 2.984375, - "learning_rate": 2.3101265822784813e-05, - "long_answer_loss": 0.1781, - "loss": 0.1724, - "short_answer_loss": NaN, - "step": 73, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1583, - "grad_norm": 2.59375, - "learning_rate": 2.341772151898734e-05, - "long_answer_loss": 0.1583, - "loss": 0.1792, - "short_answer_loss": NaN, - "step": 74, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1577, - "grad_norm": 2.671875, - "learning_rate": 2.3734177215189873e-05, - "long_answer_loss": 0.1577, - "loss": 0.171, - "short_answer_loss": NaN, - "step": 75, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1753, - "grad_norm": 2.515625, - "learning_rate": 2.4050632911392405e-05, - "long_answer_loss": 0.1753, - "loss": 0.1788, - "short_answer_loss": NaN, - "step": 76, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1595, - "grad_norm": 2.859375, - "learning_rate": 2.4367088607594937e-05, - "long_answer_loss": 0.1595, - "loss": 0.1776, - "short_answer_loss": NaN, - "step": 77, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.2065, - "grad_norm": 2.578125, - "learning_rate": 2.468354430379747e-05, - "long_answer_loss": 0.2065, - "loss": 0.1759, - "short_answer_loss": NaN, - "step": 78, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1924, - "grad_norm": 2.671875, - "learning_rate": 2.5e-05, - "long_answer_loss": 0.1924, - "loss": 0.1815, - "short_answer_loss": NaN, - "step": 79, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.17, - "grad_norm": 2.5625, - "learning_rate": 2.4999990416177256e-05, - "long_answer_loss": 0.17, - "loss": 0.1732, - "short_answer_loss": NaN, - "step": 80, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1657, - "grad_norm": 2.359375, - "learning_rate": 2.4999961664723716e-05, - "long_answer_loss": 0.1657, - "loss": 0.1737, - "short_answer_loss": NaN, - "step": 81, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1976, - "grad_norm": 2.796875, - "learning_rate": 2.4999913745683463e-05, - "long_answer_loss": 0.1976, - "loss": 0.1828, - "short_answer_loss": NaN, - "step": 82, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1684, - "grad_norm": 2.59375, - "learning_rate": 2.4999846659129984e-05, - "long_answer_loss": 0.1684, - "loss": 0.1828, - "short_answer_loss": NaN, - "step": 83, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1616, - "grad_norm": 2.484375, - "learning_rate": 2.4999760405166147e-05, - "long_answer_loss": 0.1616, - "loss": 0.1823, - "short_answer_loss": NaN, - "step": 84, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1942, - "grad_norm": 2.734375, - "learning_rate": 2.4999654983924213e-05, - "long_answer_loss": 0.1942, - "loss": 0.1864, - "short_answer_loss": NaN, - "step": 85, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1786, - "grad_norm": 2.59375, - "learning_rate": 2.499953039556584e-05, - "long_answer_loss": 0.1786, - "loss": 0.1769, - "short_answer_loss": NaN, - "step": 86, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1749, - "grad_norm": 2.296875, - "learning_rate": 2.4999386640282073e-05, - "long_answer_loss": 0.1749, - "loss": 0.172, - "short_answer_loss": NaN, - "step": 87, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1575, - "grad_norm": 2.34375, - "learning_rate": 2.4999223718293347e-05, - "long_answer_loss": 0.1575, - "loss": 0.1861, - "short_answer_loss": NaN, - "step": 88, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1993, - "grad_norm": 2.421875, - "learning_rate": 2.4999041629849486e-05, - "long_answer_loss": 0.1993, - "loss": 0.184, - "short_answer_loss": NaN, - "step": 89, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1679, - "grad_norm": 2.296875, - "learning_rate": 2.4998840375229712e-05, - "long_answer_loss": 0.1679, - "loss": 0.1804, - "short_answer_loss": NaN, - "step": 90, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1924, - "grad_norm": 2.28125, - "learning_rate": 2.4998619954742626e-05, - "long_answer_loss": 0.1924, - "loss": 0.1807, - "short_answer_loss": NaN, - "step": 91, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1604, - "grad_norm": 2.140625, - "learning_rate": 2.4998380368726225e-05, - "long_answer_loss": 0.1604, - "loss": 0.1798, - "short_answer_loss": NaN, - "step": 92, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1709, - "grad_norm": 2.359375, - "learning_rate": 2.4998121617547894e-05, - "long_answer_loss": 0.1709, - "loss": 0.1863, - "short_answer_loss": NaN, - "step": 93, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.2032, - "grad_norm": 2.421875, - "learning_rate": 2.4997843701604404e-05, - "long_answer_loss": 0.2032, - "loss": 0.1787, - "short_answer_loss": NaN, - "step": 94, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1891, - "grad_norm": 2.25, - "learning_rate": 2.4997546621321914e-05, - "long_answer_loss": 0.1891, - "loss": 0.1738, - "short_answer_loss": NaN, - "step": 95, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1959, - "grad_norm": 2.71875, - "learning_rate": 2.4997230377155972e-05, - "long_answer_loss": 0.1959, - "loss": 0.1872, - "short_answer_loss": NaN, - "step": 96, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1873, - "grad_norm": 2.484375, - "learning_rate": 2.499689496959151e-05, - "long_answer_loss": 0.1873, - "loss": 0.1889, - "short_answer_loss": NaN, - "step": 97, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1609, - "grad_norm": 2.1875, - "learning_rate": 2.499654039914285e-05, - "long_answer_loss": 0.1609, - "loss": 0.1775, - "short_answer_loss": NaN, - "step": 98, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.172, - "grad_norm": 2.6875, - "learning_rate": 2.499616666635368e-05, - "long_answer_loss": 0.172, - "loss": 0.1797, - "short_answer_loss": NaN, - "step": 99, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1783, - "grad_norm": 2.234375, - "learning_rate": 2.4995773771797104e-05, - "long_answer_loss": 0.1783, - "loss": 0.1775, - "short_answer_loss": NaN, - "step": 100, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1639, - "grad_norm": 2.171875, - "learning_rate": 2.4995361716075583e-05, - "long_answer_loss": 0.1639, - "loss": 0.1703, - "short_answer_loss": NaN, - "step": 101, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1762, - "grad_norm": 2.4375, - "learning_rate": 2.4994930499820965e-05, - "long_answer_loss": 0.1762, - "loss": 0.1865, - "short_answer_loss": NaN, - "step": 102, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1901, - "grad_norm": 2.390625, - "learning_rate": 2.4994480123694486e-05, - "long_answer_loss": 0.1901, - "loss": 0.1906, - "short_answer_loss": NaN, - "step": 103, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1734, - "grad_norm": 2.5, - "learning_rate": 2.4994010588386757e-05, - "long_answer_loss": 0.1734, - "loss": 0.1868, - "short_answer_loss": NaN, - "step": 104, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1668, - "grad_norm": 2.8125, - "learning_rate": 2.4993521894617772e-05, - "long_answer_loss": 0.1668, - "loss": 0.1804, - "short_answer_loss": NaN, - "step": 105, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1931, - "grad_norm": 2.390625, - "learning_rate": 2.49930140431369e-05, - "long_answer_loss": 0.1931, - "loss": 0.1893, - "short_answer_loss": NaN, - "step": 106, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1763, - "grad_norm": 2.359375, - "learning_rate": 2.4992487034722875e-05, - "long_answer_loss": 0.1763, - "loss": 0.187, - "short_answer_loss": NaN, - "step": 107, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.2037, - "grad_norm": 2.390625, - "learning_rate": 2.499194087018383e-05, - "long_answer_loss": 0.2037, - "loss": 0.1968, - "short_answer_loss": NaN, - "step": 108, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1873, - "grad_norm": 2.171875, - "learning_rate": 2.4991375550357253e-05, - "long_answer_loss": 0.1873, - "loss": 0.1832, - "short_answer_loss": NaN, - "step": 109, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.2097, - "grad_norm": 2.265625, - "learning_rate": 2.499079107611002e-05, - "long_answer_loss": 0.2097, - "loss": 0.1836, - "short_answer_loss": NaN, - "step": 110, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1797, - "grad_norm": 2.265625, - "learning_rate": 2.4990187448338365e-05, - "long_answer_loss": 0.1797, - "loss": 0.1818, - "short_answer_loss": NaN, - "step": 111, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1816, - "grad_norm": 2.3125, - "learning_rate": 2.4989564667967902e-05, - "long_answer_loss": 0.1816, - "loss": 0.188, - "short_answer_loss": NaN, - "step": 112, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1719, - "grad_norm": 2.140625, - "learning_rate": 2.4988922735953603e-05, - "long_answer_loss": 0.1719, - "loss": 0.1794, - "short_answer_loss": NaN, - "step": 113, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1852, - "grad_norm": 2.140625, - "learning_rate": 2.4988261653279815e-05, - "long_answer_loss": 0.1852, - "loss": 0.1769, - "short_answer_loss": NaN, - "step": 114, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.2302, - "grad_norm": 2.109375, - "learning_rate": 2.4987581420960253e-05, - "long_answer_loss": 0.2302, - "loss": 0.1863, - "short_answer_loss": NaN, - "step": 115, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.2028, - "grad_norm": 2.1875, - "learning_rate": 2.4986882040037994e-05, - "long_answer_loss": 0.2028, - "loss": 0.1779, - "short_answer_loss": NaN, - "step": 116, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1978, - "grad_norm": 2.28125, - "learning_rate": 2.4986163511585474e-05, - "long_answer_loss": 0.1978, - "loss": 0.1776, - "short_answer_loss": NaN, - "step": 117, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1804, - "grad_norm": 2.1875, - "learning_rate": 2.49854258367045e-05, - "long_answer_loss": 0.1804, - "loss": 0.1817, - "short_answer_loss": NaN, - "step": 118, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1766, - "grad_norm": 2.078125, - "learning_rate": 2.498466901652622e-05, - "long_answer_loss": 0.1766, - "loss": 0.1877, - "short_answer_loss": NaN, - "step": 119, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1824, - "grad_norm": 2.3125, - "learning_rate": 2.498389305221116e-05, - "long_answer_loss": 0.1824, - "loss": 0.1831, - "short_answer_loss": NaN, - "step": 120, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1967, - "grad_norm": 2.140625, - "learning_rate": 2.4983097944949187e-05, - "long_answer_loss": 0.1967, - "loss": 0.1757, - "short_answer_loss": NaN, - "step": 121, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1834, - "grad_norm": 2.171875, - "learning_rate": 2.4982283695959525e-05, - "long_answer_loss": 0.1834, - "loss": 0.1771, - "short_answer_loss": NaN, - "step": 122, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1605, - "grad_norm": 2.15625, - "learning_rate": 2.4981450306490762e-05, - "long_answer_loss": 0.1605, - "loss": 0.1849, - "short_answer_loss": NaN, - "step": 123, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.175, - "grad_norm": 2.203125, - "learning_rate": 2.4980597777820826e-05, - "long_answer_loss": 0.175, - "loss": 0.1816, - "short_answer_loss": NaN, - "step": 124, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1695, - "grad_norm": 2.296875, - "learning_rate": 2.4979726111256983e-05, - "long_answer_loss": 0.1695, - "loss": 0.1809, - "short_answer_loss": NaN, - "step": 125, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1986, - "grad_norm": 2.3125, - "learning_rate": 2.4978835308135873e-05, - "long_answer_loss": 0.1986, - "loss": 0.1796, - "short_answer_loss": NaN, - "step": 126, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1911, - "grad_norm": 2.375, - "learning_rate": 2.497792536982345e-05, - "long_answer_loss": 0.1911, - "loss": 0.1875, - "short_answer_loss": NaN, - "step": 127, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1701, - "grad_norm": 2.390625, - "learning_rate": 2.4976996297715033e-05, - "long_answer_loss": 0.1701, - "loss": 0.1884, - "short_answer_loss": NaN, - "step": 128, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1756, - "grad_norm": 1.9609375, - "learning_rate": 2.4976048093235265e-05, - "long_answer_loss": 0.1756, - "loss": 0.1789, - "short_answer_loss": NaN, - "step": 129, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1673, - "grad_norm": 2.0625, - "learning_rate": 2.4975080757838145e-05, - "long_answer_loss": 0.1673, - "loss": 0.1807, - "short_answer_loss": NaN, - "step": 130, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1738, - "grad_norm": 1.96875, - "learning_rate": 2.497409429300698e-05, - "long_answer_loss": 0.1738, - "loss": 0.1792, - "short_answer_loss": NaN, - "step": 131, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1517, - "grad_norm": 1.8671875, - "learning_rate": 2.4973088700254437e-05, - "long_answer_loss": 0.1517, - "loss": 0.1699, - "short_answer_loss": NaN, - "step": 132, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1914, - "grad_norm": 2.125, - "learning_rate": 2.4972063981122508e-05, - "long_answer_loss": 0.1914, - "loss": 0.1816, - "short_answer_loss": NaN, - "step": 133, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1903, - "grad_norm": 1.90625, - "learning_rate": 2.4971020137182498e-05, - "long_answer_loss": 0.1903, - "loss": 0.1752, - "short_answer_loss": NaN, - "step": 134, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.2153, - "grad_norm": 2.125, - "learning_rate": 2.4969957170035056e-05, - "long_answer_loss": 0.2153, - "loss": 0.1838, - "short_answer_loss": NaN, - "step": 135, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1954, - "grad_norm": 2.046875, - "learning_rate": 2.4968875081310148e-05, - "long_answer_loss": 0.1954, - "loss": 0.1849, - "short_answer_loss": NaN, - "step": 136, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.1911, - "grad_norm": 1.921875, - "learning_rate": 2.4967773872667062e-05, - "long_answer_loss": 0.1911, - "loss": 0.1807, - "short_answer_loss": NaN, - "step": 137, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.2322, - "grad_norm": 2.1875, - "learning_rate": 2.4966653545794398e-05, - "long_answer_loss": 0.2322, - "loss": 0.1954, - "short_answer_loss": NaN, - "step": 138, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1962, - "grad_norm": 2.140625, - "learning_rate": 2.4965514102410083e-05, - "long_answer_loss": 0.1962, - "loss": 0.1933, - "short_answer_loss": NaN, - "step": 139, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1697, - "grad_norm": 2.078125, - "learning_rate": 2.4964355544261357e-05, - "long_answer_loss": 0.1697, - "loss": 0.1816, - "short_answer_loss": NaN, - "step": 140, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.2178, - "grad_norm": 2.03125, - "learning_rate": 2.496317787312476e-05, - "long_answer_loss": 0.2178, - "loss": 0.1807, - "short_answer_loss": NaN, - "step": 141, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1866, - "grad_norm": 2.015625, - "learning_rate": 2.4961981090806147e-05, - "long_answer_loss": 0.1866, - "loss": 0.1857, - "short_answer_loss": NaN, - "step": 142, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1596, - "grad_norm": 1.9453125, - "learning_rate": 2.4960765199140682e-05, - "long_answer_loss": 0.1596, - "loss": 0.18, - "short_answer_loss": NaN, - "step": 143, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.2018, - "grad_norm": 1.9765625, - "learning_rate": 2.4959530199992826e-05, - "long_answer_loss": 0.2018, - "loss": 0.1838, - "short_answer_loss": NaN, - "step": 144, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1635, - "grad_norm": 2.078125, - "learning_rate": 2.4958276095256335e-05, - "long_answer_loss": 0.1635, - "loss": 0.1834, - "short_answer_loss": NaN, - "step": 145, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.2144, - "grad_norm": 2.046875, - "learning_rate": 2.4957002886854277e-05, - "long_answer_loss": 0.2144, - "loss": 0.1835, - "short_answer_loss": NaN, - "step": 146, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1922, - "grad_norm": 2.09375, - "learning_rate": 2.4955710576739e-05, - "long_answer_loss": 0.1922, - "loss": 0.1844, - "short_answer_loss": NaN, - "step": 147, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1819, - "grad_norm": 1.8984375, - "learning_rate": 2.4954399166892152e-05, - "long_answer_loss": 0.1819, - "loss": 0.1708, - "short_answer_loss": NaN, - "step": 148, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.2108, - "grad_norm": 1.9453125, - "learning_rate": 2.495306865932465e-05, - "long_answer_loss": 0.2108, - "loss": 0.1799, - "short_answer_loss": NaN, - "step": 149, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1585, - "grad_norm": 1.9375, - "learning_rate": 2.4951719056076728e-05, - "long_answer_loss": 0.1585, - "loss": 0.1703, - "short_answer_loss": NaN, - "step": 150, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1981, - "grad_norm": 2.015625, - "learning_rate": 2.495035035921787e-05, - "long_answer_loss": 0.1981, - "loss": 0.1838, - "short_answer_loss": NaN, - "step": 151, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1811, - "grad_norm": 2.015625, - "learning_rate": 2.4948962570846864e-05, - "long_answer_loss": 0.1811, - "loss": 0.181, - "short_answer_loss": NaN, - "step": 152, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1662, - "grad_norm": 1.875, - "learning_rate": 2.494755569309175e-05, - "long_answer_loss": 0.1662, - "loss": 0.1741, - "short_answer_loss": NaN, - "step": 153, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1624, - "grad_norm": 2.03125, - "learning_rate": 2.4946129728109854e-05, - "long_answer_loss": 0.1624, - "loss": 0.1794, - "short_answer_loss": NaN, - "step": 154, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1687, - "grad_norm": 1.859375, - "learning_rate": 2.494468467808777e-05, - "long_answer_loss": 0.1687, - "loss": 0.1754, - "short_answer_loss": NaN, - "step": 155, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1637, - "grad_norm": 1.90625, - "learning_rate": 2.4943220545241346e-05, - "long_answer_loss": 0.1637, - "loss": 0.1802, - "short_answer_loss": NaN, - "step": 156, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1815, - "grad_norm": 1.84375, - "learning_rate": 2.494173733181571e-05, - "long_answer_loss": 0.1815, - "loss": 0.1806, - "short_answer_loss": NaN, - "step": 157, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1729, - "grad_norm": 1.984375, - "learning_rate": 2.4940235040085243e-05, - "long_answer_loss": 0.1729, - "loss": 0.1866, - "short_answer_loss": NaN, - "step": 158, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1794, - "grad_norm": 1.90625, - "learning_rate": 2.493871367235356e-05, - "long_answer_loss": 0.1794, - "loss": 0.1808, - "short_answer_loss": NaN, - "step": 159, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1691, - "grad_norm": 1.96875, - "learning_rate": 2.4937173230953554e-05, - "long_answer_loss": 0.1691, - "loss": 0.1772, - "short_answer_loss": NaN, - "step": 160, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1698, - "grad_norm": 1.90625, - "learning_rate": 2.493561371824736e-05, - "long_answer_loss": 0.1698, - "loss": 0.19, - "short_answer_loss": NaN, - "step": 161, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1616, - "grad_norm": 1.8828125, - "learning_rate": 2.4934035136626338e-05, - "long_answer_loss": 0.1616, - "loss": 0.1851, - "short_answer_loss": NaN, - "step": 162, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1754, - "grad_norm": 1.9921875, - "learning_rate": 2.493243748851112e-05, - "long_answer_loss": 0.1754, - "loss": 0.1789, - "short_answer_loss": NaN, - "step": 163, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1766, - "grad_norm": 2.015625, - "learning_rate": 2.4930820776351548e-05, - "long_answer_loss": 0.1766, - "loss": 0.1832, - "short_answer_loss": NaN, - "step": 164, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1541, - "grad_norm": 1.984375, - "learning_rate": 2.4929185002626714e-05, - "long_answer_loss": 0.1541, - "loss": 0.1781, - "short_answer_loss": NaN, - "step": 165, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.178, - "grad_norm": 2.046875, - "learning_rate": 2.492753016984493e-05, - "long_answer_loss": 0.178, - "loss": 0.1802, - "short_answer_loss": NaN, - "step": 166, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1843, - "grad_norm": 1.7890625, - "learning_rate": 2.492585628054373e-05, - "long_answer_loss": 0.1843, - "loss": 0.1885, - "short_answer_loss": NaN, - "step": 167, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1808, - "grad_norm": 2.0, - "learning_rate": 2.4924163337289885e-05, - "long_answer_loss": 0.1808, - "loss": 0.1821, - "short_answer_loss": NaN, - "step": 168, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.2042, - "grad_norm": 1.875, - "learning_rate": 2.4922451342679366e-05, - "long_answer_loss": 0.2042, - "loss": 0.1784, - "short_answer_loss": NaN, - "step": 169, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1893, - "grad_norm": 1.953125, - "learning_rate": 2.492072029933737e-05, - "long_answer_loss": 0.1893, - "loss": 0.1797, - "short_answer_loss": NaN, - "step": 170, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1749, - "grad_norm": 1.8359375, - "learning_rate": 2.4918970209918296e-05, - "long_answer_loss": 0.1749, - "loss": 0.1706, - "short_answer_loss": NaN, - "step": 171, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1807, - "grad_norm": 1.921875, - "learning_rate": 2.4917201077105757e-05, - "long_answer_loss": 0.1807, - "loss": 0.1828, - "short_answer_loss": NaN, - "step": 172, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1641, - "grad_norm": 1.90625, - "learning_rate": 2.4915412903612554e-05, - "long_answer_loss": 0.1641, - "loss": 0.1846, - "short_answer_loss": NaN, - "step": 173, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1789, - "grad_norm": 1.9453125, - "learning_rate": 2.4913605692180696e-05, - "long_answer_loss": 0.1789, - "loss": 0.1846, - "short_answer_loss": NaN, - "step": 174, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1687, - "grad_norm": 1.8125, - "learning_rate": 2.4911779445581384e-05, - "long_answer_loss": 0.1687, - "loss": 0.1882, - "short_answer_loss": NaN, - "step": 175, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.181, - "grad_norm": 1.9296875, - "learning_rate": 2.4909934166615006e-05, - "long_answer_loss": 0.181, - "loss": 0.1751, - "short_answer_loss": NaN, - "step": 176, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1749, - "grad_norm": 2.0, - "learning_rate": 2.4908069858111133e-05, - "long_answer_loss": 0.1749, - "loss": 0.1779, - "short_answer_loss": NaN, - "step": 177, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1906, - "grad_norm": 2.171875, - "learning_rate": 2.4906186522928516e-05, - "long_answer_loss": 0.1906, - "loss": 0.1921, - "short_answer_loss": NaN, - "step": 178, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1641, - "grad_norm": 1.953125, - "learning_rate": 2.490428416395509e-05, - "long_answer_loss": 0.1641, - "loss": 0.1869, - "short_answer_loss": NaN, - "step": 179, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1781, - "grad_norm": 2.125, - "learning_rate": 2.490236278410794e-05, - "long_answer_loss": 0.1781, - "loss": 0.1886, - "short_answer_loss": NaN, - "step": 180, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1526, - "grad_norm": 2.046875, - "learning_rate": 2.490042238633335e-05, - "long_answer_loss": 0.1526, - "loss": 0.1808, - "short_answer_loss": NaN, - "step": 181, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1651, - "grad_norm": 1.8515625, - "learning_rate": 2.4898462973606736e-05, - "long_answer_loss": 0.1651, - "loss": 0.1718, - "short_answer_loss": NaN, - "step": 182, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1868, - "grad_norm": 1.953125, - "learning_rate": 2.4896484548932686e-05, - "long_answer_loss": 0.1868, - "loss": 0.1772, - "short_answer_loss": NaN, - "step": 183, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.175, - "grad_norm": 1.78125, - "learning_rate": 2.489448711534494e-05, - "long_answer_loss": 0.175, - "loss": 0.1753, - "short_answer_loss": NaN, - "step": 184, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.2003, - "grad_norm": 1.8515625, - "learning_rate": 2.4892470675906394e-05, - "long_answer_loss": 0.2003, - "loss": 0.1866, - "short_answer_loss": NaN, - "step": 185, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1635, - "grad_norm": 2.015625, - "learning_rate": 2.4890435233709066e-05, - "long_answer_loss": 0.1635, - "loss": 0.1782, - "short_answer_loss": NaN, - "step": 186, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1667, - "grad_norm": 1.8203125, - "learning_rate": 2.4888380791874137e-05, - "long_answer_loss": 0.1667, - "loss": 0.1785, - "short_answer_loss": NaN, - "step": 187, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1869, - "grad_norm": 2.125, - "learning_rate": 2.4886307353551906e-05, - "long_answer_loss": 0.1869, - "loss": 0.1782, - "short_answer_loss": NaN, - "step": 188, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1848, - "grad_norm": 1.84375, - "learning_rate": 2.4884214921921813e-05, - "long_answer_loss": 0.1848, - "loss": 0.1631, - "short_answer_loss": NaN, - "step": 189, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1732, - "grad_norm": 1.859375, - "learning_rate": 2.4882103500192415e-05, - "long_answer_loss": 0.1732, - "loss": 0.1824, - "short_answer_loss": NaN, - "step": 190, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.2009, - "grad_norm": 2.015625, - "learning_rate": 2.4879973091601387e-05, - "long_answer_loss": 0.2009, - "loss": 0.1801, - "short_answer_loss": NaN, - "step": 191, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1753, - "grad_norm": 1.9296875, - "learning_rate": 2.487782369941553e-05, - "long_answer_loss": 0.1753, - "loss": 0.1782, - "short_answer_loss": NaN, - "step": 192, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1967, - "grad_norm": 2.046875, - "learning_rate": 2.4875655326930736e-05, - "long_answer_loss": 0.1967, - "loss": 0.1889, - "short_answer_loss": NaN, - "step": 193, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.2104, - "grad_norm": 1.90625, - "learning_rate": 2.4873467977472025e-05, - "long_answer_loss": 0.2104, - "loss": 0.1812, - "short_answer_loss": NaN, - "step": 194, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1992, - "grad_norm": 1.7265625, - "learning_rate": 2.487126165439349e-05, - "long_answer_loss": 0.1992, - "loss": 0.1791, - "short_answer_loss": NaN, - "step": 195, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1856, - "grad_norm": 2.375, - "learning_rate": 2.4869036361078345e-05, - "long_answer_loss": 0.1856, - "loss": 0.1859, - "short_answer_loss": NaN, - "step": 196, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1755, - "grad_norm": 1.875, - "learning_rate": 2.486679210093888e-05, - "long_answer_loss": 0.1755, - "loss": 0.169, - "short_answer_loss": NaN, - "step": 197, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.183, - "grad_norm": 1.78125, - "learning_rate": 2.486452887741646e-05, - "long_answer_loss": 0.183, - "loss": 0.1848, - "short_answer_loss": NaN, - "step": 198, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1781, - "grad_norm": 2.03125, - "learning_rate": 2.4862246693981544e-05, - "long_answer_loss": 0.1781, - "loss": 0.1751, - "short_answer_loss": NaN, - "step": 199, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1652, - "grad_norm": 1.8046875, - "learning_rate": 2.4859945554133662e-05, - "long_answer_loss": 0.1652, - "loss": 0.1812, - "short_answer_loss": NaN, - "step": 200, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.177, - "grad_norm": 1.7421875, - "learning_rate": 2.4857625461401404e-05, - "long_answer_loss": 0.177, - "loss": 0.1734, - "short_answer_loss": NaN, - "step": 201, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.196, - "grad_norm": 1.875, - "learning_rate": 2.4855286419342428e-05, - "long_answer_loss": 0.196, - "loss": 0.1882, - "short_answer_loss": NaN, - "step": 202, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.2024, - "grad_norm": 1.7421875, - "learning_rate": 2.485292843154345e-05, - "long_answer_loss": 0.2024, - "loss": 0.181, - "short_answer_loss": NaN, - "step": 203, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1584, - "grad_norm": 1.6796875, - "learning_rate": 2.4850551501620235e-05, - "long_answer_loss": 0.1584, - "loss": 0.1746, - "short_answer_loss": NaN, - "step": 204, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.179, - "grad_norm": 1.7421875, - "learning_rate": 2.484815563321759e-05, - "long_answer_loss": 0.179, - "loss": 0.1755, - "short_answer_loss": NaN, - "step": 205, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1756, - "grad_norm": 1.9765625, - "learning_rate": 2.484574083000938e-05, - "long_answer_loss": 0.1756, - "loss": 0.1772, - "short_answer_loss": NaN, - "step": 206, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1868, - "grad_norm": 1.7890625, - "learning_rate": 2.4843307095698476e-05, - "long_answer_loss": 0.1868, - "loss": 0.1763, - "short_answer_loss": NaN, - "step": 207, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1768, - "grad_norm": 1.6328125, - "learning_rate": 2.4840854434016808e-05, - "long_answer_loss": 0.1768, - "loss": 0.1763, - "short_answer_loss": NaN, - "step": 208, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.2088, - "grad_norm": 1.75, - "learning_rate": 2.4838382848725312e-05, - "long_answer_loss": 0.2088, - "loss": 0.1811, - "short_answer_loss": NaN, - "step": 209, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1809, - "grad_norm": 1.7578125, - "learning_rate": 2.4835892343613943e-05, - "long_answer_loss": 0.1809, - "loss": 0.178, - "short_answer_loss": NaN, - "step": 210, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.158, - "grad_norm": 1.78125, - "learning_rate": 2.4833382922501668e-05, - "long_answer_loss": 0.158, - "loss": 0.1679, - "short_answer_loss": NaN, - "step": 211, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1787, - "grad_norm": 1.8125, - "learning_rate": 2.4830854589236475e-05, - "long_answer_loss": 0.1787, - "loss": 0.1809, - "short_answer_loss": NaN, - "step": 212, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1763, - "grad_norm": 1.7890625, - "learning_rate": 2.4828307347695326e-05, - "long_answer_loss": 0.1763, - "loss": 0.1729, - "short_answer_loss": NaN, - "step": 213, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1905, - "grad_norm": 1.8203125, - "learning_rate": 2.4825741201784198e-05, - "long_answer_loss": 0.1905, - "loss": 0.1758, - "short_answer_loss": NaN, - "step": 214, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1986, - "grad_norm": 1.9921875, - "learning_rate": 2.482315615543805e-05, - "long_answer_loss": 0.1986, - "loss": 0.178, - "short_answer_loss": NaN, - "step": 215, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1832, - "grad_norm": 1.71875, - "learning_rate": 2.482055221262081e-05, - "long_answer_loss": 0.1832, - "loss": 0.1813, - "short_answer_loss": NaN, - "step": 216, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1984, - "grad_norm": 1.90625, - "learning_rate": 2.4817929377325413e-05, - "long_answer_loss": 0.1984, - "loss": 0.179, - "short_answer_loss": NaN, - "step": 217, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.2032, - "grad_norm": 1.625, - "learning_rate": 2.4815287653573733e-05, - "long_answer_loss": 0.2032, - "loss": 0.1802, - "short_answer_loss": NaN, - "step": 218, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1822, - "grad_norm": 1.7734375, - "learning_rate": 2.4812627045416623e-05, - "long_answer_loss": 0.1822, - "loss": 0.182, - "short_answer_loss": NaN, - "step": 219, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1751, - "grad_norm": 1.65625, - "learning_rate": 2.4809947556933886e-05, - "long_answer_loss": 0.1751, - "loss": 0.177, - "short_answer_loss": NaN, - "step": 220, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.158, - "grad_norm": 1.6484375, - "learning_rate": 2.4807249192234293e-05, - "long_answer_loss": 0.158, - "loss": 0.1713, - "short_answer_loss": NaN, - "step": 221, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1628, - "grad_norm": 1.625, - "learning_rate": 2.4804531955455534e-05, - "long_answer_loss": 0.1628, - "loss": 0.1689, - "short_answer_loss": NaN, - "step": 222, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.207, - "grad_norm": 1.6796875, - "learning_rate": 2.480179585076426e-05, - "long_answer_loss": 0.207, - "loss": 0.1757, - "short_answer_loss": NaN, - "step": 223, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1636, - "grad_norm": 2.0, - "learning_rate": 2.4799040882356044e-05, - "long_answer_loss": 0.1636, - "loss": 0.1736, - "short_answer_loss": NaN, - "step": 224, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1432, - "grad_norm": 1.9375, - "learning_rate": 2.4796267054455384e-05, - "long_answer_loss": 0.1432, - "loss": 0.1869, - "short_answer_loss": NaN, - "step": 225, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1877, - "grad_norm": 1.9609375, - "learning_rate": 2.47934743713157e-05, - "long_answer_loss": 0.1877, - "loss": 0.1771, - "short_answer_loss": NaN, - "step": 226, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1752, - "grad_norm": 1.8203125, - "learning_rate": 2.479066283721933e-05, - "long_answer_loss": 0.1752, - "loss": 0.18, - "short_answer_loss": NaN, - "step": 227, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1989, - "grad_norm": 1.8046875, - "learning_rate": 2.478783245647751e-05, - "long_answer_loss": 0.1989, - "loss": 0.1752, - "short_answer_loss": NaN, - "step": 228, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1757, - "grad_norm": 1.6796875, - "learning_rate": 2.4784983233430375e-05, - "long_answer_loss": 0.1757, - "loss": 0.1711, - "short_answer_loss": NaN, - "step": 229, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1674, - "grad_norm": 1.7890625, - "learning_rate": 2.4782115172446966e-05, - "long_answer_loss": 0.1674, - "loss": 0.1734, - "short_answer_loss": NaN, - "step": 230, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.2165, - "grad_norm": 1.7890625, - "learning_rate": 2.4779228277925193e-05, - "long_answer_loss": 0.2165, - "loss": 0.1832, - "short_answer_loss": NaN, - "step": 231, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.178, - "grad_norm": 1.7734375, - "learning_rate": 2.4776322554291854e-05, - "long_answer_loss": 0.178, - "loss": 0.175, - "short_answer_loss": NaN, - "step": 232, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1577, - "grad_norm": 1.6875, - "learning_rate": 2.4773398006002625e-05, - "long_answer_loss": 0.1577, - "loss": 0.1708, - "short_answer_loss": NaN, - "step": 233, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1855, - "grad_norm": 1.8671875, - "learning_rate": 2.4770454637542035e-05, - "long_answer_loss": 0.1855, - "loss": 0.182, - "short_answer_loss": NaN, - "step": 234, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1642, - "grad_norm": 1.734375, - "learning_rate": 2.4767492453423487e-05, - "long_answer_loss": 0.1642, - "loss": 0.1778, - "short_answer_loss": NaN, - "step": 235, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1711, - "grad_norm": 1.859375, - "learning_rate": 2.4764511458189222e-05, - "long_answer_loss": 0.1711, - "loss": 0.1804, - "short_answer_loss": NaN, - "step": 236, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1805, - "grad_norm": 2.015625, - "learning_rate": 2.4761511656410334e-05, - "long_answer_loss": 0.1805, - "loss": 0.1823, - "short_answer_loss": NaN, - "step": 237, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1689, - "grad_norm": 1.765625, - "learning_rate": 2.4758493052686758e-05, - "long_answer_loss": 0.1689, - "loss": 0.1765, - "short_answer_loss": NaN, - "step": 238, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1683, - "grad_norm": 1.6640625, - "learning_rate": 2.4755455651647255e-05, - "long_answer_loss": 0.1683, - "loss": 0.1751, - "short_answer_loss": NaN, - "step": 239, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1831, - "grad_norm": 1.90625, - "learning_rate": 2.475239945794941e-05, - "long_answer_loss": 0.1831, - "loss": 0.178, - "short_answer_loss": NaN, - "step": 240, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1917, - "grad_norm": 1.734375, - "learning_rate": 2.4749324476279622e-05, - "long_answer_loss": 0.1917, - "loss": 0.1782, - "short_answer_loss": NaN, - "step": 241, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.202, - "grad_norm": 1.65625, - "learning_rate": 2.4746230711353115e-05, - "long_answer_loss": 0.202, - "loss": 0.1761, - "short_answer_loss": NaN, - "step": 242, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1779, - "grad_norm": 1.7421875, - "learning_rate": 2.4743118167913893e-05, - "long_answer_loss": 0.1779, - "loss": 0.1727, - "short_answer_loss": NaN, - "step": 243, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1835, - "grad_norm": 1.7109375, - "learning_rate": 2.4739986850734768e-05, - "long_answer_loss": 0.1835, - "loss": 0.1834, - "short_answer_loss": NaN, - "step": 244, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1973, - "grad_norm": 1.7734375, - "learning_rate": 2.473683676461734e-05, - "long_answer_loss": 0.1973, - "loss": 0.1825, - "short_answer_loss": NaN, - "step": 245, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.16, - "grad_norm": 1.671875, - "learning_rate": 2.473366791439199e-05, - "long_answer_loss": 0.16, - "loss": 0.1705, - "short_answer_loss": NaN, - "step": 246, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1712, - "grad_norm": 1.984375, - "learning_rate": 2.473048030491787e-05, - "long_answer_loss": 0.1712, - "loss": 0.1697, - "short_answer_loss": NaN, - "step": 247, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1704, - "grad_norm": 1.9296875, - "learning_rate": 2.472727394108289e-05, - "long_answer_loss": 0.1704, - "loss": 0.1754, - "short_answer_loss": NaN, - "step": 248, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1551, - "grad_norm": 1.8671875, - "learning_rate": 2.4724048827803738e-05, - "long_answer_loss": 0.1551, - "loss": 0.173, - "short_answer_loss": NaN, - "step": 249, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1998, - "grad_norm": 1.8984375, - "learning_rate": 2.4720804970025827e-05, - "long_answer_loss": 0.1998, - "loss": 0.1799, - "short_answer_loss": NaN, - "step": 250, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1711, - "grad_norm": 1.8046875, - "learning_rate": 2.4717542372723333e-05, - "long_answer_loss": 0.1711, - "loss": 0.1704, - "short_answer_loss": NaN, - "step": 251, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1787, - "grad_norm": 1.671875, - "learning_rate": 2.471426104089916e-05, - "long_answer_loss": 0.1787, - "loss": 0.1786, - "short_answer_loss": NaN, - "step": 252, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.2084, - "grad_norm": 1.75, - "learning_rate": 2.4710960979584945e-05, - "long_answer_loss": 0.2084, - "loss": 0.1703, - "short_answer_loss": NaN, - "step": 253, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1814, - "grad_norm": 1.703125, - "learning_rate": 2.4707642193841036e-05, - "long_answer_loss": 0.1814, - "loss": 0.1756, - "short_answer_loss": NaN, - "step": 254, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1797, - "grad_norm": 1.765625, - "learning_rate": 2.470430468875649e-05, - "long_answer_loss": 0.1797, - "loss": 0.1834, - "short_answer_loss": NaN, - "step": 255, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1801, - "grad_norm": 1.765625, - "learning_rate": 2.4700948469449092e-05, - "long_answer_loss": 0.1801, - "loss": 0.1803, - "short_answer_loss": NaN, - "step": 256, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1742, - "grad_norm": 1.7265625, - "learning_rate": 2.4697573541065295e-05, - "long_answer_loss": 0.1742, - "loss": 0.1847, - "short_answer_loss": NaN, - "step": 257, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1778, - "grad_norm": 1.6484375, - "learning_rate": 2.4694179908780257e-05, - "long_answer_loss": 0.1778, - "loss": 0.1741, - "short_answer_loss": NaN, - "step": 258, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1916, - "grad_norm": 1.890625, - "learning_rate": 2.469076757779782e-05, - "long_answer_loss": 0.1916, - "loss": 0.1832, - "short_answer_loss": NaN, - "step": 259, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1921, - "grad_norm": 1.6875, - "learning_rate": 2.4687336553350482e-05, - "long_answer_loss": 0.1921, - "loss": 0.1746, - "short_answer_loss": NaN, - "step": 260, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1907, - "grad_norm": 1.875, - "learning_rate": 2.4683886840699422e-05, - "long_answer_loss": 0.1907, - "loss": 0.1784, - "short_answer_loss": NaN, - "step": 261, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1727, - "grad_norm": 1.65625, - "learning_rate": 2.4680418445134463e-05, - "long_answer_loss": 0.1727, - "loss": 0.1688, - "short_answer_loss": NaN, - "step": 262, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1382, - "grad_norm": 1.8671875, - "learning_rate": 2.4676931371974094e-05, - "long_answer_loss": 0.1382, - "loss": 0.1743, - "short_answer_loss": NaN, - "step": 263, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1783, - "grad_norm": 1.7578125, - "learning_rate": 2.467342562656542e-05, - "long_answer_loss": 0.1783, - "loss": 0.1789, - "short_answer_loss": NaN, - "step": 264, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.169, - "grad_norm": 1.8046875, - "learning_rate": 2.466990121428421e-05, - "long_answer_loss": 0.169, - "loss": 0.1765, - "short_answer_loss": NaN, - "step": 265, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1932, - "grad_norm": 1.9140625, - "learning_rate": 2.4666358140534817e-05, - "long_answer_loss": 0.1932, - "loss": 0.1779, - "short_answer_loss": NaN, - "step": 266, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1589, - "grad_norm": 1.7421875, - "learning_rate": 2.466279641075025e-05, - "long_answer_loss": 0.1589, - "loss": 0.1763, - "short_answer_loss": NaN, - "step": 267, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.1768, - "grad_norm": 1.8046875, - "learning_rate": 2.4659216030392098e-05, - "long_answer_loss": 0.1768, - "loss": 0.1778, - "short_answer_loss": NaN, - "step": 268, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1877, - "grad_norm": 1.6328125, - "learning_rate": 2.4655617004950553e-05, - "long_answer_loss": 0.1877, - "loss": 0.1734, - "short_answer_loss": NaN, - "step": 269, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1784, - "grad_norm": 1.6640625, - "learning_rate": 2.4651999339944416e-05, - "long_answer_loss": 0.1784, - "loss": 0.1822, - "short_answer_loss": NaN, - "step": 270, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1583, - "grad_norm": 1.9296875, - "learning_rate": 2.4648363040921047e-05, - "long_answer_loss": 0.1583, - "loss": 0.1836, - "short_answer_loss": NaN, - "step": 271, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1831, - "grad_norm": 1.6015625, - "learning_rate": 2.4644708113456394e-05, - "long_answer_loss": 0.1831, - "loss": 0.1728, - "short_answer_loss": NaN, - "step": 272, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1654, - "grad_norm": 1.6640625, - "learning_rate": 2.4641034563154957e-05, - "long_answer_loss": 0.1654, - "loss": 0.1671, - "short_answer_loss": NaN, - "step": 273, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1872, - "grad_norm": 1.7734375, - "learning_rate": 2.4637342395649815e-05, - "long_answer_loss": 0.1872, - "loss": 0.1793, - "short_answer_loss": NaN, - "step": 274, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1772, - "grad_norm": 1.6875, - "learning_rate": 2.4633631616602566e-05, - "long_answer_loss": 0.1772, - "loss": 0.169, - "short_answer_loss": NaN, - "step": 275, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.2241, - "grad_norm": 1.7578125, - "learning_rate": 2.462990223170337e-05, - "long_answer_loss": 0.2241, - "loss": 0.1874, - "short_answer_loss": NaN, - "step": 276, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1745, - "grad_norm": 1.5390625, - "learning_rate": 2.4626154246670908e-05, - "long_answer_loss": 0.1745, - "loss": 0.1757, - "short_answer_loss": NaN, - "step": 277, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.154, - "grad_norm": 1.6875, - "learning_rate": 2.4622387667252384e-05, - "long_answer_loss": 0.154, - "loss": 0.1699, - "short_answer_loss": NaN, - "step": 278, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1981, - "grad_norm": 1.8125, - "learning_rate": 2.4618602499223513e-05, - "long_answer_loss": 0.1981, - "loss": 0.1817, - "short_answer_loss": NaN, - "step": 279, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1763, - "grad_norm": 1.703125, - "learning_rate": 2.4614798748388518e-05, - "long_answer_loss": 0.1763, - "loss": 0.1787, - "short_answer_loss": NaN, - "step": 280, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1851, - "grad_norm": 1.6640625, - "learning_rate": 2.461097642058011e-05, - "long_answer_loss": 0.1851, - "loss": 0.177, - "short_answer_loss": NaN, - "step": 281, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1804, - "grad_norm": 1.75, - "learning_rate": 2.4607135521659497e-05, - "long_answer_loss": 0.1804, - "loss": 0.1798, - "short_answer_loss": NaN, - "step": 282, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1867, - "grad_norm": 1.7265625, - "learning_rate": 2.4603276057516356e-05, - "long_answer_loss": 0.1867, - "loss": 0.1818, - "short_answer_loss": NaN, - "step": 283, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1855, - "grad_norm": 1.765625, - "learning_rate": 2.4599398034068836e-05, - "long_answer_loss": 0.1855, - "loss": 0.1757, - "short_answer_loss": NaN, - "step": 284, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1853, - "grad_norm": 1.671875, - "learning_rate": 2.4595501457263538e-05, - "long_answer_loss": 0.1853, - "loss": 0.1715, - "short_answer_loss": NaN, - "step": 285, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1618, - "grad_norm": 1.578125, - "learning_rate": 2.4591586333075522e-05, - "long_answer_loss": 0.1618, - "loss": 0.1775, - "short_answer_loss": NaN, - "step": 286, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1556, - "grad_norm": 1.7734375, - "learning_rate": 2.4587652667508282e-05, - "long_answer_loss": 0.1556, - "loss": 0.1676, - "short_answer_loss": NaN, - "step": 287, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.196, - "grad_norm": 1.6328125, - "learning_rate": 2.458370046659375e-05, - "long_answer_loss": 0.196, - "loss": 0.1759, - "short_answer_loss": NaN, - "step": 288, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1929, - "grad_norm": 1.7578125, - "learning_rate": 2.457972973639228e-05, - "long_answer_loss": 0.1929, - "loss": 0.1803, - "short_answer_loss": NaN, - "step": 289, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1833, - "grad_norm": 1.9609375, - "learning_rate": 2.4575740482992625e-05, - "long_answer_loss": 0.1833, - "loss": 0.1831, - "short_answer_loss": NaN, - "step": 290, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.175, - "grad_norm": 1.609375, - "learning_rate": 2.4571732712511967e-05, - "long_answer_loss": 0.175, - "loss": 0.177, - "short_answer_loss": NaN, - "step": 291, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1867, - "grad_norm": 1.7890625, - "learning_rate": 2.4567706431095855e-05, - "long_answer_loss": 0.1867, - "loss": 0.1814, - "short_answer_loss": NaN, - "step": 292, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1801, - "grad_norm": 1.6484375, - "learning_rate": 2.456366164491824e-05, - "long_answer_loss": 0.1801, - "loss": 0.1825, - "short_answer_loss": NaN, - "step": 293, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1977, - "grad_norm": 1.8515625, - "learning_rate": 2.455959836018145e-05, - "long_answer_loss": 0.1977, - "loss": 0.1818, - "short_answer_loss": NaN, - "step": 294, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1498, - "grad_norm": 1.5703125, - "learning_rate": 2.4555516583116166e-05, - "long_answer_loss": 0.1498, - "loss": 0.1694, - "short_answer_loss": NaN, - "step": 295, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1881, - "grad_norm": 1.515625, - "learning_rate": 2.4551416319981435e-05, - "long_answer_loss": 0.1881, - "loss": 0.1753, - "short_answer_loss": NaN, - "step": 296, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1579, - "grad_norm": 1.6640625, - "learning_rate": 2.4547297577064648e-05, - "long_answer_loss": 0.1579, - "loss": 0.1644, - "short_answer_loss": NaN, - "step": 297, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1872, - "grad_norm": 1.5625, - "learning_rate": 2.4543160360681533e-05, - "long_answer_loss": 0.1872, - "loss": 0.1737, - "short_answer_loss": NaN, - "step": 298, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1537, - "grad_norm": 1.640625, - "learning_rate": 2.4539004677176147e-05, - "long_answer_loss": 0.1537, - "loss": 0.1726, - "short_answer_loss": NaN, - "step": 299, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1694, - "grad_norm": 1.7890625, - "learning_rate": 2.453483053292086e-05, - "long_answer_loss": 0.1694, - "loss": 0.177, - "short_answer_loss": NaN, - "step": 300, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1746, - "grad_norm": 1.6875, - "learning_rate": 2.453063793431636e-05, - "long_answer_loss": 0.1746, - "loss": 0.169, - "short_answer_loss": NaN, - "step": 301, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1845, - "grad_norm": 1.765625, - "learning_rate": 2.4526426887791618e-05, - "long_answer_loss": 0.1845, - "loss": 0.177, - "short_answer_loss": NaN, - "step": 302, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1717, - "grad_norm": 1.703125, - "learning_rate": 2.452219739980391e-05, - "long_answer_loss": 0.1717, - "loss": 0.178, - "short_answer_loss": NaN, - "step": 303, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.2011, - "grad_norm": 1.7734375, - "learning_rate": 2.4517949476838775e-05, - "long_answer_loss": 0.2011, - "loss": 0.176, - "short_answer_loss": NaN, - "step": 304, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1782, - "grad_norm": 1.6640625, - "learning_rate": 2.451368312541003e-05, - "long_answer_loss": 0.1782, - "loss": 0.1744, - "short_answer_loss": NaN, - "step": 305, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1608, - "grad_norm": 1.609375, - "learning_rate": 2.4509398352059755e-05, - "long_answer_loss": 0.1608, - "loss": 0.167, - "short_answer_loss": NaN, - "step": 306, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1702, - "grad_norm": 1.78125, - "learning_rate": 2.450509516335826e-05, - "long_answer_loss": 0.1702, - "loss": 0.174, - "short_answer_loss": NaN, - "step": 307, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1792, - "grad_norm": 1.5625, - "learning_rate": 2.450077356590411e-05, - "long_answer_loss": 0.1792, - "loss": 0.1678, - "short_answer_loss": NaN, - "step": 308, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1848, - "grad_norm": 1.6328125, - "learning_rate": 2.449643356632409e-05, - "long_answer_loss": 0.1848, - "loss": 0.1747, - "short_answer_loss": NaN, - "step": 309, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1703, - "grad_norm": 1.75, - "learning_rate": 2.4492075171273213e-05, - "long_answer_loss": 0.1703, - "loss": 0.1725, - "short_answer_loss": NaN, - "step": 310, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1823, - "grad_norm": 1.7578125, - "learning_rate": 2.4487698387434687e-05, - "long_answer_loss": 0.1823, - "loss": 0.169, - "short_answer_loss": NaN, - "step": 311, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1899, - "grad_norm": 1.640625, - "learning_rate": 2.4483303221519924e-05, - "long_answer_loss": 0.1899, - "loss": 0.1765, - "short_answer_loss": NaN, - "step": 312, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1695, - "grad_norm": 1.7265625, - "learning_rate": 2.4478889680268525e-05, - "long_answer_loss": 0.1695, - "loss": 0.1677, - "short_answer_loss": NaN, - "step": 313, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1753, - "grad_norm": 1.6484375, - "learning_rate": 2.447445777044826e-05, - "long_answer_loss": 0.1753, - "loss": 0.1715, - "short_answer_loss": NaN, - "step": 314, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1616, - "grad_norm": 1.828125, - "learning_rate": 2.4470007498855074e-05, - "long_answer_loss": 0.1616, - "loss": 0.176, - "short_answer_loss": NaN, - "step": 315, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1811, - "grad_norm": 1.6015625, - "learning_rate": 2.446553887231307e-05, - "long_answer_loss": 0.1811, - "loss": 0.1744, - "short_answer_loss": NaN, - "step": 316, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1735, - "grad_norm": 1.609375, - "learning_rate": 2.4461051897674487e-05, - "long_answer_loss": 0.1735, - "loss": 0.1753, - "short_answer_loss": NaN, - "step": 317, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1725, - "grad_norm": 1.6484375, - "learning_rate": 2.44565465818197e-05, - "long_answer_loss": 0.1725, - "loss": 0.1725, - "short_answer_loss": NaN, - "step": 318, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1573, - "grad_norm": 1.546875, - "learning_rate": 2.4452022931657227e-05, - "long_answer_loss": 0.1573, - "loss": 0.1712, - "short_answer_loss": NaN, - "step": 319, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.1584, - "grad_norm": 1.5234375, - "learning_rate": 2.444748095412367e-05, - "long_answer_loss": 0.1584, - "loss": 0.1666, - "short_answer_loss": NaN, - "step": 320, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1572, - "grad_norm": 1.703125, - "learning_rate": 2.4442920656183753e-05, - "long_answer_loss": 0.1572, - "loss": 0.1697, - "short_answer_loss": NaN, - "step": 321, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.153, - "grad_norm": 1.6484375, - "learning_rate": 2.44383420448303e-05, - "long_answer_loss": 0.153, - "loss": 0.1679, - "short_answer_loss": NaN, - "step": 322, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.2001, - "grad_norm": 1.6328125, - "learning_rate": 2.44337451270842e-05, - "long_answer_loss": 0.2001, - "loss": 0.1671, - "short_answer_loss": NaN, - "step": 323, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1627, - "grad_norm": 1.828125, - "learning_rate": 2.442912990999442e-05, - "long_answer_loss": 0.1627, - "loss": 0.1815, - "short_answer_loss": NaN, - "step": 324, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1915, - "grad_norm": 1.765625, - "learning_rate": 2.442449640063799e-05, - "long_answer_loss": 0.1915, - "loss": 0.1767, - "short_answer_loss": NaN, - "step": 325, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1976, - "grad_norm": 1.6875, - "learning_rate": 2.4419844606119982e-05, - "long_answer_loss": 0.1976, - "loss": 0.172, - "short_answer_loss": NaN, - "step": 326, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1881, - "grad_norm": 1.5546875, - "learning_rate": 2.4415174533573516e-05, - "long_answer_loss": 0.1881, - "loss": 0.1681, - "short_answer_loss": NaN, - "step": 327, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1629, - "grad_norm": 1.7265625, - "learning_rate": 2.4410486190159738e-05, - "long_answer_loss": 0.1629, - "loss": 0.1665, - "short_answer_loss": NaN, - "step": 328, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1612, - "grad_norm": 1.578125, - "learning_rate": 2.4405779583067803e-05, - "long_answer_loss": 0.1612, - "loss": 0.1657, - "short_answer_loss": NaN, - "step": 329, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1883, - "grad_norm": 1.59375, - "learning_rate": 2.440105471951488e-05, - "long_answer_loss": 0.1883, - "loss": 0.1729, - "short_answer_loss": NaN, - "step": 330, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1574, - "grad_norm": 1.6875, - "learning_rate": 2.439631160674613e-05, - "long_answer_loss": 0.1574, - "loss": 0.1665, - "short_answer_loss": NaN, - "step": 331, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1602, - "grad_norm": 1.6796875, - "learning_rate": 2.4391550252034696e-05, - "long_answer_loss": 0.1602, - "loss": 0.1691, - "short_answer_loss": NaN, - "step": 332, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1489, - "grad_norm": 1.6796875, - "learning_rate": 2.4386770662681698e-05, - "long_answer_loss": 0.1489, - "loss": 0.1778, - "short_answer_loss": NaN, - "step": 333, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1727, - "grad_norm": 1.71875, - "learning_rate": 2.4381972846016204e-05, - "long_answer_loss": 0.1727, - "loss": 0.1757, - "short_answer_loss": NaN, - "step": 334, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1515, - "grad_norm": 1.65625, - "learning_rate": 2.4377156809395256e-05, - "long_answer_loss": 0.1515, - "loss": 0.159, - "short_answer_loss": NaN, - "step": 335, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.157, - "grad_norm": 1.625, - "learning_rate": 2.4372322560203814e-05, - "long_answer_loss": 0.157, - "loss": 0.1645, - "short_answer_loss": NaN, - "step": 336, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1804, - "grad_norm": 1.5703125, - "learning_rate": 2.4367470105854766e-05, - "long_answer_loss": 0.1804, - "loss": 0.1709, - "short_answer_loss": NaN, - "step": 337, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1665, - "grad_norm": 1.59375, - "learning_rate": 2.436259945378893e-05, - "long_answer_loss": 0.1665, - "loss": 0.179, - "short_answer_loss": NaN, - "step": 338, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1861, - "grad_norm": 1.6875, - "learning_rate": 2.4357710611475022e-05, - "long_answer_loss": 0.1861, - "loss": 0.1735, - "short_answer_loss": NaN, - "step": 339, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1816, - "grad_norm": 1.671875, - "learning_rate": 2.4352803586409644e-05, - "long_answer_loss": 0.1816, - "loss": 0.1744, - "short_answer_loss": NaN, - "step": 340, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1678, - "grad_norm": 1.53125, - "learning_rate": 2.4347878386117287e-05, - "long_answer_loss": 0.1678, - "loss": 0.1712, - "short_answer_loss": NaN, - "step": 341, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1804, - "grad_norm": 1.65625, - "learning_rate": 2.434293501815031e-05, - "long_answer_loss": 0.1804, - "loss": 0.1749, - "short_answer_loss": NaN, - "step": 342, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1898, - "grad_norm": 1.5703125, - "learning_rate": 2.4337973490088932e-05, - "long_answer_loss": 0.1898, - "loss": 0.1681, - "short_answer_loss": NaN, - "step": 343, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1709, - "grad_norm": 1.609375, - "learning_rate": 2.4332993809541222e-05, - "long_answer_loss": 0.1709, - "loss": 0.1673, - "short_answer_loss": NaN, - "step": 344, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1666, - "grad_norm": 1.6953125, - "learning_rate": 2.432799598414307e-05, - "long_answer_loss": 0.1666, - "loss": 0.1689, - "short_answer_loss": NaN, - "step": 345, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.1463, - "grad_norm": 1.5546875, - "learning_rate": 2.4322980021558208e-05, - "long_answer_loss": 0.1463, - "loss": 0.1599, - "short_answer_loss": NaN, - "step": 346, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1684, - "grad_norm": 1.6796875, - "learning_rate": 2.4317945929478167e-05, - "long_answer_loss": 0.1684, - "loss": 0.1701, - "short_answer_loss": NaN, - "step": 347, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.2068, - "grad_norm": 1.75, - "learning_rate": 2.4312893715622287e-05, - "long_answer_loss": 0.2068, - "loss": 0.1775, - "short_answer_loss": NaN, - "step": 348, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1742, - "grad_norm": 1.6015625, - "learning_rate": 2.4307823387737688e-05, - "long_answer_loss": 0.1742, - "loss": 0.172, - "short_answer_loss": NaN, - "step": 349, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1693, - "grad_norm": 1.734375, - "learning_rate": 2.4302734953599267e-05, - "long_answer_loss": 0.1693, - "loss": 0.1699, - "short_answer_loss": NaN, - "step": 350, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1693, - "grad_norm": 1.625, - "learning_rate": 2.4297628421009696e-05, - "long_answer_loss": 0.1693, - "loss": 0.1704, - "short_answer_loss": NaN, - "step": 351, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.2014, - "grad_norm": 1.765625, - "learning_rate": 2.4292503797799387e-05, - "long_answer_loss": 0.2014, - "loss": 0.1809, - "short_answer_loss": NaN, - "step": 352, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1607, - "grad_norm": 1.5234375, - "learning_rate": 2.4287361091826493e-05, - "long_answer_loss": 0.1607, - "loss": 0.1639, - "short_answer_loss": NaN, - "step": 353, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1781, - "grad_norm": 1.59375, - "learning_rate": 2.4282200310976908e-05, - "long_answer_loss": 0.1781, - "loss": 0.1739, - "short_answer_loss": NaN, - "step": 354, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1875, - "grad_norm": 1.5625, - "learning_rate": 2.4277021463164225e-05, - "long_answer_loss": 0.1875, - "loss": 0.1692, - "short_answer_loss": NaN, - "step": 355, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1818, - "grad_norm": 1.6796875, - "learning_rate": 2.427182455632976e-05, - "long_answer_loss": 0.1818, - "loss": 0.1769, - "short_answer_loss": NaN, - "step": 356, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1429, - "grad_norm": 1.8125, - "learning_rate": 2.4266609598442496e-05, - "long_answer_loss": 0.1429, - "loss": 0.1608, - "short_answer_loss": NaN, - "step": 357, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1654, - "grad_norm": 1.515625, - "learning_rate": 2.426137659749912e-05, - "long_answer_loss": 0.1654, - "loss": 0.1686, - "short_answer_loss": NaN, - "step": 358, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1614, - "grad_norm": 1.7578125, - "learning_rate": 2.4256125561523973e-05, - "long_answer_loss": 0.1614, - "loss": 0.1804, - "short_answer_loss": NaN, - "step": 359, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1662, - "grad_norm": 1.5390625, - "learning_rate": 2.425085649856906e-05, - "long_answer_loss": 0.1662, - "loss": 0.1695, - "short_answer_loss": NaN, - "step": 360, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1752, - "grad_norm": 1.640625, - "learning_rate": 2.424556941671402e-05, - "long_answer_loss": 0.1752, - "loss": 0.1774, - "short_answer_loss": NaN, - "step": 361, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1553, - "grad_norm": 1.8046875, - "learning_rate": 2.424026432406612e-05, - "long_answer_loss": 0.1553, - "loss": 0.1713, - "short_answer_loss": NaN, - "step": 362, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1801, - "grad_norm": 1.625, - "learning_rate": 2.423494122876026e-05, - "long_answer_loss": 0.1801, - "loss": 0.1714, - "short_answer_loss": NaN, - "step": 363, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1475, - "grad_norm": 1.6015625, - "learning_rate": 2.422960013895893e-05, - "long_answer_loss": 0.1475, - "loss": 0.1713, - "short_answer_loss": NaN, - "step": 364, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1626, - "grad_norm": 1.578125, - "learning_rate": 2.4224241062852223e-05, - "long_answer_loss": 0.1626, - "loss": 0.1728, - "short_answer_loss": NaN, - "step": 365, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1864, - "grad_norm": 1.5859375, - "learning_rate": 2.421886400865781e-05, - "long_answer_loss": 0.1864, - "loss": 0.172, - "short_answer_loss": NaN, - "step": 366, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1619, - "grad_norm": 1.6796875, - "learning_rate": 2.421346898462092e-05, - "long_answer_loss": 0.1619, - "loss": 0.1722, - "short_answer_loss": NaN, - "step": 367, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1571, - "grad_norm": 1.6328125, - "learning_rate": 2.4208055999014358e-05, - "long_answer_loss": 0.1571, - "loss": 0.1704, - "short_answer_loss": NaN, - "step": 368, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1906, - "grad_norm": 1.53125, - "learning_rate": 2.4202625060138448e-05, - "long_answer_loss": 0.1906, - "loss": 0.1673, - "short_answer_loss": NaN, - "step": 369, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1708, - "grad_norm": 1.4921875, - "learning_rate": 2.4197176176321062e-05, - "long_answer_loss": 0.1708, - "loss": 0.1736, - "short_answer_loss": NaN, - "step": 370, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1703, - "grad_norm": 1.5078125, - "learning_rate": 2.4191709355917578e-05, - "long_answer_loss": 0.1703, - "loss": 0.1592, - "short_answer_loss": NaN, - "step": 371, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1814, - "grad_norm": 1.546875, - "learning_rate": 2.4186224607310885e-05, - "long_answer_loss": 0.1814, - "loss": 0.1775, - "short_answer_loss": NaN, - "step": 372, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1752, - "grad_norm": 1.578125, - "learning_rate": 2.4180721938911354e-05, - "long_answer_loss": 0.1752, - "loss": 0.175, - "short_answer_loss": NaN, - "step": 373, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1876, - "grad_norm": 1.5, - "learning_rate": 2.417520135915685e-05, - "long_answer_loss": 0.1876, - "loss": 0.1647, - "short_answer_loss": NaN, - "step": 374, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1672, - "grad_norm": 1.625, - "learning_rate": 2.416966287651269e-05, - "long_answer_loss": 0.1672, - "loss": 0.1706, - "short_answer_loss": NaN, - "step": 375, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1832, - "grad_norm": 1.65625, - "learning_rate": 2.4164106499471647e-05, - "long_answer_loss": 0.1832, - "loss": 0.172, - "short_answer_loss": NaN, - "step": 376, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1775, - "grad_norm": 1.609375, - "learning_rate": 2.4158532236553934e-05, - "long_answer_loss": 0.1775, - "loss": 0.167, - "short_answer_loss": NaN, - "step": 377, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1594, - "grad_norm": 1.4921875, - "learning_rate": 2.4152940096307192e-05, - "long_answer_loss": 0.1594, - "loss": 0.1584, - "short_answer_loss": NaN, - "step": 378, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.178, - "grad_norm": 1.5078125, - "learning_rate": 2.4147330087306475e-05, - "long_answer_loss": 0.178, - "loss": 0.1672, - "short_answer_loss": NaN, - "step": 379, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1556, - "grad_norm": 1.5390625, - "learning_rate": 2.4141702218154232e-05, - "long_answer_loss": 0.1556, - "loss": 0.1693, - "short_answer_loss": NaN, - "step": 380, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1987, - "grad_norm": 1.5, - "learning_rate": 2.4136056497480306e-05, - "long_answer_loss": 0.1987, - "loss": 0.1739, - "short_answer_loss": NaN, - "step": 381, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1702, - "grad_norm": 1.65625, - "learning_rate": 2.413039293394191e-05, - "long_answer_loss": 0.1702, - "loss": 0.1722, - "short_answer_loss": NaN, - "step": 382, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1701, - "grad_norm": 1.5546875, - "learning_rate": 2.4124711536223623e-05, - "long_answer_loss": 0.1701, - "loss": 0.1808, - "short_answer_loss": NaN, - "step": 383, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1951, - "grad_norm": 1.5546875, - "learning_rate": 2.4119012313037353e-05, - "long_answer_loss": 0.1951, - "loss": 0.1744, - "short_answer_loss": NaN, - "step": 384, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1922, - "grad_norm": 1.59375, - "learning_rate": 2.411329527312237e-05, - "long_answer_loss": 0.1922, - "loss": 0.1743, - "short_answer_loss": NaN, - "step": 385, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1705, - "grad_norm": 1.546875, - "learning_rate": 2.4107560425245248e-05, - "long_answer_loss": 0.1705, - "loss": 0.1721, - "short_answer_loss": NaN, - "step": 386, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1606, - "grad_norm": 1.5078125, - "learning_rate": 2.4101807778199858e-05, - "long_answer_loss": 0.1606, - "loss": 0.1718, - "short_answer_loss": NaN, - "step": 387, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1857, - "grad_norm": 1.6484375, - "learning_rate": 2.4096037340807385e-05, - "long_answer_loss": 0.1857, - "loss": 0.1783, - "short_answer_loss": NaN, - "step": 388, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1812, - "grad_norm": 1.5546875, - "learning_rate": 2.4090249121916284e-05, - "long_answer_loss": 0.1812, - "loss": 0.1796, - "short_answer_loss": NaN, - "step": 389, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1745, - "grad_norm": 1.6484375, - "learning_rate": 2.4084443130402274e-05, - "long_answer_loss": 0.1745, - "loss": 0.1636, - "short_answer_loss": NaN, - "step": 390, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1987, - "grad_norm": 1.640625, - "learning_rate": 2.4078619375168333e-05, - "long_answer_loss": 0.1987, - "loss": 0.186, - "short_answer_loss": NaN, - "step": 391, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1688, - "grad_norm": 1.6171875, - "learning_rate": 2.4072777865144678e-05, - "long_answer_loss": 0.1688, - "loss": 0.171, - "short_answer_loss": NaN, - "step": 392, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1596, - "grad_norm": 1.5859375, - "learning_rate": 2.406691860928874e-05, - "long_answer_loss": 0.1596, - "loss": 0.1638, - "short_answer_loss": NaN, - "step": 393, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1692, - "grad_norm": 1.5859375, - "learning_rate": 2.4061041616585177e-05, - "long_answer_loss": 0.1692, - "loss": 0.1742, - "short_answer_loss": NaN, - "step": 394, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1661, - "grad_norm": 1.6015625, - "learning_rate": 2.4055146896045837e-05, - "long_answer_loss": 0.1661, - "loss": 0.1697, - "short_answer_loss": NaN, - "step": 395, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1562, - "grad_norm": 1.515625, - "learning_rate": 2.404923445670975e-05, - "long_answer_loss": 0.1562, - "loss": 0.1675, - "short_answer_loss": NaN, - "step": 396, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1623, - "grad_norm": 1.46875, - "learning_rate": 2.404330430764312e-05, - "long_answer_loss": 0.1623, - "loss": 0.1648, - "short_answer_loss": NaN, - "step": 397, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1756, - "grad_norm": 1.5078125, - "learning_rate": 2.4037356457939307e-05, - "long_answer_loss": 0.1756, - "loss": 0.1713, - "short_answer_loss": NaN, - "step": 398, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1489, - "grad_norm": 1.578125, - "learning_rate": 2.403139091671882e-05, - "long_answer_loss": 0.1489, - "loss": 0.171, - "short_answer_loss": NaN, - "step": 399, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1594, - "grad_norm": 1.4765625, - "learning_rate": 2.4025407693129278e-05, - "long_answer_loss": 0.1594, - "loss": 0.1655, - "short_answer_loss": NaN, - "step": 400, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1474, - "grad_norm": 1.5, - "learning_rate": 2.4019406796345434e-05, - "long_answer_loss": 0.1474, - "loss": 0.1661, - "short_answer_loss": NaN, - "step": 401, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1628, - "grad_norm": 1.609375, - "learning_rate": 2.401338823556913e-05, - "long_answer_loss": 0.1628, - "loss": 0.1789, - "short_answer_loss": NaN, - "step": 402, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1855, - "grad_norm": 1.546875, - "learning_rate": 2.4007352020029292e-05, - "long_answer_loss": 0.1855, - "loss": 0.1672, - "short_answer_loss": NaN, - "step": 403, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1614, - "grad_norm": 1.484375, - "learning_rate": 2.400129815898193e-05, - "long_answer_loss": 0.1614, - "loss": 0.1633, - "short_answer_loss": NaN, - "step": 404, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1755, - "grad_norm": 1.6484375, - "learning_rate": 2.3995226661710105e-05, - "long_answer_loss": 0.1755, - "loss": 0.171, - "short_answer_loss": NaN, - "step": 405, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1746, - "grad_norm": 1.5859375, - "learning_rate": 2.3989137537523922e-05, - "long_answer_loss": 0.1746, - "loss": 0.1729, - "short_answer_loss": NaN, - "step": 406, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1726, - "grad_norm": 1.4375, - "learning_rate": 2.3983030795760504e-05, - "long_answer_loss": 0.1726, - "loss": 0.1673, - "short_answer_loss": NaN, - "step": 407, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.153, - "grad_norm": 1.5625, - "learning_rate": 2.3976906445784015e-05, - "long_answer_loss": 0.153, - "loss": 0.1792, - "short_answer_loss": NaN, - "step": 408, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1705, - "grad_norm": 1.5859375, - "learning_rate": 2.3970764496985597e-05, - "long_answer_loss": 0.1705, - "loss": 0.1704, - "short_answer_loss": NaN, - "step": 409, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1677, - "grad_norm": 1.484375, - "learning_rate": 2.3964604958783388e-05, - "long_answer_loss": 0.1677, - "loss": 0.1656, - "short_answer_loss": NaN, - "step": 410, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1659, - "grad_norm": 1.5, - "learning_rate": 2.3958427840622495e-05, - "long_answer_loss": 0.1659, - "loss": 0.1671, - "short_answer_loss": NaN, - "step": 411, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1975, - "grad_norm": 1.5390625, - "learning_rate": 2.3952233151974978e-05, - "long_answer_loss": 0.1975, - "loss": 0.1751, - "short_answer_loss": NaN, - "step": 412, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1731, - "grad_norm": 1.5703125, - "learning_rate": 2.394602090233985e-05, - "long_answer_loss": 0.1731, - "loss": 0.1785, - "short_answer_loss": NaN, - "step": 413, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.154, - "grad_norm": 1.578125, - "learning_rate": 2.393979110124305e-05, - "long_answer_loss": 0.154, - "loss": 0.1602, - "short_answer_loss": NaN, - "step": 414, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1741, - "grad_norm": 1.59375, - "learning_rate": 2.3933543758237418e-05, - "long_answer_loss": 0.1741, - "loss": 0.176, - "short_answer_loss": NaN, - "step": 415, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1838, - "grad_norm": 1.46875, - "learning_rate": 2.392727888290271e-05, - "long_answer_loss": 0.1838, - "loss": 0.1675, - "short_answer_loss": NaN, - "step": 416, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1626, - "grad_norm": 1.546875, - "learning_rate": 2.3920996484845558e-05, - "long_answer_loss": 0.1626, - "loss": 0.1733, - "short_answer_loss": NaN, - "step": 417, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1627, - "grad_norm": 1.59375, - "learning_rate": 2.391469657369946e-05, - "long_answer_loss": 0.1627, - "loss": 0.1747, - "short_answer_loss": NaN, - "step": 418, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1681, - "grad_norm": 1.515625, - "learning_rate": 2.3908379159124777e-05, - "long_answer_loss": 0.1681, - "loss": 0.1673, - "short_answer_loss": NaN, - "step": 419, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1527, - "grad_norm": 1.5546875, - "learning_rate": 2.3902044250808705e-05, - "long_answer_loss": 0.1527, - "loss": 0.1772, - "short_answer_loss": NaN, - "step": 420, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1756, - "grad_norm": 1.5703125, - "learning_rate": 2.3895691858465267e-05, - "long_answer_loss": 0.1756, - "loss": 0.1732, - "short_answer_loss": NaN, - "step": 421, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1576, - "grad_norm": 1.4765625, - "learning_rate": 2.3889321991835296e-05, - "long_answer_loss": 0.1576, - "loss": 0.1627, - "short_answer_loss": NaN, - "step": 422, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1607, - "grad_norm": 1.6953125, - "learning_rate": 2.3882934660686418e-05, - "long_answer_loss": 0.1607, - "loss": 0.1717, - "short_answer_loss": NaN, - "step": 423, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1931, - "grad_norm": 1.5078125, - "learning_rate": 2.3876529874813036e-05, - "long_answer_loss": 0.1931, - "loss": 0.182, - "short_answer_loss": NaN, - "step": 424, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1765, - "grad_norm": 1.6484375, - "learning_rate": 2.3870107644036334e-05, - "long_answer_loss": 0.1765, - "loss": 0.1709, - "short_answer_loss": NaN, - "step": 425, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.2063, - "grad_norm": 1.53125, - "learning_rate": 2.3863667978204225e-05, - "long_answer_loss": 0.2063, - "loss": 0.1742, - "short_answer_loss": NaN, - "step": 426, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1628, - "grad_norm": 1.484375, - "learning_rate": 2.385721088719138e-05, - "long_answer_loss": 0.1628, - "loss": 0.1699, - "short_answer_loss": NaN, - "step": 427, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1639, - "grad_norm": 1.5703125, - "learning_rate": 2.385073638089916e-05, - "long_answer_loss": 0.1639, - "loss": 0.1723, - "short_answer_loss": NaN, - "step": 428, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1981, - "grad_norm": 1.375, - "learning_rate": 2.3844244469255665e-05, - "long_answer_loss": 0.1981, - "loss": 0.1602, - "short_answer_loss": NaN, - "step": 429, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1775, - "grad_norm": 1.59375, - "learning_rate": 2.383773516221566e-05, - "long_answer_loss": 0.1775, - "loss": 0.1674, - "short_answer_loss": NaN, - "step": 430, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1972, - "grad_norm": 1.5625, - "learning_rate": 2.3831208469760588e-05, - "long_answer_loss": 0.1972, - "loss": 0.1734, - "short_answer_loss": NaN, - "step": 431, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1634, - "grad_norm": 1.5625, - "learning_rate": 2.3824664401898564e-05, - "long_answer_loss": 0.1634, - "loss": 0.1686, - "short_answer_loss": NaN, - "step": 432, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1852, - "grad_norm": 1.65625, - "learning_rate": 2.3818102968664334e-05, - "long_answer_loss": 0.1852, - "loss": 0.1734, - "short_answer_loss": NaN, - "step": 433, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.164, - "grad_norm": 1.5703125, - "learning_rate": 2.3811524180119276e-05, - "long_answer_loss": 0.164, - "loss": 0.1704, - "short_answer_loss": NaN, - "step": 434, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1565, - "grad_norm": 1.5, - "learning_rate": 2.3804928046351384e-05, - "long_answer_loss": 0.1565, - "loss": 0.1663, - "short_answer_loss": NaN, - "step": 435, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1741, - "grad_norm": 1.5859375, - "learning_rate": 2.379831457747524e-05, - "long_answer_loss": 0.1741, - "loss": 0.1718, - "short_answer_loss": NaN, - "step": 436, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1936, - "grad_norm": 1.640625, - "learning_rate": 2.3791683783632018e-05, - "long_answer_loss": 0.1936, - "loss": 0.1797, - "short_answer_loss": NaN, - "step": 437, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1689, - "grad_norm": 1.515625, - "learning_rate": 2.3785035674989452e-05, - "long_answer_loss": 0.1689, - "loss": 0.1723, - "short_answer_loss": NaN, - "step": 438, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1903, - "grad_norm": 1.671875, - "learning_rate": 2.3778370261741834e-05, - "long_answer_loss": 0.1903, - "loss": 0.1775, - "short_answer_loss": NaN, - "step": 439, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1722, - "grad_norm": 1.515625, - "learning_rate": 2.3771687554109983e-05, - "long_answer_loss": 0.1722, - "loss": 0.1616, - "short_answer_loss": NaN, - "step": 440, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1647, - "grad_norm": 1.4609375, - "learning_rate": 2.376498756234124e-05, - "long_answer_loss": 0.1647, - "loss": 0.157, - "short_answer_loss": NaN, - "step": 441, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1953, - "grad_norm": 1.671875, - "learning_rate": 2.3758270296709455e-05, - "long_answer_loss": 0.1953, - "loss": 0.1775, - "short_answer_loss": NaN, - "step": 442, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1512, - "grad_norm": 1.609375, - "learning_rate": 2.3751535767514955e-05, - "long_answer_loss": 0.1512, - "loss": 0.1677, - "short_answer_loss": NaN, - "step": 443, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1621, - "grad_norm": 1.6796875, - "learning_rate": 2.374478398508455e-05, - "long_answer_loss": 0.1621, - "loss": 0.1739, - "short_answer_loss": NaN, - "step": 444, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.144, - "grad_norm": 1.5078125, - "learning_rate": 2.3738014959771498e-05, - "long_answer_loss": 0.144, - "loss": 0.1536, - "short_answer_loss": NaN, - "step": 445, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.17, - "grad_norm": 1.4296875, - "learning_rate": 2.3731228701955506e-05, - "long_answer_loss": 0.17, - "loss": 0.1623, - "short_answer_loss": NaN, - "step": 446, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1483, - "grad_norm": 1.4921875, - "learning_rate": 2.3724425222042692e-05, - "long_answer_loss": 0.1483, - "loss": 0.1691, - "short_answer_loss": NaN, - "step": 447, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1925, - "grad_norm": 1.53125, - "learning_rate": 2.3717604530465604e-05, - "long_answer_loss": 0.1925, - "loss": 0.1725, - "short_answer_loss": NaN, - "step": 448, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1583, - "grad_norm": 1.5078125, - "learning_rate": 2.3710766637683158e-05, - "long_answer_loss": 0.1583, - "loss": 0.1693, - "short_answer_loss": NaN, - "step": 449, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1951, - "grad_norm": 1.4375, - "learning_rate": 2.3703911554180666e-05, - "long_answer_loss": 0.1951, - "loss": 0.1755, - "short_answer_loss": NaN, - "step": 450, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1633, - "grad_norm": 1.390625, - "learning_rate": 2.369703929046979e-05, - "long_answer_loss": 0.1633, - "loss": 0.1632, - "short_answer_loss": NaN, - "step": 451, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1814, - "grad_norm": 1.4921875, - "learning_rate": 2.369014985708854e-05, - "long_answer_loss": 0.1814, - "loss": 0.1706, - "short_answer_loss": NaN, - "step": 452, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.178, - "grad_norm": 1.53125, - "learning_rate": 2.3683243264601253e-05, - "long_answer_loss": 0.178, - "loss": 0.1762, - "short_answer_loss": NaN, - "step": 453, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1773, - "grad_norm": 1.578125, - "learning_rate": 2.3676319523598577e-05, - "long_answer_loss": 0.1773, - "loss": 0.1691, - "short_answer_loss": NaN, - "step": 454, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1677, - "grad_norm": 1.4453125, - "learning_rate": 2.366937864469746e-05, - "long_answer_loss": 0.1677, - "loss": 0.1714, - "short_answer_loss": NaN, - "step": 455, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1799, - "grad_norm": 1.40625, - "learning_rate": 2.366242063854112e-05, - "long_answer_loss": 0.1799, - "loss": 0.1653, - "short_answer_loss": NaN, - "step": 456, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1721, - "grad_norm": 1.515625, - "learning_rate": 2.3655445515799053e-05, - "long_answer_loss": 0.1721, - "loss": 0.1673, - "short_answer_loss": NaN, - "step": 457, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1854, - "grad_norm": 1.4921875, - "learning_rate": 2.364845328716699e-05, - "long_answer_loss": 0.1854, - "loss": 0.1654, - "short_answer_loss": NaN, - "step": 458, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1768, - "grad_norm": 1.546875, - "learning_rate": 2.3641443963366893e-05, - "long_answer_loss": 0.1768, - "loss": 0.1682, - "short_answer_loss": NaN, - "step": 459, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1397, - "grad_norm": 1.6171875, - "learning_rate": 2.3634417555146944e-05, - "long_answer_loss": 0.1397, - "loss": 0.1704, - "short_answer_loss": NaN, - "step": 460, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1699, - "grad_norm": 1.4921875, - "learning_rate": 2.3627374073281522e-05, - "long_answer_loss": 0.1699, - "loss": 0.1669, - "short_answer_loss": NaN, - "step": 461, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1854, - "grad_norm": 1.453125, - "learning_rate": 2.3620313528571175e-05, - "long_answer_loss": 0.1854, - "loss": 0.1676, - "short_answer_loss": NaN, - "step": 462, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1752, - "grad_norm": 1.546875, - "learning_rate": 2.361323593184263e-05, - "long_answer_loss": 0.1752, - "loss": 0.1767, - "short_answer_loss": NaN, - "step": 463, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1727, - "grad_norm": 1.421875, - "learning_rate": 2.360614129394876e-05, - "long_answer_loss": 0.1727, - "loss": 0.1593, - "short_answer_loss": NaN, - "step": 464, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1874, - "grad_norm": 1.515625, - "learning_rate": 2.359902962576856e-05, - "long_answer_loss": 0.1874, - "loss": 0.1697, - "short_answer_loss": NaN, - "step": 465, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.161, - "grad_norm": 1.4296875, - "learning_rate": 2.3591900938207147e-05, - "long_answer_loss": 0.161, - "loss": 0.1741, - "short_answer_loss": NaN, - "step": 466, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1667, - "grad_norm": 1.53125, - "learning_rate": 2.358475524219573e-05, - "long_answer_loss": 0.1667, - "loss": 0.1749, - "short_answer_loss": NaN, - "step": 467, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1708, - "grad_norm": 1.4921875, - "learning_rate": 2.3577592548691606e-05, - "long_answer_loss": 0.1708, - "loss": 0.1739, - "short_answer_loss": NaN, - "step": 468, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1475, - "grad_norm": 1.6015625, - "learning_rate": 2.3570412868678132e-05, - "long_answer_loss": 0.1475, - "loss": 0.1611, - "short_answer_loss": NaN, - "step": 469, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.2036, - "grad_norm": 1.515625, - "learning_rate": 2.3563216213164713e-05, - "long_answer_loss": 0.2036, - "loss": 0.1762, - "short_answer_loss": NaN, - "step": 470, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1634, - "grad_norm": 1.421875, - "learning_rate": 2.3556002593186783e-05, - "long_answer_loss": 0.1634, - "loss": 0.1632, - "short_answer_loss": NaN, - "step": 471, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1565, - "grad_norm": 1.5859375, - "learning_rate": 2.3548772019805793e-05, - "long_answer_loss": 0.1565, - "loss": 0.1654, - "short_answer_loss": NaN, - "step": 472, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1365, - "grad_norm": 1.515625, - "learning_rate": 2.3541524504109182e-05, - "long_answer_loss": 0.1365, - "loss": 0.1658, - "short_answer_loss": NaN, - "step": 473, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1582, - "grad_norm": 1.4453125, - "learning_rate": 2.3534260057210384e-05, - "long_answer_loss": 0.1582, - "loss": 0.1704, - "short_answer_loss": NaN, - "step": 474, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.2011, - "grad_norm": 1.4375, - "learning_rate": 2.3526978690248782e-05, - "long_answer_loss": 0.2011, - "loss": 0.1601, - "short_answer_loss": NaN, - "step": 475, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1821, - "grad_norm": 1.5078125, - "learning_rate": 2.351968041438971e-05, - "long_answer_loss": 0.1821, - "loss": 0.1651, - "short_answer_loss": NaN, - "step": 476, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1619, - "grad_norm": 1.4453125, - "learning_rate": 2.3512365240824426e-05, - "long_answer_loss": 0.1619, - "loss": 0.1641, - "short_answer_loss": NaN, - "step": 477, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1637, - "grad_norm": 1.4296875, - "learning_rate": 2.350503318077011e-05, - "long_answer_loss": 0.1637, - "loss": 0.1589, - "short_answer_loss": NaN, - "step": 478, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.2126, - "grad_norm": 1.734375, - "learning_rate": 2.3497684245469816e-05, - "long_answer_loss": 0.2126, - "loss": 0.1755, - "short_answer_loss": NaN, - "step": 479, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1556, - "grad_norm": 1.421875, - "learning_rate": 2.3490318446192498e-05, - "long_answer_loss": 0.1556, - "loss": 0.1589, - "short_answer_loss": NaN, - "step": 480, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1729, - "grad_norm": 1.5, - "learning_rate": 2.3482935794232953e-05, - "long_answer_loss": 0.1729, - "loss": 0.1698, - "short_answer_loss": NaN, - "step": 481, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1569, - "grad_norm": 1.515625, - "learning_rate": 2.3475536300911827e-05, - "long_answer_loss": 0.1569, - "loss": 0.1691, - "short_answer_loss": NaN, - "step": 482, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1344, - "grad_norm": 1.4765625, - "learning_rate": 2.346811997757559e-05, - "long_answer_loss": 0.1344, - "loss": 0.168, - "short_answer_loss": NaN, - "step": 483, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1579, - "grad_norm": 1.4921875, - "learning_rate": 2.3460686835596514e-05, - "long_answer_loss": 0.1579, - "loss": 0.1681, - "short_answer_loss": NaN, - "step": 484, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1907, - "grad_norm": 1.515625, - "learning_rate": 2.345323688637267e-05, - "long_answer_loss": 0.1907, - "loss": 0.1705, - "short_answer_loss": NaN, - "step": 485, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1715, - "grad_norm": 1.546875, - "learning_rate": 2.34457701413279e-05, - "long_answer_loss": 0.1715, - "loss": 0.1613, - "short_answer_loss": NaN, - "step": 486, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1533, - "grad_norm": 1.3984375, - "learning_rate": 2.3438286611911787e-05, - "long_answer_loss": 0.1533, - "loss": 0.1624, - "short_answer_loss": NaN, - "step": 487, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1697, - "grad_norm": 1.5703125, - "learning_rate": 2.3430786309599674e-05, - "long_answer_loss": 0.1697, - "loss": 0.1659, - "short_answer_loss": NaN, - "step": 488, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1823, - "grad_norm": 1.6328125, - "learning_rate": 2.3423269245892602e-05, - "long_answer_loss": 0.1823, - "loss": 0.1716, - "short_answer_loss": NaN, - "step": 489, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1861, - "grad_norm": 1.5625, - "learning_rate": 2.3415735432317328e-05, - "long_answer_loss": 0.1861, - "loss": 0.172, - "short_answer_loss": NaN, - "step": 490, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1726, - "grad_norm": 1.3984375, - "learning_rate": 2.3408184880426293e-05, - "long_answer_loss": 0.1726, - "loss": 0.1685, - "short_answer_loss": NaN, - "step": 491, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1605, - "grad_norm": 1.46875, - "learning_rate": 2.3400617601797597e-05, - "long_answer_loss": 0.1605, - "loss": 0.1666, - "short_answer_loss": NaN, - "step": 492, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1579, - "grad_norm": 1.609375, - "learning_rate": 2.3393033608034993e-05, - "long_answer_loss": 0.1579, - "loss": 0.164, - "short_answer_loss": NaN, - "step": 493, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1729, - "grad_norm": 1.4765625, - "learning_rate": 2.338543291076787e-05, - "long_answer_loss": 0.1729, - "loss": 0.1662, - "short_answer_loss": NaN, - "step": 494, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1626, - "grad_norm": 1.515625, - "learning_rate": 2.3377815521651213e-05, - "long_answer_loss": 0.1626, - "loss": 0.1745, - "short_answer_loss": NaN, - "step": 495, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1797, - "grad_norm": 1.6015625, - "learning_rate": 2.3370181452365633e-05, - "long_answer_loss": 0.1797, - "loss": 0.175, - "short_answer_loss": NaN, - "step": 496, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1468, - "grad_norm": 1.453125, - "learning_rate": 2.3362530714617287e-05, - "long_answer_loss": 0.1468, - "loss": 0.1721, - "short_answer_loss": NaN, - "step": 497, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1568, - "grad_norm": 1.4765625, - "learning_rate": 2.3354863320137916e-05, - "long_answer_loss": 0.1568, - "loss": 0.1665, - "short_answer_loss": NaN, - "step": 498, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1422, - "grad_norm": 1.5390625, - "learning_rate": 2.3347179280684782e-05, - "long_answer_loss": 0.1422, - "loss": 0.1718, - "short_answer_loss": NaN, - "step": 499, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1559, - "grad_norm": 1.546875, - "learning_rate": 2.3339478608040682e-05, - "long_answer_loss": 0.1559, - "loss": 0.1763, - "short_answer_loss": NaN, - "step": 500, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1621, - "grad_norm": 1.484375, - "learning_rate": 2.3331761314013924e-05, - "long_answer_loss": 0.1621, - "loss": 0.165, - "short_answer_loss": NaN, - "step": 501, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1939, - "grad_norm": 1.5390625, - "learning_rate": 2.3324027410438288e-05, - "long_answer_loss": 0.1939, - "loss": 0.1665, - "short_answer_loss": NaN, - "step": 502, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1553, - "grad_norm": 1.40625, - "learning_rate": 2.331627690917304e-05, - "long_answer_loss": 0.1553, - "loss": 0.1684, - "short_answer_loss": NaN, - "step": 503, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1612, - "grad_norm": 1.421875, - "learning_rate": 2.3308509822102884e-05, - "long_answer_loss": 0.1612, - "loss": 0.1664, - "short_answer_loss": NaN, - "step": 504, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1765, - "grad_norm": 1.578125, - "learning_rate": 2.330072616113796e-05, - "long_answer_loss": 0.1765, - "loss": 0.1691, - "short_answer_loss": NaN, - "step": 505, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1581, - "grad_norm": 1.515625, - "learning_rate": 2.329292593821383e-05, - "long_answer_loss": 0.1581, - "loss": 0.1593, - "short_answer_loss": NaN, - "step": 506, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1539, - "grad_norm": 1.4296875, - "learning_rate": 2.3285109165291442e-05, - "long_answer_loss": 0.1539, - "loss": 0.161, - "short_answer_loss": NaN, - "step": 507, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1727, - "grad_norm": 1.515625, - "learning_rate": 2.327727585435713e-05, - "long_answer_loss": 0.1727, - "loss": 0.1705, - "short_answer_loss": NaN, - "step": 508, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1661, - "grad_norm": 1.5546875, - "learning_rate": 2.3269426017422576e-05, - "long_answer_loss": 0.1661, - "loss": 0.1679, - "short_answer_loss": NaN, - "step": 509, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1664, - "grad_norm": 1.3984375, - "learning_rate": 2.3261559666524824e-05, - "long_answer_loss": 0.1664, - "loss": 0.16, - "short_answer_loss": NaN, - "step": 510, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1819, - "grad_norm": 1.59375, - "learning_rate": 2.3253676813726218e-05, - "long_answer_loss": 0.1819, - "loss": 0.1691, - "short_answer_loss": NaN, - "step": 511, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1431, - "grad_norm": 1.4453125, - "learning_rate": 2.324577747111442e-05, - "long_answer_loss": 0.1431, - "loss": 0.1634, - "short_answer_loss": NaN, - "step": 512, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.146, - "grad_norm": 1.484375, - "learning_rate": 2.323786165080238e-05, - "long_answer_loss": 0.146, - "loss": 0.1679, - "short_answer_loss": NaN, - "step": 513, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.172, - "grad_norm": 1.6171875, - "learning_rate": 2.3229929364928294e-05, - "long_answer_loss": 0.172, - "loss": 0.173, - "short_answer_loss": NaN, - "step": 514, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1789, - "grad_norm": 1.484375, - "learning_rate": 2.3221980625655632e-05, - "long_answer_loss": 0.1789, - "loss": 0.1587, - "short_answer_loss": NaN, - "step": 515, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1505, - "grad_norm": 1.46875, - "learning_rate": 2.3214015445173083e-05, - "long_answer_loss": 0.1505, - "loss": 0.1606, - "short_answer_loss": NaN, - "step": 516, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1465, - "grad_norm": 1.4609375, - "learning_rate": 2.3206033835694545e-05, - "long_answer_loss": 0.1465, - "loss": 0.163, - "short_answer_loss": NaN, - "step": 517, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1578, - "grad_norm": 1.5390625, - "learning_rate": 2.3198035809459114e-05, - "long_answer_loss": 0.1578, - "loss": 0.1711, - "short_answer_loss": NaN, - "step": 518, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1532, - "grad_norm": 1.453125, - "learning_rate": 2.3190021378731054e-05, - "long_answer_loss": 0.1532, - "loss": 0.156, - "short_answer_loss": NaN, - "step": 519, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1936, - "grad_norm": 1.5078125, - "learning_rate": 2.3181990555799786e-05, - "long_answer_loss": 0.1936, - "loss": 0.1668, - "short_answer_loss": NaN, - "step": 520, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1764, - "grad_norm": 1.78125, - "learning_rate": 2.3173943352979865e-05, - "long_answer_loss": 0.1764, - "loss": 0.1717, - "short_answer_loss": NaN, - "step": 521, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1808, - "grad_norm": 1.5859375, - "learning_rate": 2.3165879782610973e-05, - "long_answer_loss": 0.1808, - "loss": 0.1675, - "short_answer_loss": NaN, - "step": 522, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1686, - "grad_norm": 1.546875, - "learning_rate": 2.3157799857057878e-05, - "long_answer_loss": 0.1686, - "loss": 0.1627, - "short_answer_loss": NaN, - "step": 523, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1803, - "grad_norm": 1.5234375, - "learning_rate": 2.314970358871043e-05, - "long_answer_loss": 0.1803, - "loss": 0.1723, - "short_answer_loss": NaN, - "step": 524, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1508, - "grad_norm": 1.46875, - "learning_rate": 2.314159098998354e-05, - "long_answer_loss": 0.1508, - "loss": 0.1642, - "short_answer_loss": NaN, - "step": 525, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1954, - "grad_norm": 1.625, - "learning_rate": 2.3133462073317174e-05, - "long_answer_loss": 0.1954, - "loss": 0.1791, - "short_answer_loss": NaN, - "step": 526, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1637, - "grad_norm": 1.4921875, - "learning_rate": 2.3125316851176288e-05, - "long_answer_loss": 0.1637, - "loss": 0.1675, - "short_answer_loss": NaN, - "step": 527, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1809, - "grad_norm": 1.5859375, - "learning_rate": 2.3117155336050875e-05, - "long_answer_loss": 0.1809, - "loss": 0.1741, - "short_answer_loss": NaN, - "step": 528, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1594, - "grad_norm": 1.5, - "learning_rate": 2.3108977540455893e-05, - "long_answer_loss": 0.1594, - "loss": 0.1669, - "short_answer_loss": NaN, - "step": 529, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.204, - "grad_norm": 1.4375, - "learning_rate": 2.3100783476931267e-05, - "long_answer_loss": 0.204, - "loss": 0.1645, - "short_answer_loss": NaN, - "step": 530, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1508, - "grad_norm": 1.421875, - "learning_rate": 2.3092573158041873e-05, - "long_answer_loss": 0.1508, - "loss": 0.1678, - "short_answer_loss": NaN, - "step": 531, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1789, - "grad_norm": 1.484375, - "learning_rate": 2.3084346596377505e-05, - "long_answer_loss": 0.1789, - "loss": 0.161, - "short_answer_loss": NaN, - "step": 532, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1675, - "grad_norm": 1.5546875, - "learning_rate": 2.3076103804552872e-05, - "long_answer_loss": 0.1675, - "loss": 0.1665, - "short_answer_loss": NaN, - "step": 533, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1722, - "grad_norm": 1.53125, - "learning_rate": 2.3067844795207565e-05, - "long_answer_loss": 0.1722, - "loss": 0.1775, - "short_answer_loss": NaN, - "step": 534, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1715, - "grad_norm": 1.6484375, - "learning_rate": 2.305956958100605e-05, - "long_answer_loss": 0.1715, - "loss": 0.1688, - "short_answer_loss": NaN, - "step": 535, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.168, - "grad_norm": 1.6015625, - "learning_rate": 2.305127817463763e-05, - "long_answer_loss": 0.168, - "loss": 0.1673, - "short_answer_loss": NaN, - "step": 536, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1516, - "grad_norm": 1.5703125, - "learning_rate": 2.3042970588816445e-05, - "long_answer_loss": 0.1516, - "loss": 0.1723, - "short_answer_loss": NaN, - "step": 537, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1507, - "grad_norm": 1.453125, - "learning_rate": 2.3034646836281447e-05, - "long_answer_loss": 0.1507, - "loss": 0.1544, - "short_answer_loss": NaN, - "step": 538, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1509, - "grad_norm": 1.609375, - "learning_rate": 2.3026306929796374e-05, - "long_answer_loss": 0.1509, - "loss": 0.1665, - "short_answer_loss": NaN, - "step": 539, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1362, - "grad_norm": 1.5703125, - "learning_rate": 2.3017950882149736e-05, - "long_answer_loss": 0.1362, - "loss": 0.1747, - "short_answer_loss": NaN, - "step": 540, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1778, - "grad_norm": 1.5859375, - "learning_rate": 2.3009578706154787e-05, - "long_answer_loss": 0.1778, - "loss": 0.1722, - "short_answer_loss": NaN, - "step": 541, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1641, - "grad_norm": 1.5546875, - "learning_rate": 2.300119041464953e-05, - "long_answer_loss": 0.1641, - "loss": 0.1689, - "short_answer_loss": NaN, - "step": 542, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1651, - "grad_norm": 1.5234375, - "learning_rate": 2.2992786020496665e-05, - "long_answer_loss": 0.1651, - "loss": 0.159, - "short_answer_loss": NaN, - "step": 543, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1431, - "grad_norm": 1.5234375, - "learning_rate": 2.2984365536583585e-05, - "long_answer_loss": 0.1431, - "loss": 0.1627, - "short_answer_loss": NaN, - "step": 544, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1724, - "grad_norm": 1.4453125, - "learning_rate": 2.2975928975822363e-05, - "long_answer_loss": 0.1724, - "loss": 0.1637, - "short_answer_loss": NaN, - "step": 545, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1353, - "grad_norm": 1.3984375, - "learning_rate": 2.2967476351149713e-05, - "long_answer_loss": 0.1353, - "loss": 0.1562, - "short_answer_loss": NaN, - "step": 546, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1846, - "grad_norm": 1.453125, - "learning_rate": 2.2959007675526987e-05, - "long_answer_loss": 0.1846, - "loss": 0.1698, - "short_answer_loss": NaN, - "step": 547, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1545, - "grad_norm": 1.484375, - "learning_rate": 2.2950522961940163e-05, - "long_answer_loss": 0.1545, - "loss": 0.166, - "short_answer_loss": NaN, - "step": 548, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1724, - "grad_norm": 1.46875, - "learning_rate": 2.2942022223399788e-05, - "long_answer_loss": 0.1724, - "loss": 0.1688, - "short_answer_loss": NaN, - "step": 549, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1635, - "grad_norm": 1.46875, - "learning_rate": 2.2933505472940995e-05, - "long_answer_loss": 0.1635, - "loss": 0.158, - "short_answer_loss": NaN, - "step": 550, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1854, - "grad_norm": 1.6875, - "learning_rate": 2.2924972723623474e-05, - "long_answer_loss": 0.1854, - "loss": 0.1712, - "short_answer_loss": NaN, - "step": 551, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1738, - "grad_norm": 1.5546875, - "learning_rate": 2.2916423988531437e-05, - "long_answer_loss": 0.1738, - "loss": 0.1655, - "short_answer_loss": NaN, - "step": 552, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1479, - "grad_norm": 1.4453125, - "learning_rate": 2.2907859280773617e-05, - "long_answer_loss": 0.1479, - "loss": 0.1625, - "short_answer_loss": NaN, - "step": 553, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.161, - "grad_norm": 1.4140625, - "learning_rate": 2.2899278613483232e-05, - "long_answer_loss": 0.161, - "loss": 0.1561, - "short_answer_loss": NaN, - "step": 554, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1328, - "grad_norm": 1.5078125, - "learning_rate": 2.289068199981798e-05, - "long_answer_loss": 0.1328, - "loss": 0.1561, - "short_answer_loss": NaN, - "step": 555, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1908, - "grad_norm": 1.4140625, - "learning_rate": 2.288206945296001e-05, - "long_answer_loss": 0.1908, - "loss": 0.1616, - "short_answer_loss": NaN, - "step": 556, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1642, - "grad_norm": 1.53125, - "learning_rate": 2.2873440986115903e-05, - "long_answer_loss": 0.1642, - "loss": 0.1605, - "short_answer_loss": NaN, - "step": 557, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.181, - "grad_norm": 1.546875, - "learning_rate": 2.2864796612516644e-05, - "long_answer_loss": 0.181, - "loss": 0.1652, - "short_answer_loss": NaN, - "step": 558, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1668, - "grad_norm": 1.46875, - "learning_rate": 2.2856136345417618e-05, - "long_answer_loss": 0.1668, - "loss": 0.1721, - "short_answer_loss": NaN, - "step": 559, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1369, - "grad_norm": 1.515625, - "learning_rate": 2.2847460198098585e-05, - "long_answer_loss": 0.1369, - "loss": 0.1557, - "short_answer_loss": NaN, - "step": 560, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1605, - "grad_norm": 1.46875, - "learning_rate": 2.2838768183863644e-05, - "long_answer_loss": 0.1605, - "loss": 0.1621, - "short_answer_loss": NaN, - "step": 561, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.15, - "grad_norm": 1.4375, - "learning_rate": 2.283006031604123e-05, - "long_answer_loss": 0.15, - "loss": 0.1575, - "short_answer_loss": NaN, - "step": 562, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.177, - "grad_norm": 1.6328125, - "learning_rate": 2.2821336607984095e-05, - "long_answer_loss": 0.177, - "loss": 0.1659, - "short_answer_loss": NaN, - "step": 563, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1767, - "grad_norm": 1.5234375, - "learning_rate": 2.2812597073069274e-05, - "long_answer_loss": 0.1767, - "loss": 0.1588, - "short_answer_loss": NaN, - "step": 564, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1729, - "grad_norm": 1.546875, - "learning_rate": 2.2803841724698065e-05, - "long_answer_loss": 0.1729, - "loss": 0.1653, - "short_answer_loss": NaN, - "step": 565, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1453, - "grad_norm": 1.390625, - "learning_rate": 2.279507057629603e-05, - "long_answer_loss": 0.1453, - "loss": 0.165, - "short_answer_loss": NaN, - "step": 566, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1633, - "grad_norm": 1.5078125, - "learning_rate": 2.278628364131294e-05, - "long_answer_loss": 0.1633, - "loss": 0.1712, - "short_answer_loss": NaN, - "step": 567, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1726, - "grad_norm": 1.5390625, - "learning_rate": 2.277748093322279e-05, - "long_answer_loss": 0.1726, - "loss": 0.1663, - "short_answer_loss": NaN, - "step": 568, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1551, - "grad_norm": 1.4375, - "learning_rate": 2.2768662465523755e-05, - "long_answer_loss": 0.1551, - "loss": 0.1603, - "short_answer_loss": NaN, - "step": 569, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1701, - "grad_norm": 1.390625, - "learning_rate": 2.275982825173817e-05, - "long_answer_loss": 0.1701, - "loss": 0.1638, - "short_answer_loss": NaN, - "step": 570, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1825, - "grad_norm": 1.515625, - "learning_rate": 2.2750978305412528e-05, - "long_answer_loss": 0.1825, - "loss": 0.1686, - "short_answer_loss": NaN, - "step": 571, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1553, - "grad_norm": 1.5390625, - "learning_rate": 2.274211264011744e-05, - "long_answer_loss": 0.1553, - "loss": 0.1675, - "short_answer_loss": NaN, - "step": 572, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.181, - "grad_norm": 1.609375, - "learning_rate": 2.273323126944762e-05, - "long_answer_loss": 0.181, - "loss": 0.1664, - "short_answer_loss": NaN, - "step": 573, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1669, - "grad_norm": 1.59375, - "learning_rate": 2.2724334207021857e-05, - "long_answer_loss": 0.1669, - "loss": 0.1657, - "short_answer_loss": NaN, - "step": 574, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1474, - "grad_norm": 1.5390625, - "learning_rate": 2.271542146648302e-05, - "long_answer_loss": 0.1474, - "loss": 0.1571, - "short_answer_loss": NaN, - "step": 575, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.168, - "grad_norm": 1.484375, - "learning_rate": 2.2706493061498e-05, - "long_answer_loss": 0.168, - "loss": 0.1579, - "short_answer_loss": NaN, - "step": 576, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1424, - "grad_norm": 1.59375, - "learning_rate": 2.2697549005757728e-05, - "long_answer_loss": 0.1424, - "loss": 0.16, - "short_answer_loss": NaN, - "step": 577, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1731, - "grad_norm": 1.46875, - "learning_rate": 2.2688589312977117e-05, - "long_answer_loss": 0.1731, - "loss": 0.1612, - "short_answer_loss": NaN, - "step": 578, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1888, - "grad_norm": 1.484375, - "learning_rate": 2.267961399689506e-05, - "long_answer_loss": 0.1888, - "loss": 0.1614, - "short_answer_loss": NaN, - "step": 579, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1405, - "grad_norm": 1.6875, - "learning_rate": 2.2670623071274423e-05, - "long_answer_loss": 0.1405, - "loss": 0.1643, - "short_answer_loss": NaN, - "step": 580, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1466, - "grad_norm": 1.4609375, - "learning_rate": 2.2661616549901982e-05, - "long_answer_loss": 0.1466, - "loss": 0.1608, - "short_answer_loss": NaN, - "step": 581, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1548, - "grad_norm": 1.3984375, - "learning_rate": 2.2652594446588456e-05, - "long_answer_loss": 0.1548, - "loss": 0.1601, - "short_answer_loss": NaN, - "step": 582, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1799, - "grad_norm": 1.5546875, - "learning_rate": 2.264355677516843e-05, - "long_answer_loss": 0.1799, - "loss": 0.1651, - "short_answer_loss": NaN, - "step": 583, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1587, - "grad_norm": 1.453125, - "learning_rate": 2.263450354950038e-05, - "long_answer_loss": 0.1587, - "loss": 0.161, - "short_answer_loss": NaN, - "step": 584, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1888, - "grad_norm": 1.53125, - "learning_rate": 2.262543478346663e-05, - "long_answer_loss": 0.1888, - "loss": 0.17, - "short_answer_loss": NaN, - "step": 585, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1487, - "grad_norm": 1.546875, - "learning_rate": 2.2616350490973326e-05, - "long_answer_loss": 0.1487, - "loss": 0.1602, - "short_answer_loss": NaN, - "step": 586, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1764, - "grad_norm": 1.5, - "learning_rate": 2.2607250685950435e-05, - "long_answer_loss": 0.1764, - "loss": 0.1652, - "short_answer_loss": NaN, - "step": 587, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1787, - "grad_norm": 1.515625, - "learning_rate": 2.2598135382351698e-05, - "long_answer_loss": 0.1787, - "loss": 0.1636, - "short_answer_loss": NaN, - "step": 588, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1398, - "grad_norm": 1.625, - "learning_rate": 2.2589004594154633e-05, - "long_answer_loss": 0.1398, - "loss": 0.1544, - "short_answer_loss": NaN, - "step": 589, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1665, - "grad_norm": 1.453125, - "learning_rate": 2.2579858335360492e-05, - "long_answer_loss": 0.1665, - "loss": 0.168, - "short_answer_loss": NaN, - "step": 590, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1713, - "grad_norm": 1.5, - "learning_rate": 2.2570696619994253e-05, - "long_answer_loss": 0.1713, - "loss": 0.1672, - "short_answer_loss": NaN, - "step": 591, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1436, - "grad_norm": 1.421875, - "learning_rate": 2.2561519462104604e-05, - "long_answer_loss": 0.1436, - "loss": 0.16, - "short_answer_loss": NaN, - "step": 592, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1525, - "grad_norm": 1.53125, - "learning_rate": 2.25523268757639e-05, - "long_answer_loss": 0.1525, - "loss": 0.1584, - "short_answer_loss": NaN, - "step": 593, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1666, - "grad_norm": 1.40625, - "learning_rate": 2.2543118875068166e-05, - "long_answer_loss": 0.1666, - "loss": 0.161, - "short_answer_loss": NaN, - "step": 594, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1843, - "grad_norm": 1.359375, - "learning_rate": 2.2533895474137047e-05, - "long_answer_loss": 0.1843, - "loss": 0.1619, - "short_answer_loss": NaN, - "step": 595, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1571, - "grad_norm": 1.359375, - "learning_rate": 2.2524656687113822e-05, - "long_answer_loss": 0.1571, - "loss": 0.1593, - "short_answer_loss": NaN, - "step": 596, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1842, - "grad_norm": 1.4921875, - "learning_rate": 2.251540252816535e-05, - "long_answer_loss": 0.1842, - "loss": 0.1646, - "short_answer_loss": NaN, - "step": 597, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1378, - "grad_norm": 1.46875, - "learning_rate": 2.2506133011482075e-05, - "long_answer_loss": 0.1378, - "loss": 0.1564, - "short_answer_loss": NaN, - "step": 598, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1795, - "grad_norm": 1.609375, - "learning_rate": 2.2496848151277973e-05, - "long_answer_loss": 0.1795, - "loss": 0.1634, - "short_answer_loss": NaN, - "step": 599, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.182, - "grad_norm": 1.5703125, - "learning_rate": 2.2487547961790556e-05, - "long_answer_loss": 0.182, - "loss": 0.1685, - "short_answer_loss": NaN, - "step": 600, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1688, - "grad_norm": 1.5, - "learning_rate": 2.2478232457280845e-05, - "long_answer_loss": 0.1688, - "loss": 0.1604, - "short_answer_loss": NaN, - "step": 601, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.135, - "grad_norm": 1.3984375, - "learning_rate": 2.2468901652033346e-05, - "long_answer_loss": 0.135, - "loss": 0.153, - "short_answer_loss": NaN, - "step": 602, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1504, - "grad_norm": 1.46875, - "learning_rate": 2.2459555560356023e-05, - "long_answer_loss": 0.1504, - "loss": 0.1553, - "short_answer_loss": NaN, - "step": 603, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1645, - "grad_norm": 1.6171875, - "learning_rate": 2.2450194196580278e-05, - "long_answer_loss": 0.1645, - "loss": 0.1645, - "short_answer_loss": NaN, - "step": 604, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1687, - "grad_norm": 1.421875, - "learning_rate": 2.244081757506094e-05, - "long_answer_loss": 0.1687, - "loss": 0.157, - "short_answer_loss": NaN, - "step": 605, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1474, - "grad_norm": 1.421875, - "learning_rate": 2.2431425710176226e-05, - "long_answer_loss": 0.1474, - "loss": 0.1563, - "short_answer_loss": NaN, - "step": 606, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1731, - "grad_norm": 1.4453125, - "learning_rate": 2.2422018616327734e-05, - "long_answer_loss": 0.1731, - "loss": 0.1666, - "short_answer_loss": NaN, - "step": 607, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1654, - "grad_norm": 1.53125, - "learning_rate": 2.241259630794041e-05, - "long_answer_loss": 0.1654, - "loss": 0.1572, - "short_answer_loss": NaN, - "step": 608, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1783, - "grad_norm": 1.6015625, - "learning_rate": 2.2403158799462524e-05, - "long_answer_loss": 0.1783, - "loss": 0.1667, - "short_answer_loss": NaN, - "step": 609, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1855, - "grad_norm": 1.4765625, - "learning_rate": 2.239370610536568e-05, - "long_answer_loss": 0.1855, - "loss": 0.1664, - "short_answer_loss": NaN, - "step": 610, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.204, - "grad_norm": 1.4609375, - "learning_rate": 2.238423824014473e-05, - "long_answer_loss": 0.204, - "loss": 0.1666, - "short_answer_loss": NaN, - "step": 611, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1617, - "grad_norm": 1.484375, - "learning_rate": 2.2374755218317817e-05, - "long_answer_loss": 0.1617, - "loss": 0.1639, - "short_answer_loss": NaN, - "step": 612, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1845, - "grad_norm": 1.53125, - "learning_rate": 2.2365257054426315e-05, - "long_answer_loss": 0.1845, - "loss": 0.165, - "short_answer_loss": NaN, - "step": 613, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1409, - "grad_norm": 1.3984375, - "learning_rate": 2.2355743763034825e-05, - "long_answer_loss": 0.1409, - "loss": 0.1524, - "short_answer_loss": NaN, - "step": 614, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1681, - "grad_norm": 1.4453125, - "learning_rate": 2.234621535873113e-05, - "long_answer_loss": 0.1681, - "loss": 0.1571, - "short_answer_loss": NaN, - "step": 615, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1709, - "grad_norm": 1.390625, - "learning_rate": 2.23366718561262e-05, - "long_answer_loss": 0.1709, - "loss": 0.1564, - "short_answer_loss": NaN, - "step": 616, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1757, - "grad_norm": 1.515625, - "learning_rate": 2.2327113269854154e-05, - "long_answer_loss": 0.1757, - "loss": 0.1657, - "short_answer_loss": NaN, - "step": 617, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1609, - "grad_norm": 1.5390625, - "learning_rate": 2.231753961457224e-05, - "long_answer_loss": 0.1609, - "loss": 0.1656, - "short_answer_loss": NaN, - "step": 618, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1462, - "grad_norm": 1.3671875, - "learning_rate": 2.2307950904960813e-05, - "long_answer_loss": 0.1462, - "loss": 0.1584, - "short_answer_loss": NaN, - "step": 619, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1626, - "grad_norm": 1.453125, - "learning_rate": 2.2298347155723302e-05, - "long_answer_loss": 0.1626, - "loss": 0.1523, - "short_answer_loss": NaN, - "step": 620, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1791, - "grad_norm": 1.5, - "learning_rate": 2.2288728381586224e-05, - "long_answer_loss": 0.1791, - "loss": 0.1635, - "short_answer_loss": NaN, - "step": 621, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.146, - "grad_norm": 1.453125, - "learning_rate": 2.2279094597299108e-05, - "long_answer_loss": 0.146, - "loss": 0.1553, - "short_answer_loss": NaN, - "step": 622, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1659, - "grad_norm": 1.375, - "learning_rate": 2.2269445817634514e-05, - "long_answer_loss": 0.1659, - "loss": 0.1549, - "short_answer_loss": NaN, - "step": 623, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1679, - "grad_norm": 1.4296875, - "learning_rate": 2.2259782057387994e-05, - "long_answer_loss": 0.1679, - "loss": 0.1658, - "short_answer_loss": NaN, - "step": 624, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.188, - "grad_norm": 1.4375, - "learning_rate": 2.2250103331378067e-05, - "long_answer_loss": 0.188, - "loss": 0.158, - "short_answer_loss": NaN, - "step": 625, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1569, - "grad_norm": 1.3984375, - "learning_rate": 2.224040965444621e-05, - "long_answer_loss": 0.1569, - "loss": 0.158, - "short_answer_loss": NaN, - "step": 626, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1536, - "grad_norm": 1.421875, - "learning_rate": 2.2230701041456814e-05, - "long_answer_loss": 0.1536, - "loss": 0.1516, - "short_answer_loss": NaN, - "step": 627, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1335, - "grad_norm": 1.4140625, - "learning_rate": 2.222097750729718e-05, - "long_answer_loss": 0.1335, - "loss": 0.1654, - "short_answer_loss": NaN, - "step": 628, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.17, - "grad_norm": 1.4453125, - "learning_rate": 2.221123906687749e-05, - "long_answer_loss": 0.17, - "loss": 0.1615, - "short_answer_loss": NaN, - "step": 629, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.182, - "grad_norm": 1.3671875, - "learning_rate": 2.2201485735130787e-05, - "long_answer_loss": 0.182, - "loss": 0.164, - "short_answer_loss": NaN, - "step": 630, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1433, - "grad_norm": 1.453125, - "learning_rate": 2.2191717527012935e-05, - "long_answer_loss": 0.1433, - "loss": 0.1515, - "short_answer_loss": NaN, - "step": 631, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1535, - "grad_norm": 1.4140625, - "learning_rate": 2.2181934457502622e-05, - "long_answer_loss": 0.1535, - "loss": 0.155, - "short_answer_loss": NaN, - "step": 632, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1635, - "grad_norm": 1.453125, - "learning_rate": 2.2172136541601322e-05, - "long_answer_loss": 0.1635, - "loss": 0.1578, - "short_answer_loss": NaN, - "step": 633, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1698, - "grad_norm": 1.46875, - "learning_rate": 2.216232379433327e-05, - "long_answer_loss": 0.1698, - "loss": 0.1715, - "short_answer_loss": NaN, - "step": 634, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1945, - "grad_norm": 1.3828125, - "learning_rate": 2.2152496230745447e-05, - "long_answer_loss": 0.1945, - "loss": 0.1623, - "short_answer_loss": NaN, - "step": 635, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1695, - "grad_norm": 1.46875, - "learning_rate": 2.2142653865907557e-05, - "long_answer_loss": 0.1695, - "loss": 0.161, - "short_answer_loss": NaN, - "step": 636, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.163, - "grad_norm": 1.4375, - "learning_rate": 2.2132796714911998e-05, - "long_answer_loss": 0.163, - "loss": 0.1615, - "short_answer_loss": NaN, - "step": 637, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1437, - "grad_norm": 1.4375, - "learning_rate": 2.2122924792873827e-05, - "long_answer_loss": 0.1437, - "loss": 0.1662, - "short_answer_loss": NaN, - "step": 638, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1582, - "grad_norm": 1.4921875, - "learning_rate": 2.211303811493078e-05, - "long_answer_loss": 0.1582, - "loss": 0.1603, - "short_answer_loss": NaN, - "step": 639, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1522, - "grad_norm": 1.4765625, - "learning_rate": 2.2103136696243197e-05, - "long_answer_loss": 0.1522, - "loss": 0.1593, - "short_answer_loss": NaN, - "step": 640, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1626, - "grad_norm": 1.375, - "learning_rate": 2.2093220551994033e-05, - "long_answer_loss": 0.1626, - "loss": 0.1483, - "short_answer_loss": NaN, - "step": 641, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1684, - "grad_norm": 1.4296875, - "learning_rate": 2.2083289697388808e-05, - "long_answer_loss": 0.1684, - "loss": 0.1642, - "short_answer_loss": NaN, - "step": 642, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1512, - "grad_norm": 1.5078125, - "learning_rate": 2.207334414765562e-05, - "long_answer_loss": 0.1512, - "loss": 0.1474, - "short_answer_loss": NaN, - "step": 643, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1601, - "grad_norm": 1.40625, - "learning_rate": 2.2063383918045092e-05, - "long_answer_loss": 0.1601, - "loss": 0.1625, - "short_answer_loss": NaN, - "step": 644, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1868, - "grad_norm": 1.46875, - "learning_rate": 2.2053409023830353e-05, - "long_answer_loss": 0.1868, - "loss": 0.1692, - "short_answer_loss": NaN, - "step": 645, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1715, - "grad_norm": 1.4921875, - "learning_rate": 2.204341948030702e-05, - "long_answer_loss": 0.1715, - "loss": 0.1586, - "short_answer_loss": NaN, - "step": 646, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1579, - "grad_norm": 1.453125, - "learning_rate": 2.2033415302793173e-05, - "long_answer_loss": 0.1579, - "loss": 0.1539, - "short_answer_loss": NaN, - "step": 647, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1937, - "grad_norm": 1.4375, - "learning_rate": 2.202339650662934e-05, - "long_answer_loss": 0.1937, - "loss": 0.1533, - "short_answer_loss": NaN, - "step": 648, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1718, - "grad_norm": 1.546875, - "learning_rate": 2.2013363107178454e-05, - "long_answer_loss": 0.1718, - "loss": 0.1625, - "short_answer_loss": NaN, - "step": 649, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1678, - "grad_norm": 1.46875, - "learning_rate": 2.2003315119825856e-05, - "long_answer_loss": 0.1678, - "loss": 0.1586, - "short_answer_loss": NaN, - "step": 650, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1645, - "grad_norm": 1.5625, - "learning_rate": 2.199325255997923e-05, - "long_answer_loss": 0.1645, - "loss": 0.1689, - "short_answer_loss": NaN, - "step": 651, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1483, - "grad_norm": 1.421875, - "learning_rate": 2.1983175443068645e-05, - "long_answer_loss": 0.1483, - "loss": 0.1637, - "short_answer_loss": NaN, - "step": 652, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1693, - "grad_norm": 1.5546875, - "learning_rate": 2.1973083784546454e-05, - "long_answer_loss": 0.1693, - "loss": 0.1675, - "short_answer_loss": NaN, - "step": 653, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1406, - "grad_norm": 1.3359375, - "learning_rate": 2.1962977599887324e-05, - "long_answer_loss": 0.1406, - "loss": 0.1513, - "short_answer_loss": NaN, - "step": 654, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1466, - "grad_norm": 1.4765625, - "learning_rate": 2.19528569045882e-05, - "long_answer_loss": 0.1466, - "loss": 0.1656, - "short_answer_loss": NaN, - "step": 655, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1671, - "grad_norm": 1.484375, - "learning_rate": 2.1942721714168274e-05, - "long_answer_loss": 0.1671, - "loss": 0.1652, - "short_answer_loss": NaN, - "step": 656, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1676, - "grad_norm": 1.46875, - "learning_rate": 2.1932572044168964e-05, - "long_answer_loss": 0.1676, - "loss": 0.1648, - "short_answer_loss": NaN, - "step": 657, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1671, - "grad_norm": 1.4609375, - "learning_rate": 2.1922407910153895e-05, - "long_answer_loss": 0.1671, - "loss": 0.1618, - "short_answer_loss": NaN, - "step": 658, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1679, - "grad_norm": 1.46875, - "learning_rate": 2.191222932770886e-05, - "long_answer_loss": 0.1679, - "loss": 0.1688, - "short_answer_loss": NaN, - "step": 659, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1853, - "grad_norm": 1.484375, - "learning_rate": 2.1902036312441824e-05, - "long_answer_loss": 0.1853, - "loss": 0.1664, - "short_answer_loss": NaN, - "step": 660, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1532, - "grad_norm": 1.4140625, - "learning_rate": 2.1891828879982877e-05, - "long_answer_loss": 0.1532, - "loss": 0.1532, - "short_answer_loss": NaN, - "step": 661, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1579, - "grad_norm": 1.5, - "learning_rate": 2.1881607045984202e-05, - "long_answer_loss": 0.1579, - "loss": 0.1651, - "short_answer_loss": NaN, - "step": 662, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1708, - "grad_norm": 1.421875, - "learning_rate": 2.1871370826120093e-05, - "long_answer_loss": 0.1708, - "loss": 0.1526, - "short_answer_loss": NaN, - "step": 663, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1641, - "grad_norm": 1.46875, - "learning_rate": 2.186112023608688e-05, - "long_answer_loss": 0.1641, - "loss": 0.1633, - "short_answer_loss": NaN, - "step": 664, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1567, - "grad_norm": 1.5234375, - "learning_rate": 2.1850855291602942e-05, - "long_answer_loss": 0.1567, - "loss": 0.1582, - "short_answer_loss": NaN, - "step": 665, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1713, - "grad_norm": 1.53125, - "learning_rate": 2.184057600840866e-05, - "long_answer_loss": 0.1713, - "loss": 0.1637, - "short_answer_loss": NaN, - "step": 666, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1682, - "grad_norm": 1.4921875, - "learning_rate": 2.1830282402266407e-05, - "long_answer_loss": 0.1682, - "loss": 0.1545, - "short_answer_loss": NaN, - "step": 667, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1714, - "grad_norm": 1.59375, - "learning_rate": 2.181997448896052e-05, - "long_answer_loss": 0.1714, - "loss": 0.1571, - "short_answer_loss": NaN, - "step": 668, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1608, - "grad_norm": 1.5234375, - "learning_rate": 2.1809652284297275e-05, - "long_answer_loss": 0.1608, - "loss": 0.1614, - "short_answer_loss": NaN, - "step": 669, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1385, - "grad_norm": 1.6015625, - "learning_rate": 2.1799315804104858e-05, - "long_answer_loss": 0.1385, - "loss": 0.1701, - "short_answer_loss": NaN, - "step": 670, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1628, - "grad_norm": 1.53125, - "learning_rate": 2.1788965064233346e-05, - "long_answer_loss": 0.1628, - "loss": 0.1674, - "short_answer_loss": NaN, - "step": 671, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1639, - "grad_norm": 1.3984375, - "learning_rate": 2.177860008055469e-05, - "long_answer_loss": 0.1639, - "loss": 0.1571, - "short_answer_loss": NaN, - "step": 672, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.143, - "grad_norm": 1.421875, - "learning_rate": 2.1768220868962675e-05, - "long_answer_loss": 0.143, - "loss": 0.1505, - "short_answer_loss": NaN, - "step": 673, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1511, - "grad_norm": 1.3828125, - "learning_rate": 2.1757827445372896e-05, - "long_answer_loss": 0.1511, - "loss": 0.1514, - "short_answer_loss": NaN, - "step": 674, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1556, - "grad_norm": 1.484375, - "learning_rate": 2.174741982572276e-05, - "long_answer_loss": 0.1556, - "loss": 0.1587, - "short_answer_loss": NaN, - "step": 675, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.154, - "grad_norm": 1.4375, - "learning_rate": 2.1736998025971433e-05, - "long_answer_loss": 0.154, - "loss": 0.1506, - "short_answer_loss": NaN, - "step": 676, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1721, - "grad_norm": 1.4609375, - "learning_rate": 2.1726562062099816e-05, - "long_answer_loss": 0.1721, - "loss": 0.1676, - "short_answer_loss": NaN, - "step": 677, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1666, - "grad_norm": 1.3828125, - "learning_rate": 2.1716111950110545e-05, - "long_answer_loss": 0.1666, - "loss": 0.172, - "short_answer_loss": NaN, - "step": 678, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1712, - "grad_norm": 1.4453125, - "learning_rate": 2.1705647706027938e-05, - "long_answer_loss": 0.1712, - "loss": 0.1585, - "short_answer_loss": NaN, - "step": 679, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1519, - "grad_norm": 1.53125, - "learning_rate": 2.1695169345897993e-05, - "long_answer_loss": 0.1519, - "loss": 0.1538, - "short_answer_loss": NaN, - "step": 680, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1544, - "grad_norm": 1.6484375, - "learning_rate": 2.168467688578834e-05, - "long_answer_loss": 0.1544, - "loss": 0.1557, - "short_answer_loss": NaN, - "step": 681, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1447, - "grad_norm": 1.3671875, - "learning_rate": 2.167417034178825e-05, - "long_answer_loss": 0.1447, - "loss": 0.1622, - "short_answer_loss": NaN, - "step": 682, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1723, - "grad_norm": 1.4609375, - "learning_rate": 2.166364973000858e-05, - "long_answer_loss": 0.1723, - "loss": 0.169, - "short_answer_loss": NaN, - "step": 683, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1506, - "grad_norm": 1.3671875, - "learning_rate": 2.1653115066581752e-05, - "long_answer_loss": 0.1506, - "loss": 0.1597, - "short_answer_loss": NaN, - "step": 684, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1669, - "grad_norm": 1.4140625, - "learning_rate": 2.1642566367661744e-05, - "long_answer_loss": 0.1669, - "loss": 0.1546, - "short_answer_loss": NaN, - "step": 685, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1499, - "grad_norm": 1.421875, - "learning_rate": 2.1632003649424054e-05, - "long_answer_loss": 0.1499, - "loss": 0.1492, - "short_answer_loss": NaN, - "step": 686, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1791, - "grad_norm": 1.484375, - "learning_rate": 2.162142692806568e-05, - "long_answer_loss": 0.1791, - "loss": 0.161, - "short_answer_loss": NaN, - "step": 687, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1576, - "grad_norm": 1.4140625, - "learning_rate": 2.1610836219805085e-05, - "long_answer_loss": 0.1576, - "loss": 0.1541, - "short_answer_loss": NaN, - "step": 688, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1531, - "grad_norm": 1.3828125, - "learning_rate": 2.1600231540882184e-05, - "long_answer_loss": 0.1531, - "loss": 0.155, - "short_answer_loss": NaN, - "step": 689, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1639, - "grad_norm": 1.5078125, - "learning_rate": 2.158961290755832e-05, - "long_answer_loss": 0.1639, - "loss": 0.1626, - "short_answer_loss": NaN, - "step": 690, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.166, - "grad_norm": 1.421875, - "learning_rate": 2.1578980336116226e-05, - "long_answer_loss": 0.166, - "loss": 0.1628, - "short_answer_loss": NaN, - "step": 691, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1709, - "grad_norm": 1.4921875, - "learning_rate": 2.1568333842860007e-05, - "long_answer_loss": 0.1709, - "loss": 0.1614, - "short_answer_loss": NaN, - "step": 692, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1704, - "grad_norm": 1.421875, - "learning_rate": 2.1557673444115127e-05, - "long_answer_loss": 0.1704, - "loss": 0.1557, - "short_answer_loss": NaN, - "step": 693, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1628, - "grad_norm": 1.3515625, - "learning_rate": 2.1546999156228366e-05, - "long_answer_loss": 0.1628, - "loss": 0.1547, - "short_answer_loss": NaN, - "step": 694, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1591, - "grad_norm": 1.375, - "learning_rate": 2.1536310995567794e-05, - "long_answer_loss": 0.1591, - "loss": 0.1561, - "short_answer_loss": NaN, - "step": 695, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1692, - "grad_norm": 1.3984375, - "learning_rate": 2.152560897852276e-05, - "long_answer_loss": 0.1692, - "loss": 0.1617, - "short_answer_loss": NaN, - "step": 696, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1663, - "grad_norm": 1.3984375, - "learning_rate": 2.151489312150387e-05, - "long_answer_loss": 0.1663, - "loss": 0.1543, - "short_answer_loss": NaN, - "step": 697, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1695, - "grad_norm": 1.421875, - "learning_rate": 2.150416344094294e-05, - "long_answer_loss": 0.1695, - "loss": 0.1591, - "short_answer_loss": NaN, - "step": 698, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1523, - "grad_norm": 1.3515625, - "learning_rate": 2.149341995329299e-05, - "long_answer_loss": 0.1523, - "loss": 0.1537, - "short_answer_loss": NaN, - "step": 699, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1699, - "grad_norm": 1.4140625, - "learning_rate": 2.14826626750282e-05, - "long_answer_loss": 0.1699, - "loss": 0.1606, - "short_answer_loss": NaN, - "step": 700, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1369, - "grad_norm": 1.515625, - "learning_rate": 2.147189162264391e-05, - "long_answer_loss": 0.1369, - "loss": 0.1605, - "short_answer_loss": NaN, - "step": 701, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1477, - "grad_norm": 1.3828125, - "learning_rate": 2.1461106812656583e-05, - "long_answer_loss": 0.1477, - "loss": 0.1572, - "short_answer_loss": NaN, - "step": 702, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1557, - "grad_norm": 1.3984375, - "learning_rate": 2.145030826160377e-05, - "long_answer_loss": 0.1557, - "loss": 0.1486, - "short_answer_loss": NaN, - "step": 703, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1728, - "grad_norm": 1.5390625, - "learning_rate": 2.1439495986044088e-05, - "long_answer_loss": 0.1728, - "loss": 0.1681, - "short_answer_loss": NaN, - "step": 704, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1508, - "grad_norm": 1.4375, - "learning_rate": 2.142867000255721e-05, - "long_answer_loss": 0.1508, - "loss": 0.153, - "short_answer_loss": NaN, - "step": 705, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1616, - "grad_norm": 1.4296875, - "learning_rate": 2.141783032774383e-05, - "long_answer_loss": 0.1616, - "loss": 0.1578, - "short_answer_loss": NaN, - "step": 706, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1402, - "grad_norm": 1.4765625, - "learning_rate": 2.1406976978225623e-05, - "long_answer_loss": 0.1402, - "loss": 0.1515, - "short_answer_loss": NaN, - "step": 707, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1611, - "grad_norm": 1.4375, - "learning_rate": 2.139610997064525e-05, - "long_answer_loss": 0.1611, - "loss": 0.1528, - "short_answer_loss": NaN, - "step": 708, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1608, - "grad_norm": 1.4296875, - "learning_rate": 2.1385229321666304e-05, - "long_answer_loss": 0.1608, - "loss": 0.1584, - "short_answer_loss": NaN, - "step": 709, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1673, - "grad_norm": 1.6015625, - "learning_rate": 2.1374335047973292e-05, - "long_answer_loss": 0.1673, - "loss": 0.1642, - "short_answer_loss": NaN, - "step": 710, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1603, - "grad_norm": 1.53125, - "learning_rate": 2.1363427166271632e-05, - "long_answer_loss": 0.1603, - "loss": 0.1639, - "short_answer_loss": NaN, - "step": 711, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1876, - "grad_norm": 1.40625, - "learning_rate": 2.1352505693287587e-05, - "long_answer_loss": 0.1876, - "loss": 0.1615, - "short_answer_loss": NaN, - "step": 712, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1583, - "grad_norm": 1.4609375, - "learning_rate": 2.1341570645768273e-05, - "long_answer_loss": 0.1583, - "loss": 0.1627, - "short_answer_loss": NaN, - "step": 713, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1504, - "grad_norm": 1.4375, - "learning_rate": 2.1330622040481624e-05, - "long_answer_loss": 0.1504, - "loss": 0.1586, - "short_answer_loss": NaN, - "step": 714, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1234, - "grad_norm": 1.375, - "learning_rate": 2.1319659894216355e-05, - "long_answer_loss": 0.1234, - "loss": 0.1463, - "short_answer_loss": NaN, - "step": 715, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1471, - "grad_norm": 1.4453125, - "learning_rate": 2.1308684223781945e-05, - "long_answer_loss": 0.1471, - "loss": 0.1562, - "short_answer_loss": NaN, - "step": 716, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1731, - "grad_norm": 1.5859375, - "learning_rate": 2.129769504600862e-05, - "long_answer_loss": 0.1731, - "loss": 0.1657, - "short_answer_loss": NaN, - "step": 717, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1534, - "grad_norm": 1.390625, - "learning_rate": 2.1286692377747315e-05, - "long_answer_loss": 0.1534, - "loss": 0.1529, - "short_answer_loss": NaN, - "step": 718, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1623, - "grad_norm": 1.546875, - "learning_rate": 2.1275676235869644e-05, - "long_answer_loss": 0.1623, - "loss": 0.1592, - "short_answer_loss": NaN, - "step": 719, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1562, - "grad_norm": 1.421875, - "learning_rate": 2.1264646637267886e-05, - "long_answer_loss": 0.1562, - "loss": 0.1566, - "short_answer_loss": NaN, - "step": 720, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1464, - "grad_norm": 1.4921875, - "learning_rate": 2.1253603598854964e-05, - "long_answer_loss": 0.1464, - "loss": 0.1647, - "short_answer_loss": NaN, - "step": 721, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1665, - "grad_norm": 1.3515625, - "learning_rate": 2.12425471375644e-05, - "long_answer_loss": 0.1665, - "loss": 0.154, - "short_answer_loss": NaN, - "step": 722, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1601, - "grad_norm": 1.4453125, - "learning_rate": 2.1231477270350293e-05, - "long_answer_loss": 0.1601, - "loss": 0.1509, - "short_answer_loss": NaN, - "step": 723, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1454, - "grad_norm": 1.453125, - "learning_rate": 2.1220394014187312e-05, - "long_answer_loss": 0.1454, - "loss": 0.1603, - "short_answer_loss": NaN, - "step": 724, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.164, - "grad_norm": 1.421875, - "learning_rate": 2.1209297386070647e-05, - "long_answer_loss": 0.164, - "loss": 0.1586, - "short_answer_loss": NaN, - "step": 725, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1379, - "grad_norm": 1.359375, - "learning_rate": 2.1198187403016e-05, - "long_answer_loss": 0.1379, - "loss": 0.1564, - "short_answer_loss": NaN, - "step": 726, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1616, - "grad_norm": 1.390625, - "learning_rate": 2.118706408205955e-05, - "long_answer_loss": 0.1616, - "loss": 0.1562, - "short_answer_loss": NaN, - "step": 727, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1567, - "grad_norm": 1.453125, - "learning_rate": 2.1175927440257926e-05, - "long_answer_loss": 0.1567, - "loss": 0.1581, - "short_answer_loss": NaN, - "step": 728, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1605, - "grad_norm": 1.5625, - "learning_rate": 2.1164777494688178e-05, - "long_answer_loss": 0.1605, - "loss": 0.1655, - "short_answer_loss": NaN, - "step": 729, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1674, - "grad_norm": 1.453125, - "learning_rate": 2.115361426244777e-05, - "long_answer_loss": 0.1674, - "loss": 0.1546, - "short_answer_loss": NaN, - "step": 730, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.168, - "grad_norm": 1.375, - "learning_rate": 2.114243776065453e-05, - "long_answer_loss": 0.168, - "loss": 0.1618, - "short_answer_loss": NaN, - "step": 731, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1741, - "grad_norm": 1.4921875, - "learning_rate": 2.1131248006446635e-05, - "long_answer_loss": 0.1741, - "loss": 0.1667, - "short_answer_loss": NaN, - "step": 732, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.173, - "grad_norm": 1.46875, - "learning_rate": 2.1120045016982585e-05, - "long_answer_loss": 0.173, - "loss": 0.1653, - "short_answer_loss": NaN, - "step": 733, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1371, - "grad_norm": 1.4140625, - "learning_rate": 2.110882880944117e-05, - "long_answer_loss": 0.1371, - "loss": 0.151, - "short_answer_loss": NaN, - "step": 734, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1731, - "grad_norm": 1.46875, - "learning_rate": 2.109759940102146e-05, - "long_answer_loss": 0.1731, - "loss": 0.1581, - "short_answer_loss": NaN, - "step": 735, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1223, - "grad_norm": 1.3203125, - "learning_rate": 2.1086356808942758e-05, - "long_answer_loss": 0.1223, - "loss": 0.1529, - "short_answer_loss": NaN, - "step": 736, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1406, - "grad_norm": 1.4453125, - "learning_rate": 2.1075101050444583e-05, - "long_answer_loss": 0.1406, - "loss": 0.1555, - "short_answer_loss": NaN, - "step": 737, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1444, - "grad_norm": 1.53125, - "learning_rate": 2.1063832142786652e-05, - "long_answer_loss": 0.1444, - "loss": 0.1646, - "short_answer_loss": NaN, - "step": 738, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1632, - "grad_norm": 1.5625, - "learning_rate": 2.1052550103248836e-05, - "long_answer_loss": 0.1632, - "loss": 0.1588, - "short_answer_loss": NaN, - "step": 739, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.153, - "grad_norm": 1.4609375, - "learning_rate": 2.1041254949131143e-05, - "long_answer_loss": 0.153, - "loss": 0.1566, - "short_answer_loss": NaN, - "step": 740, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1593, - "grad_norm": 1.390625, - "learning_rate": 2.1029946697753693e-05, - "long_answer_loss": 0.1593, - "loss": 0.1514, - "short_answer_loss": NaN, - "step": 741, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1732, - "grad_norm": 1.3984375, - "learning_rate": 2.10186253664567e-05, - "long_answer_loss": 0.1732, - "loss": 0.1586, - "short_answer_loss": NaN, - "step": 742, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1384, - "grad_norm": 1.3203125, - "learning_rate": 2.1007290972600415e-05, - "long_answer_loss": 0.1384, - "loss": 0.1427, - "short_answer_loss": NaN, - "step": 743, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1521, - "grad_norm": 1.3828125, - "learning_rate": 2.0995943533565136e-05, - "long_answer_loss": 0.1521, - "loss": 0.1532, - "short_answer_loss": NaN, - "step": 744, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1557, - "grad_norm": 1.5234375, - "learning_rate": 2.0984583066751152e-05, - "long_answer_loss": 0.1557, - "loss": 0.1563, - "short_answer_loss": NaN, - "step": 745, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1462, - "grad_norm": 1.40625, - "learning_rate": 2.0973209589578742e-05, - "long_answer_loss": 0.1462, - "loss": 0.1545, - "short_answer_loss": NaN, - "step": 746, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1669, - "grad_norm": 1.4140625, - "learning_rate": 2.0961823119488115e-05, - "long_answer_loss": 0.1669, - "loss": 0.1576, - "short_answer_loss": NaN, - "step": 747, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1665, - "grad_norm": 1.4453125, - "learning_rate": 2.0950423673939435e-05, - "long_answer_loss": 0.1665, - "loss": 0.1636, - "short_answer_loss": NaN, - "step": 748, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1561, - "grad_norm": 1.46875, - "learning_rate": 2.0939011270412735e-05, - "long_answer_loss": 0.1561, - "loss": 0.151, - "short_answer_loss": NaN, - "step": 749, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.173, - "grad_norm": 1.3671875, - "learning_rate": 2.092758592640793e-05, - "long_answer_loss": 0.173, - "loss": 0.1494, - "short_answer_loss": NaN, - "step": 750, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1618, - "grad_norm": 1.375, - "learning_rate": 2.0916147659444768e-05, - "long_answer_loss": 0.1618, - "loss": 0.1595, - "short_answer_loss": NaN, - "step": 751, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.139, - "grad_norm": 1.359375, - "learning_rate": 2.090469648706283e-05, - "long_answer_loss": 0.139, - "loss": 0.1557, - "short_answer_loss": NaN, - "step": 752, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1761, - "grad_norm": 1.390625, - "learning_rate": 2.089323242682147e-05, - "long_answer_loss": 0.1761, - "loss": 0.1582, - "short_answer_loss": NaN, - "step": 753, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1327, - "grad_norm": 1.4296875, - "learning_rate": 2.0881755496299817e-05, - "long_answer_loss": 0.1327, - "loss": 0.1499, - "short_answer_loss": NaN, - "step": 754, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.16, - "grad_norm": 1.453125, - "learning_rate": 2.0870265713096726e-05, - "long_answer_loss": 0.16, - "loss": 0.1573, - "short_answer_loss": NaN, - "step": 755, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1353, - "grad_norm": 1.4921875, - "learning_rate": 2.085876309483077e-05, - "long_answer_loss": 0.1353, - "loss": 0.1536, - "short_answer_loss": NaN, - "step": 756, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.129, - "grad_norm": 1.3984375, - "learning_rate": 2.084724765914019e-05, - "long_answer_loss": 0.129, - "loss": 0.151, - "short_answer_loss": NaN, - "step": 757, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1443, - "grad_norm": 1.4765625, - "learning_rate": 2.083571942368289e-05, - "long_answer_loss": 0.1443, - "loss": 0.1539, - "short_answer_loss": NaN, - "step": 758, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1633, - "grad_norm": 1.453125, - "learning_rate": 2.0824178406136407e-05, - "long_answer_loss": 0.1633, - "loss": 0.1595, - "short_answer_loss": NaN, - "step": 759, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1449, - "grad_norm": 1.328125, - "learning_rate": 2.0812624624197868e-05, - "long_answer_loss": 0.1449, - "loss": 0.1463, - "short_answer_loss": NaN, - "step": 760, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1805, - "grad_norm": 1.6015625, - "learning_rate": 2.0801058095583977e-05, - "long_answer_loss": 0.1805, - "loss": 0.1586, - "short_answer_loss": NaN, - "step": 761, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1466, - "grad_norm": 1.4609375, - "learning_rate": 2.078947883803098e-05, - "long_answer_loss": 0.1466, - "loss": 0.1549, - "short_answer_loss": NaN, - "step": 762, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1405, - "grad_norm": 1.484375, - "learning_rate": 2.0777886869294655e-05, - "long_answer_loss": 0.1405, - "loss": 0.1552, - "short_answer_loss": NaN, - "step": 763, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1516, - "grad_norm": 1.5, - "learning_rate": 2.076628220715025e-05, - "long_answer_loss": 0.1516, - "loss": 0.1591, - "short_answer_loss": NaN, - "step": 764, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1504, - "grad_norm": 1.4453125, - "learning_rate": 2.0754664869392494e-05, - "long_answer_loss": 0.1504, - "loss": 0.1615, - "short_answer_loss": NaN, - "step": 765, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1533, - "grad_norm": 1.4609375, - "learning_rate": 2.0743034873835547e-05, - "long_answer_loss": 0.1533, - "loss": 0.1507, - "short_answer_loss": NaN, - "step": 766, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1544, - "grad_norm": 1.4140625, - "learning_rate": 2.0731392238312985e-05, - "long_answer_loss": 0.1544, - "loss": 0.1469, - "short_answer_loss": NaN, - "step": 767, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1276, - "grad_norm": 1.3828125, - "learning_rate": 2.0719736980677754e-05, - "long_answer_loss": 0.1276, - "loss": 0.1467, - "short_answer_loss": NaN, - "step": 768, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1534, - "grad_norm": 1.3515625, - "learning_rate": 2.0708069118802166e-05, - "long_answer_loss": 0.1534, - "loss": 0.1503, - "short_answer_loss": NaN, - "step": 769, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1577, - "grad_norm": 1.4375, - "learning_rate": 2.0696388670577852e-05, - "long_answer_loss": 0.1577, - "loss": 0.1567, - "short_answer_loss": NaN, - "step": 770, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1717, - "grad_norm": 1.40625, - "learning_rate": 2.068469565391575e-05, - "long_answer_loss": 0.1717, - "loss": 0.1565, - "short_answer_loss": NaN, - "step": 771, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1555, - "grad_norm": 1.3515625, - "learning_rate": 2.0672990086746067e-05, - "long_answer_loss": 0.1555, - "loss": 0.1469, - "short_answer_loss": NaN, - "step": 772, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1462, - "grad_norm": 1.390625, - "learning_rate": 2.066127198701826e-05, - "long_answer_loss": 0.1462, - "loss": 0.1559, - "short_answer_loss": NaN, - "step": 773, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1359, - "grad_norm": 1.421875, - "learning_rate": 2.0649541372700993e-05, - "long_answer_loss": 0.1359, - "loss": 0.1492, - "short_answer_loss": NaN, - "step": 774, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1595, - "grad_norm": 1.3515625, - "learning_rate": 2.063779826178213e-05, - "long_answer_loss": 0.1595, - "loss": 0.1528, - "short_answer_loss": NaN, - "step": 775, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1492, - "grad_norm": 1.5, - "learning_rate": 2.0626042672268692e-05, - "long_answer_loss": 0.1492, - "loss": 0.1499, - "short_answer_loss": NaN, - "step": 776, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.177, - "grad_norm": 1.4453125, - "learning_rate": 2.061427462218684e-05, - "long_answer_loss": 0.177, - "loss": 0.1546, - "short_answer_loss": NaN, - "step": 777, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1602, - "grad_norm": 1.4453125, - "learning_rate": 2.060249412958184e-05, - "long_answer_loss": 0.1602, - "loss": 0.1683, - "short_answer_loss": NaN, - "step": 778, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1405, - "grad_norm": 1.34375, - "learning_rate": 2.059070121251803e-05, - "long_answer_loss": 0.1405, - "loss": 0.1478, - "short_answer_loss": NaN, - "step": 779, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1483, - "grad_norm": 1.53125, - "learning_rate": 2.057889588907881e-05, - "long_answer_loss": 0.1483, - "loss": 0.1505, - "short_answer_loss": NaN, - "step": 780, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1368, - "grad_norm": 1.421875, - "learning_rate": 2.05670781773666e-05, - "long_answer_loss": 0.1368, - "loss": 0.16, - "short_answer_loss": NaN, - "step": 781, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1582, - "grad_norm": 1.421875, - "learning_rate": 2.0555248095502823e-05, - "long_answer_loss": 0.1582, - "loss": 0.1583, - "short_answer_loss": NaN, - "step": 782, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1473, - "grad_norm": 1.5, - "learning_rate": 2.054340566162785e-05, - "long_answer_loss": 0.1473, - "loss": 0.1561, - "short_answer_loss": NaN, - "step": 783, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1368, - "grad_norm": 1.515625, - "learning_rate": 2.053155089390102e-05, - "long_answer_loss": 0.1368, - "loss": 0.1545, - "short_answer_loss": NaN, - "step": 784, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1446, - "grad_norm": 1.359375, - "learning_rate": 2.0519683810500568e-05, - "long_answer_loss": 0.1446, - "loss": 0.1473, - "short_answer_loss": NaN, - "step": 785, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1443, - "grad_norm": 1.34375, - "learning_rate": 2.0507804429623613e-05, - "long_answer_loss": 0.1443, - "loss": 0.1474, - "short_answer_loss": NaN, - "step": 786, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1292, - "grad_norm": 1.546875, - "learning_rate": 2.0495912769486143e-05, - "long_answer_loss": 0.1292, - "loss": 0.1502, - "short_answer_loss": NaN, - "step": 787, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1566, - "grad_norm": 1.5703125, - "learning_rate": 2.0484008848322962e-05, - "long_answer_loss": 0.1566, - "loss": 0.1563, - "short_answer_loss": NaN, - "step": 788, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1532, - "grad_norm": 1.46875, - "learning_rate": 2.0472092684387688e-05, - "long_answer_loss": 0.1532, - "loss": 0.1493, - "short_answer_loss": NaN, - "step": 789, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1707, - "grad_norm": 1.4375, - "learning_rate": 2.04601642959527e-05, - "long_answer_loss": 0.1707, - "loss": 0.1552, - "short_answer_loss": NaN, - "step": 790, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1538, - "grad_norm": 1.4140625, - "learning_rate": 2.0448223701309126e-05, - "long_answer_loss": 0.1538, - "loss": 0.1531, - "short_answer_loss": NaN, - "step": 791, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1559, - "grad_norm": 1.4453125, - "learning_rate": 2.043627091876682e-05, - "long_answer_loss": 0.1559, - "loss": 0.1566, - "short_answer_loss": NaN, - "step": 792, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1547, - "grad_norm": 1.375, - "learning_rate": 2.0424305966654312e-05, - "long_answer_loss": 0.1547, - "loss": 0.1542, - "short_answer_loss": NaN, - "step": 793, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1472, - "grad_norm": 1.4765625, - "learning_rate": 2.0412328863318803e-05, - "long_answer_loss": 0.1472, - "loss": 0.1529, - "short_answer_loss": NaN, - "step": 794, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1577, - "grad_norm": 1.328125, - "learning_rate": 2.040033962712612e-05, - "long_answer_loss": 0.1577, - "loss": 0.1516, - "short_answer_loss": NaN, - "step": 795, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1296, - "grad_norm": 1.40625, - "learning_rate": 2.0388338276460695e-05, - "long_answer_loss": 0.1296, - "loss": 0.1506, - "short_answer_loss": NaN, - "step": 796, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.142, - "grad_norm": 1.4296875, - "learning_rate": 2.037632482972554e-05, - "long_answer_loss": 0.142, - "loss": 0.1471, - "short_answer_loss": NaN, - "step": 797, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1248, - "grad_norm": 1.3828125, - "learning_rate": 2.0364299305342223e-05, - "long_answer_loss": 0.1248, - "loss": 0.1519, - "short_answer_loss": NaN, - "step": 798, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1292, - "grad_norm": 1.4921875, - "learning_rate": 2.035226172175081e-05, - "long_answer_loss": 0.1292, - "loss": 0.1553, - "short_answer_loss": NaN, - "step": 799, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1608, - "grad_norm": 1.4765625, - "learning_rate": 2.0340212097409878e-05, - "long_answer_loss": 0.1608, - "loss": 0.1593, - "short_answer_loss": NaN, - "step": 800, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1463, - "grad_norm": 1.4375, - "learning_rate": 2.032815045079646e-05, - "long_answer_loss": 0.1463, - "loss": 0.1602, - "short_answer_loss": NaN, - "step": 801, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1599, - "grad_norm": 1.4921875, - "learning_rate": 2.0316076800406024e-05, - "long_answer_loss": 0.1599, - "loss": 0.1523, - "short_answer_loss": NaN, - "step": 802, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1606, - "grad_norm": 1.4453125, - "learning_rate": 2.0303991164752455e-05, - "long_answer_loss": 0.1606, - "loss": 0.1462, - "short_answer_loss": NaN, - "step": 803, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1608, - "grad_norm": 1.4921875, - "learning_rate": 2.0291893562368e-05, - "long_answer_loss": 0.1608, - "loss": 0.1615, - "short_answer_loss": NaN, - "step": 804, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1578, - "grad_norm": 1.453125, - "learning_rate": 2.027978401180326e-05, - "long_answer_loss": 0.1578, - "loss": 0.1507, - "short_answer_loss": NaN, - "step": 805, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1508, - "grad_norm": 1.3828125, - "learning_rate": 2.0267662531627163e-05, - "long_answer_loss": 0.1508, - "loss": 0.1479, - "short_answer_loss": NaN, - "step": 806, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1581, - "grad_norm": 1.4921875, - "learning_rate": 2.025552914042693e-05, - "long_answer_loss": 0.1581, - "loss": 0.1589, - "short_answer_loss": NaN, - "step": 807, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1855, - "grad_norm": 1.5390625, - "learning_rate": 2.0243383856808046e-05, - "long_answer_loss": 0.1855, - "loss": 0.1555, - "short_answer_loss": NaN, - "step": 808, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1574, - "grad_norm": 1.4921875, - "learning_rate": 2.023122669939423e-05, - "long_answer_loss": 0.1574, - "loss": 0.1562, - "short_answer_loss": NaN, - "step": 809, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1511, - "grad_norm": 1.4765625, - "learning_rate": 2.02190576868274e-05, - "long_answer_loss": 0.1511, - "loss": 0.1534, - "short_answer_loss": NaN, - "step": 810, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1497, - "grad_norm": 1.390625, - "learning_rate": 2.0206876837767673e-05, - "long_answer_loss": 0.1497, - "loss": 0.1455, - "short_answer_loss": NaN, - "step": 811, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1658, - "grad_norm": 1.421875, - "learning_rate": 2.0194684170893296e-05, - "long_answer_loss": 0.1658, - "loss": 0.1461, - "short_answer_loss": NaN, - "step": 812, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1502, - "grad_norm": 1.5, - "learning_rate": 2.0182479704900654e-05, - "long_answer_loss": 0.1502, - "loss": 0.1458, - "short_answer_loss": NaN, - "step": 813, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1484, - "grad_norm": 1.390625, - "learning_rate": 2.017026345850421e-05, - "long_answer_loss": 0.1484, - "loss": 0.1643, - "short_answer_loss": NaN, - "step": 814, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1304, - "grad_norm": 1.46875, - "learning_rate": 2.0158035450436504e-05, - "long_answer_loss": 0.1304, - "loss": 0.155, - "short_answer_loss": NaN, - "step": 815, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.146, - "grad_norm": 1.421875, - "learning_rate": 2.01457956994481e-05, - "long_answer_loss": 0.146, - "loss": 0.1557, - "short_answer_loss": NaN, - "step": 816, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1492, - "grad_norm": 1.4296875, - "learning_rate": 2.0133544224307582e-05, - "long_answer_loss": 0.1492, - "loss": 0.1531, - "short_answer_loss": NaN, - "step": 817, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1572, - "grad_norm": 1.421875, - "learning_rate": 2.0121281043801498e-05, - "long_answer_loss": 0.1572, - "loss": 0.1527, - "short_answer_loss": NaN, - "step": 818, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1398, - "grad_norm": 1.4609375, - "learning_rate": 2.0109006176734356e-05, - "long_answer_loss": 0.1398, - "loss": 0.1535, - "short_answer_loss": NaN, - "step": 819, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1538, - "grad_norm": 1.4609375, - "learning_rate": 2.009671964192858e-05, - "long_answer_loss": 0.1538, - "loss": 0.1589, - "short_answer_loss": NaN, - "step": 820, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1506, - "grad_norm": 1.3984375, - "learning_rate": 2.008442145822448e-05, - "long_answer_loss": 0.1506, - "loss": 0.1513, - "short_answer_loss": NaN, - "step": 821, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1531, - "grad_norm": 1.5078125, - "learning_rate": 2.007211164448024e-05, - "long_answer_loss": 0.1531, - "loss": 0.1491, - "short_answer_loss": NaN, - "step": 822, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1577, - "grad_norm": 1.3828125, - "learning_rate": 2.0059790219571872e-05, - "long_answer_loss": 0.1577, - "loss": 0.1578, - "short_answer_loss": NaN, - "step": 823, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1292, - "grad_norm": 1.296875, - "learning_rate": 2.004745720239319e-05, - "long_answer_loss": 0.1292, - "loss": 0.1488, - "short_answer_loss": NaN, - "step": 824, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1581, - "grad_norm": 1.40625, - "learning_rate": 2.0035112611855784e-05, - "long_answer_loss": 0.1581, - "loss": 0.1513, - "short_answer_loss": NaN, - "step": 825, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.177, - "grad_norm": 1.4296875, - "learning_rate": 2.0022756466888996e-05, - "long_answer_loss": 0.177, - "loss": 0.1508, - "short_answer_loss": NaN, - "step": 826, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1555, - "grad_norm": 1.40625, - "learning_rate": 2.001038878643988e-05, - "long_answer_loss": 0.1555, - "loss": 0.1525, - "short_answer_loss": NaN, - "step": 827, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1664, - "grad_norm": 1.40625, - "learning_rate": 1.999800958947318e-05, - "long_answer_loss": 0.1664, - "loss": 0.1529, - "short_answer_loss": NaN, - "step": 828, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1548, - "grad_norm": 1.515625, - "learning_rate": 1.998561889497131e-05, - "long_answer_loss": 0.1548, - "loss": 0.1596, - "short_answer_loss": NaN, - "step": 829, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1783, - "grad_norm": 1.53125, - "learning_rate": 1.9973216721934296e-05, - "long_answer_loss": 0.1783, - "loss": 0.1595, - "short_answer_loss": NaN, - "step": 830, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1466, - "grad_norm": 1.3046875, - "learning_rate": 1.9960803089379776e-05, - "long_answer_loss": 0.1466, - "loss": 0.1391, - "short_answer_loss": NaN, - "step": 831, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1424, - "grad_norm": 1.4921875, - "learning_rate": 1.9948378016342962e-05, - "long_answer_loss": 0.1424, - "loss": 0.1517, - "short_answer_loss": NaN, - "step": 832, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1493, - "grad_norm": 1.4609375, - "learning_rate": 1.99359415218766e-05, - "long_answer_loss": 0.1493, - "loss": 0.1546, - "short_answer_loss": NaN, - "step": 833, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1591, - "grad_norm": 1.3828125, - "learning_rate": 1.992349362505096e-05, - "long_answer_loss": 0.1591, - "loss": 0.1513, - "short_answer_loss": NaN, - "step": 834, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1555, - "grad_norm": 1.453125, - "learning_rate": 1.991103434495379e-05, - "long_answer_loss": 0.1555, - "loss": 0.1506, - "short_answer_loss": NaN, - "step": 835, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1568, - "grad_norm": 1.4140625, - "learning_rate": 1.9898563700690298e-05, - "long_answer_loss": 0.1568, - "loss": 0.1441, - "short_answer_loss": NaN, - "step": 836, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1671, - "grad_norm": 1.3125, - "learning_rate": 1.9886081711383108e-05, - "long_answer_loss": 0.1671, - "loss": 0.1434, - "short_answer_loss": NaN, - "step": 837, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1579, - "grad_norm": 1.3828125, - "learning_rate": 1.9873588396172257e-05, - "long_answer_loss": 0.1579, - "loss": 0.1416, - "short_answer_loss": NaN, - "step": 838, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1626, - "grad_norm": 1.46875, - "learning_rate": 1.9861083774215133e-05, - "long_answer_loss": 0.1626, - "loss": 0.1518, - "short_answer_loss": NaN, - "step": 839, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1446, - "grad_norm": 1.4296875, - "learning_rate": 1.9848567864686474e-05, - "long_answer_loss": 0.1446, - "loss": 0.1511, - "short_answer_loss": NaN, - "step": 840, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1301, - "grad_norm": 1.5, - "learning_rate": 1.9836040686778316e-05, - "long_answer_loss": 0.1301, - "loss": 0.1459, - "short_answer_loss": NaN, - "step": 841, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1551, - "grad_norm": 1.421875, - "learning_rate": 1.982350225969998e-05, - "long_answer_loss": 0.1551, - "loss": 0.1445, - "short_answer_loss": NaN, - "step": 842, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1423, - "grad_norm": 1.46875, - "learning_rate": 1.981095260267804e-05, - "long_answer_loss": 0.1423, - "loss": 0.1449, - "short_answer_loss": NaN, - "step": 843, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1767, - "grad_norm": 1.453125, - "learning_rate": 1.9798391734956284e-05, - "long_answer_loss": 0.1767, - "loss": 0.1573, - "short_answer_loss": NaN, - "step": 844, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1634, - "grad_norm": 1.5625, - "learning_rate": 1.9785819675795698e-05, - "long_answer_loss": 0.1634, - "loss": 0.1554, - "short_answer_loss": NaN, - "step": 845, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1774, - "grad_norm": 1.3671875, - "learning_rate": 1.9773236444474414e-05, - "long_answer_loss": 0.1774, - "loss": 0.1529, - "short_answer_loss": NaN, - "step": 846, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1488, - "grad_norm": 1.2890625, - "learning_rate": 1.976064206028771e-05, - "long_answer_loss": 0.1488, - "loss": 0.1448, - "short_answer_loss": NaN, - "step": 847, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1502, - "grad_norm": 1.4140625, - "learning_rate": 1.974803654254796e-05, - "long_answer_loss": 0.1502, - "loss": 0.1506, - "short_answer_loss": NaN, - "step": 848, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1375, - "grad_norm": 1.375, - "learning_rate": 1.9735419910584616e-05, - "long_answer_loss": 0.1375, - "loss": 0.1568, - "short_answer_loss": NaN, - "step": 849, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1344, - "grad_norm": 1.4765625, - "learning_rate": 1.9722792183744162e-05, - "long_answer_loss": 0.1344, - "loss": 0.1525, - "short_answer_loss": NaN, - "step": 850, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1453, - "grad_norm": 1.4140625, - "learning_rate": 1.9710153381390108e-05, - "long_answer_loss": 0.1453, - "loss": 0.151, - "short_answer_loss": NaN, - "step": 851, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1454, - "grad_norm": 1.3515625, - "learning_rate": 1.9697503522902936e-05, - "long_answer_loss": 0.1454, - "loss": 0.1489, - "short_answer_loss": NaN, - "step": 852, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1466, - "grad_norm": 1.578125, - "learning_rate": 1.9684842627680088e-05, - "long_answer_loss": 0.1466, - "loss": 0.1543, - "short_answer_loss": NaN, - "step": 853, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1528, - "grad_norm": 1.28125, - "learning_rate": 1.9672170715135927e-05, - "long_answer_loss": 0.1528, - "loss": 0.1467, - "short_answer_loss": NaN, - "step": 854, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1578, - "grad_norm": 1.34375, - "learning_rate": 1.965948780470171e-05, - "long_answer_loss": 0.1578, - "loss": 0.1475, - "short_answer_loss": NaN, - "step": 855, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1392, - "grad_norm": 1.4609375, - "learning_rate": 1.964679391582557e-05, - "long_answer_loss": 0.1392, - "loss": 0.1542, - "short_answer_loss": NaN, - "step": 856, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1529, - "grad_norm": 1.3359375, - "learning_rate": 1.9634089067972445e-05, - "long_answer_loss": 0.1529, - "loss": 0.1483, - "short_answer_loss": NaN, - "step": 857, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1587, - "grad_norm": 1.359375, - "learning_rate": 1.962137328062411e-05, - "long_answer_loss": 0.1587, - "loss": 0.1532, - "short_answer_loss": NaN, - "step": 858, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1671, - "grad_norm": 1.5078125, - "learning_rate": 1.9608646573279098e-05, - "long_answer_loss": 0.1671, - "loss": 0.1638, - "short_answer_loss": NaN, - "step": 859, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1444, - "grad_norm": 1.375, - "learning_rate": 1.9595908965452692e-05, - "long_answer_loss": 0.1444, - "loss": 0.1475, - "short_answer_loss": NaN, - "step": 860, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1472, - "grad_norm": 1.359375, - "learning_rate": 1.9583160476676885e-05, - "long_answer_loss": 0.1472, - "loss": 0.1566, - "short_answer_loss": NaN, - "step": 861, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1496, - "grad_norm": 1.40625, - "learning_rate": 1.957040112650036e-05, - "long_answer_loss": 0.1496, - "loss": 0.156, - "short_answer_loss": NaN, - "step": 862, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1416, - "grad_norm": 1.359375, - "learning_rate": 1.955763093448845e-05, - "long_answer_loss": 0.1416, - "loss": 0.1537, - "short_answer_loss": NaN, - "step": 863, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1567, - "grad_norm": 1.3828125, - "learning_rate": 1.9544849920223123e-05, - "long_answer_loss": 0.1567, - "loss": 0.1499, - "short_answer_loss": NaN, - "step": 864, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1551, - "grad_norm": 1.3828125, - "learning_rate": 1.953205810330293e-05, - "long_answer_loss": 0.1551, - "loss": 0.154, - "short_answer_loss": NaN, - "step": 865, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1438, - "grad_norm": 1.4140625, - "learning_rate": 1.951925550334299e-05, - "long_answer_loss": 0.1438, - "loss": 0.1523, - "short_answer_loss": NaN, - "step": 866, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1265, - "grad_norm": 1.4765625, - "learning_rate": 1.950644213997496e-05, - "long_answer_loss": 0.1265, - "loss": 0.148, - "short_answer_loss": NaN, - "step": 867, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1521, - "grad_norm": 1.5, - "learning_rate": 1.949361803284701e-05, - "long_answer_loss": 0.1521, - "loss": 0.1474, - "short_answer_loss": NaN, - "step": 868, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.156, - "grad_norm": 1.4296875, - "learning_rate": 1.948078320162376e-05, - "long_answer_loss": 0.156, - "loss": 0.1512, - "short_answer_loss": NaN, - "step": 869, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1356, - "grad_norm": 1.484375, - "learning_rate": 1.94679376659863e-05, - "long_answer_loss": 0.1356, - "loss": 0.1499, - "short_answer_loss": NaN, - "step": 870, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1411, - "grad_norm": 1.46875, - "learning_rate": 1.945508144563212e-05, - "long_answer_loss": 0.1411, - "loss": 0.1558, - "short_answer_loss": NaN, - "step": 871, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1385, - "grad_norm": 1.4296875, - "learning_rate": 1.9442214560275096e-05, - "long_answer_loss": 0.1385, - "loss": 0.1422, - "short_answer_loss": NaN, - "step": 872, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1638, - "grad_norm": 1.421875, - "learning_rate": 1.9429337029645464e-05, - "long_answer_loss": 0.1638, - "loss": 0.1554, - "short_answer_loss": NaN, - "step": 873, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1393, - "grad_norm": 1.3984375, - "learning_rate": 1.9416448873489775e-05, - "long_answer_loss": 0.1393, - "loss": 0.1542, - "short_answer_loss": NaN, - "step": 874, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1456, - "grad_norm": 1.40625, - "learning_rate": 1.9403550111570883e-05, - "long_answer_loss": 0.1456, - "loss": 0.1492, - "short_answer_loss": NaN, - "step": 875, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1548, - "grad_norm": 1.390625, - "learning_rate": 1.93906407636679e-05, - "long_answer_loss": 0.1548, - "loss": 0.1585, - "short_answer_loss": NaN, - "step": 876, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1319, - "grad_norm": 1.3359375, - "learning_rate": 1.9377720849576164e-05, - "long_answer_loss": 0.1319, - "loss": 0.1454, - "short_answer_loss": NaN, - "step": 877, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.151, - "grad_norm": 1.453125, - "learning_rate": 1.9364790389107224e-05, - "long_answer_loss": 0.151, - "loss": 0.1583, - "short_answer_loss": NaN, - "step": 878, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1422, - "grad_norm": 1.4375, - "learning_rate": 1.93518494020888e-05, - "long_answer_loss": 0.1422, - "loss": 0.1506, - "short_answer_loss": NaN, - "step": 879, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1457, - "grad_norm": 1.3828125, - "learning_rate": 1.933889790836475e-05, - "long_answer_loss": 0.1457, - "loss": 0.1545, - "short_answer_loss": NaN, - "step": 880, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1346, - "grad_norm": 1.421875, - "learning_rate": 1.9325935927795052e-05, - "long_answer_loss": 0.1346, - "loss": 0.1448, - "short_answer_loss": NaN, - "step": 881, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1691, - "grad_norm": 1.4609375, - "learning_rate": 1.9312963480255746e-05, - "long_answer_loss": 0.1691, - "loss": 0.1523, - "short_answer_loss": NaN, - "step": 882, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1604, - "grad_norm": 1.3359375, - "learning_rate": 1.9299980585638946e-05, - "long_answer_loss": 0.1604, - "loss": 0.1474, - "short_answer_loss": NaN, - "step": 883, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1416, - "grad_norm": 1.5078125, - "learning_rate": 1.9286987263852767e-05, - "long_answer_loss": 0.1416, - "loss": 0.1491, - "short_answer_loss": NaN, - "step": 884, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1696, - "grad_norm": 1.453125, - "learning_rate": 1.927398353482132e-05, - "long_answer_loss": 0.1696, - "loss": 0.1525, - "short_answer_loss": NaN, - "step": 885, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1413, - "grad_norm": 1.3828125, - "learning_rate": 1.9260969418484677e-05, - "long_answer_loss": 0.1413, - "loss": 0.1474, - "short_answer_loss": NaN, - "step": 886, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1525, - "grad_norm": 1.453125, - "learning_rate": 1.9247944934798835e-05, - "long_answer_loss": 0.1525, - "loss": 0.1508, - "short_answer_loss": NaN, - "step": 887, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1834, - "grad_norm": 1.4140625, - "learning_rate": 1.9234910103735686e-05, - "long_answer_loss": 0.1834, - "loss": 0.1527, - "short_answer_loss": NaN, - "step": 888, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1449, - "grad_norm": 1.3984375, - "learning_rate": 1.9221864945282997e-05, - "long_answer_loss": 0.1449, - "loss": 0.1506, - "short_answer_loss": NaN, - "step": 889, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1531, - "grad_norm": 1.4140625, - "learning_rate": 1.920880947944436e-05, - "long_answer_loss": 0.1531, - "loss": 0.149, - "short_answer_loss": NaN, - "step": 890, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1307, - "grad_norm": 1.3984375, - "learning_rate": 1.919574372623918e-05, - "long_answer_loss": 0.1307, - "loss": 0.1506, - "short_answer_loss": NaN, - "step": 891, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1422, - "grad_norm": 1.3671875, - "learning_rate": 1.918266770570264e-05, - "long_answer_loss": 0.1422, - "loss": 0.1524, - "short_answer_loss": NaN, - "step": 892, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1324, - "grad_norm": 1.4609375, - "learning_rate": 1.9169581437885654e-05, - "long_answer_loss": 0.1324, - "loss": 0.1496, - "short_answer_loss": NaN, - "step": 893, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1687, - "grad_norm": 1.421875, - "learning_rate": 1.915648494285486e-05, - "long_answer_loss": 0.1687, - "loss": 0.1525, - "short_answer_loss": NaN, - "step": 894, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1571, - "grad_norm": 1.3515625, - "learning_rate": 1.9143378240692578e-05, - "long_answer_loss": 0.1571, - "loss": 0.1474, - "short_answer_loss": NaN, - "step": 895, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1396, - "grad_norm": 1.4453125, - "learning_rate": 1.913026135149678e-05, - "long_answer_loss": 0.1396, - "loss": 0.1541, - "short_answer_loss": NaN, - "step": 896, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1529, - "grad_norm": 1.40625, - "learning_rate": 1.9117134295381056e-05, - "long_answer_loss": 0.1529, - "loss": 0.1468, - "short_answer_loss": NaN, - "step": 897, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1445, - "grad_norm": 1.390625, - "learning_rate": 1.910399709247458e-05, - "long_answer_loss": 0.1445, - "loss": 0.1454, - "short_answer_loss": NaN, - "step": 898, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1334, - "grad_norm": 1.4296875, - "learning_rate": 1.90908497629221e-05, - "long_answer_loss": 0.1334, - "loss": 0.1533, - "short_answer_loss": NaN, - "step": 899, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.133, - "grad_norm": 1.4453125, - "learning_rate": 1.9077692326883872e-05, - "long_answer_loss": 0.133, - "loss": 0.1476, - "short_answer_loss": NaN, - "step": 900, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1569, - "grad_norm": 1.4296875, - "learning_rate": 1.9064524804535674e-05, - "long_answer_loss": 0.1569, - "loss": 0.1481, - "short_answer_loss": NaN, - "step": 901, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1252, - "grad_norm": 1.390625, - "learning_rate": 1.9051347216068734e-05, - "long_answer_loss": 0.1252, - "loss": 0.1479, - "short_answer_loss": NaN, - "step": 902, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.141, - "grad_norm": 1.4921875, - "learning_rate": 1.903815958168972e-05, - "long_answer_loss": 0.141, - "loss": 0.1476, - "short_answer_loss": NaN, - "step": 903, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1579, - "grad_norm": 1.421875, - "learning_rate": 1.9024961921620705e-05, - "long_answer_loss": 0.1579, - "loss": 0.1512, - "short_answer_loss": NaN, - "step": 904, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.149, - "grad_norm": 1.3671875, - "learning_rate": 1.9011754256099128e-05, - "long_answer_loss": 0.149, - "loss": 0.1504, - "short_answer_loss": NaN, - "step": 905, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1573, - "grad_norm": 1.53125, - "learning_rate": 1.8998536605377788e-05, - "long_answer_loss": 0.1573, - "loss": 0.1488, - "short_answer_loss": NaN, - "step": 906, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.139, - "grad_norm": 1.4765625, - "learning_rate": 1.8985308989724776e-05, - "long_answer_loss": 0.139, - "loss": 0.1501, - "short_answer_loss": NaN, - "step": 907, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1844, - "grad_norm": 1.46875, - "learning_rate": 1.8972071429423473e-05, - "long_answer_loss": 0.1844, - "loss": 0.1444, - "short_answer_loss": NaN, - "step": 908, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.157, - "grad_norm": 1.4921875, - "learning_rate": 1.8958823944772508e-05, - "long_answer_loss": 0.157, - "loss": 0.1546, - "short_answer_loss": NaN, - "step": 909, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1433, - "grad_norm": 1.5546875, - "learning_rate": 1.894556655608573e-05, - "long_answer_loss": 0.1433, - "loss": 0.1423, - "short_answer_loss": NaN, - "step": 910, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1477, - "grad_norm": 1.3984375, - "learning_rate": 1.8932299283692177e-05, - "long_answer_loss": 0.1477, - "loss": 0.1412, - "short_answer_loss": NaN, - "step": 911, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1523, - "grad_norm": 1.421875, - "learning_rate": 1.891902214793603e-05, - "long_answer_loss": 0.1523, - "loss": 0.1482, - "short_answer_loss": NaN, - "step": 912, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1541, - "grad_norm": 1.46875, - "learning_rate": 1.890573516917661e-05, - "long_answer_loss": 0.1541, - "loss": 0.1517, - "short_answer_loss": NaN, - "step": 913, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1398, - "grad_norm": 1.4453125, - "learning_rate": 1.889243836778832e-05, - "long_answer_loss": 0.1398, - "loss": 0.1453, - "short_answer_loss": NaN, - "step": 914, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1336, - "grad_norm": 1.296875, - "learning_rate": 1.8879131764160635e-05, - "long_answer_loss": 0.1336, - "loss": 0.1479, - "short_answer_loss": NaN, - "step": 915, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1259, - "grad_norm": 1.34375, - "learning_rate": 1.8865815378698052e-05, - "long_answer_loss": 0.1259, - "loss": 0.1437, - "short_answer_loss": NaN, - "step": 916, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1738, - "grad_norm": 1.453125, - "learning_rate": 1.8852489231820076e-05, - "long_answer_loss": 0.1738, - "loss": 0.1458, - "short_answer_loss": NaN, - "step": 917, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1643, - "grad_norm": 1.4296875, - "learning_rate": 1.883915334396117e-05, - "long_answer_loss": 0.1643, - "loss": 0.158, - "short_answer_loss": NaN, - "step": 918, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1383, - "grad_norm": 1.4453125, - "learning_rate": 1.8825807735570748e-05, - "long_answer_loss": 0.1383, - "loss": 0.1573, - "short_answer_loss": NaN, - "step": 919, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1543, - "grad_norm": 1.5078125, - "learning_rate": 1.881245242711311e-05, - "long_answer_loss": 0.1543, - "loss": 0.152, - "short_answer_loss": NaN, - "step": 920, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1397, - "grad_norm": 1.3671875, - "learning_rate": 1.879908743906745e-05, - "long_answer_loss": 0.1397, - "loss": 0.1497, - "short_answer_loss": NaN, - "step": 921, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1374, - "grad_norm": 1.40625, - "learning_rate": 1.878571279192779e-05, - "long_answer_loss": 0.1374, - "loss": 0.1483, - "short_answer_loss": NaN, - "step": 922, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1486, - "grad_norm": 1.40625, - "learning_rate": 1.8772328506202972e-05, - "long_answer_loss": 0.1486, - "loss": 0.1461, - "short_answer_loss": NaN, - "step": 923, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1676, - "grad_norm": 1.4296875, - "learning_rate": 1.8758934602416623e-05, - "long_answer_loss": 0.1676, - "loss": 0.1456, - "short_answer_loss": NaN, - "step": 924, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1361, - "grad_norm": 1.359375, - "learning_rate": 1.8745531101107104e-05, - "long_answer_loss": 0.1361, - "loss": 0.1469, - "short_answer_loss": NaN, - "step": 925, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1458, - "grad_norm": 1.390625, - "learning_rate": 1.87321180228275e-05, - "long_answer_loss": 0.1458, - "loss": 0.1402, - "short_answer_loss": NaN, - "step": 926, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1766, - "grad_norm": 1.3984375, - "learning_rate": 1.871869538814558e-05, - "long_answer_loss": 0.1766, - "loss": 0.1544, - "short_answer_loss": NaN, - "step": 927, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1668, - "grad_norm": 1.40625, - "learning_rate": 1.870526321764377e-05, - "long_answer_loss": 0.1668, - "loss": 0.1472, - "short_answer_loss": NaN, - "step": 928, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1488, - "grad_norm": 1.4609375, - "learning_rate": 1.8691821531919117e-05, - "long_answer_loss": 0.1488, - "loss": 0.1526, - "short_answer_loss": NaN, - "step": 929, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.156, - "grad_norm": 1.4609375, - "learning_rate": 1.8678370351583256e-05, - "long_answer_loss": 0.156, - "loss": 0.1519, - "short_answer_loss": NaN, - "step": 930, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1304, - "grad_norm": 1.4921875, - "learning_rate": 1.866490969726239e-05, - "long_answer_loss": 0.1304, - "loss": 0.1449, - "short_answer_loss": NaN, - "step": 931, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1553, - "grad_norm": 1.46875, - "learning_rate": 1.8651439589597235e-05, - "long_answer_loss": 0.1553, - "loss": 0.1516, - "short_answer_loss": NaN, - "step": 932, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1225, - "grad_norm": 1.3828125, - "learning_rate": 1.8637960049243013e-05, - "long_answer_loss": 0.1225, - "loss": 0.1434, - "short_answer_loss": NaN, - "step": 933, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1371, - "grad_norm": 1.4453125, - "learning_rate": 1.8624471096869417e-05, - "long_answer_loss": 0.1371, - "loss": 0.1507, - "short_answer_loss": NaN, - "step": 934, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1215, - "grad_norm": 1.3828125, - "learning_rate": 1.861097275316055e-05, - "long_answer_loss": 0.1215, - "loss": 0.1485, - "short_answer_loss": NaN, - "step": 935, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.133, - "grad_norm": 1.3203125, - "learning_rate": 1.8597465038814936e-05, - "long_answer_loss": 0.133, - "loss": 0.1495, - "short_answer_loss": NaN, - "step": 936, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1611, - "grad_norm": 1.4453125, - "learning_rate": 1.8583947974545462e-05, - "long_answer_loss": 0.1611, - "loss": 0.152, - "short_answer_loss": NaN, - "step": 937, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.144, - "grad_norm": 1.4453125, - "learning_rate": 1.857042158107935e-05, - "long_answer_loss": 0.144, - "loss": 0.1448, - "short_answer_loss": NaN, - "step": 938, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1488, - "grad_norm": 1.3671875, - "learning_rate": 1.855688587915813e-05, - "long_answer_loss": 0.1488, - "loss": 0.1432, - "short_answer_loss": NaN, - "step": 939, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1342, - "grad_norm": 1.390625, - "learning_rate": 1.85433408895376e-05, - "long_answer_loss": 0.1342, - "loss": 0.1423, - "short_answer_loss": NaN, - "step": 940, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1574, - "grad_norm": 1.421875, - "learning_rate": 1.8529786632987815e-05, - "long_answer_loss": 0.1574, - "loss": 0.1471, - "short_answer_loss": NaN, - "step": 941, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1266, - "grad_norm": 1.4296875, - "learning_rate": 1.8516223130293024e-05, - "long_answer_loss": 0.1266, - "loss": 0.1372, - "short_answer_loss": NaN, - "step": 942, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1552, - "grad_norm": 1.5, - "learning_rate": 1.850265040225166e-05, - "long_answer_loss": 0.1552, - "loss": 0.1581, - "short_answer_loss": NaN, - "step": 943, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1505, - "grad_norm": 1.421875, - "learning_rate": 1.8489068469676298e-05, - "long_answer_loss": 0.1505, - "loss": 0.1529, - "short_answer_loss": NaN, - "step": 944, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1384, - "grad_norm": 1.3984375, - "learning_rate": 1.8475477353393635e-05, - "long_answer_loss": 0.1384, - "loss": 0.1415, - "short_answer_loss": NaN, - "step": 945, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1615, - "grad_norm": 1.578125, - "learning_rate": 1.846187707424445e-05, - "long_answer_loss": 0.1615, - "loss": 0.1433, - "short_answer_loss": NaN, - "step": 946, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1301, - "grad_norm": 1.4453125, - "learning_rate": 1.844826765308357e-05, - "long_answer_loss": 0.1301, - "loss": 0.1417, - "short_answer_loss": NaN, - "step": 947, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1422, - "grad_norm": 1.3828125, - "learning_rate": 1.8434649110779833e-05, - "long_answer_loss": 0.1422, - "loss": 0.1424, - "short_answer_loss": NaN, - "step": 948, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1435, - "grad_norm": 1.4140625, - "learning_rate": 1.8421021468216075e-05, - "long_answer_loss": 0.1435, - "loss": 0.1419, - "short_answer_loss": NaN, - "step": 949, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.158, - "grad_norm": 1.515625, - "learning_rate": 1.8407384746289084e-05, - "long_answer_loss": 0.158, - "loss": 0.1517, - "short_answer_loss": NaN, - "step": 950, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1334, - "grad_norm": 1.4140625, - "learning_rate": 1.839373896590956e-05, - "long_answer_loss": 0.1334, - "loss": 0.1474, - "short_answer_loss": NaN, - "step": 951, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.158, - "grad_norm": 1.4921875, - "learning_rate": 1.8380084148002104e-05, - "long_answer_loss": 0.158, - "loss": 0.1474, - "short_answer_loss": NaN, - "step": 952, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1498, - "grad_norm": 1.3984375, - "learning_rate": 1.8366420313505182e-05, - "long_answer_loss": 0.1498, - "loss": 0.1438, - "short_answer_loss": NaN, - "step": 953, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1349, - "grad_norm": 1.5078125, - "learning_rate": 1.8352747483371064e-05, - "long_answer_loss": 0.1349, - "loss": 0.1529, - "short_answer_loss": NaN, - "step": 954, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1247, - "grad_norm": 1.4921875, - "learning_rate": 1.8339065678565835e-05, - "long_answer_loss": 0.1247, - "loss": 0.1535, - "short_answer_loss": NaN, - "step": 955, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1573, - "grad_norm": 1.3828125, - "learning_rate": 1.8325374920069333e-05, - "long_answer_loss": 0.1573, - "loss": 0.1416, - "short_answer_loss": NaN, - "step": 956, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1422, - "grad_norm": 1.578125, - "learning_rate": 1.831167522887512e-05, - "long_answer_loss": 0.1422, - "loss": 0.1534, - "short_answer_loss": NaN, - "step": 957, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1438, - "grad_norm": 1.5859375, - "learning_rate": 1.8297966625990474e-05, - "long_answer_loss": 0.1438, - "loss": 0.1542, - "short_answer_loss": NaN, - "step": 958, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1689, - "grad_norm": 1.6015625, - "learning_rate": 1.8284249132436316e-05, - "long_answer_loss": 0.1689, - "loss": 0.1487, - "short_answer_loss": NaN, - "step": 959, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1494, - "grad_norm": 1.5390625, - "learning_rate": 1.8270522769247212e-05, - "long_answer_loss": 0.1494, - "loss": 0.1537, - "short_answer_loss": NaN, - "step": 960, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1626, - "grad_norm": 1.4921875, - "learning_rate": 1.8256787557471328e-05, - "long_answer_loss": 0.1626, - "loss": 0.1569, - "short_answer_loss": NaN, - "step": 961, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1434, - "grad_norm": 1.4140625, - "learning_rate": 1.8243043518170395e-05, - "long_answer_loss": 0.1434, - "loss": 0.1516, - "short_answer_loss": NaN, - "step": 962, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1507, - "grad_norm": 1.4921875, - "learning_rate": 1.822929067241969e-05, - "long_answer_loss": 0.1507, - "loss": 0.1506, - "short_answer_loss": NaN, - "step": 963, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1427, - "grad_norm": 1.453125, - "learning_rate": 1.8215529041307982e-05, - "long_answer_loss": 0.1427, - "loss": 0.143, - "short_answer_loss": NaN, - "step": 964, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1768, - "grad_norm": 1.4921875, - "learning_rate": 1.8201758645937518e-05, - "long_answer_loss": 0.1768, - "loss": 0.1545, - "short_answer_loss": NaN, - "step": 965, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1529, - "grad_norm": 1.390625, - "learning_rate": 1.818797950742398e-05, - "long_answer_loss": 0.1529, - "loss": 0.1445, - "short_answer_loss": NaN, - "step": 966, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1398, - "grad_norm": 1.4296875, - "learning_rate": 1.817419164689646e-05, - "long_answer_loss": 0.1398, - "loss": 0.1457, - "short_answer_loss": NaN, - "step": 967, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1818, - "grad_norm": 1.3359375, - "learning_rate": 1.8160395085497428e-05, - "long_answer_loss": 0.1818, - "loss": 0.1415, - "short_answer_loss": NaN, - "step": 968, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1445, - "grad_norm": 1.4765625, - "learning_rate": 1.8146589844382686e-05, - "long_answer_loss": 0.1445, - "loss": 0.1487, - "short_answer_loss": NaN, - "step": 969, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1365, - "grad_norm": 1.34375, - "learning_rate": 1.8132775944721354e-05, - "long_answer_loss": 0.1365, - "loss": 0.1388, - "short_answer_loss": NaN, - "step": 970, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1537, - "grad_norm": 1.3828125, - "learning_rate": 1.8118953407695825e-05, - "long_answer_loss": 0.1537, - "loss": 0.1434, - "short_answer_loss": NaN, - "step": 971, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1437, - "grad_norm": 1.3671875, - "learning_rate": 1.8105122254501743e-05, - "long_answer_loss": 0.1437, - "loss": 0.1481, - "short_answer_loss": NaN, - "step": 972, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1409, - "grad_norm": 1.4765625, - "learning_rate": 1.8091282506347952e-05, - "long_answer_loss": 0.1409, - "loss": 0.1514, - "short_answer_loss": NaN, - "step": 973, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1473, - "grad_norm": 1.4140625, - "learning_rate": 1.807743418445649e-05, - "long_answer_loss": 0.1473, - "loss": 0.1485, - "short_answer_loss": NaN, - "step": 974, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1855, - "grad_norm": 1.3515625, - "learning_rate": 1.8063577310062527e-05, - "long_answer_loss": 0.1855, - "loss": 0.1464, - "short_answer_loss": NaN, - "step": 975, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1422, - "grad_norm": 1.3671875, - "learning_rate": 1.8049711904414362e-05, - "long_answer_loss": 0.1422, - "loss": 0.1484, - "short_answer_loss": NaN, - "step": 976, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1332, - "grad_norm": 1.4609375, - "learning_rate": 1.803583798877337e-05, - "long_answer_loss": 0.1332, - "loss": 0.1376, - "short_answer_loss": NaN, - "step": 977, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1391, - "grad_norm": 1.5234375, - "learning_rate": 1.802195558441397e-05, - "long_answer_loss": 0.1391, - "loss": 0.1475, - "short_answer_loss": NaN, - "step": 978, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1537, - "grad_norm": 1.359375, - "learning_rate": 1.8008064712623607e-05, - "long_answer_loss": 0.1537, - "loss": 0.1367, - "short_answer_loss": NaN, - "step": 979, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1691, - "grad_norm": 1.3125, - "learning_rate": 1.7994165394702705e-05, - "long_answer_loss": 0.1691, - "loss": 0.1448, - "short_answer_loss": NaN, - "step": 980, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1577, - "grad_norm": 1.421875, - "learning_rate": 1.7980257651964634e-05, - "long_answer_loss": 0.1577, - "loss": 0.149, - "short_answer_loss": NaN, - "step": 981, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1244, - "grad_norm": 1.484375, - "learning_rate": 1.7966341505735695e-05, - "long_answer_loss": 0.1244, - "loss": 0.1453, - "short_answer_loss": NaN, - "step": 982, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1265, - "grad_norm": 1.3984375, - "learning_rate": 1.7952416977355063e-05, - "long_answer_loss": 0.1265, - "loss": 0.1491, - "short_answer_loss": NaN, - "step": 983, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1647, - "grad_norm": 1.3984375, - "learning_rate": 1.793848408817478e-05, - "long_answer_loss": 0.1647, - "loss": 0.1433, - "short_answer_loss": NaN, - "step": 984, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1331, - "grad_norm": 1.40625, - "learning_rate": 1.792454285955969e-05, - "long_answer_loss": 0.1331, - "loss": 0.1418, - "short_answer_loss": NaN, - "step": 985, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1528, - "grad_norm": 1.5078125, - "learning_rate": 1.7910593312887447e-05, - "long_answer_loss": 0.1528, - "loss": 0.1486, - "short_answer_loss": NaN, - "step": 986, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1451, - "grad_norm": 1.4765625, - "learning_rate": 1.7896635469548438e-05, - "long_answer_loss": 0.1451, - "loss": 0.1495, - "short_answer_loss": NaN, - "step": 987, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1485, - "grad_norm": 1.3671875, - "learning_rate": 1.7882669350945787e-05, - "long_answer_loss": 0.1485, - "loss": 0.1357, - "short_answer_loss": NaN, - "step": 988, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1562, - "grad_norm": 1.359375, - "learning_rate": 1.7868694978495304e-05, - "long_answer_loss": 0.1562, - "loss": 0.1411, - "short_answer_loss": NaN, - "step": 989, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.13, - "grad_norm": 1.484375, - "learning_rate": 1.785471237362545e-05, - "long_answer_loss": 0.13, - "loss": 0.1494, - "short_answer_loss": NaN, - "step": 990, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1363, - "grad_norm": 1.2734375, - "learning_rate": 1.784072155777732e-05, - "long_answer_loss": 0.1363, - "loss": 0.1417, - "short_answer_loss": NaN, - "step": 991, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1632, - "grad_norm": 1.4140625, - "learning_rate": 1.782672255240459e-05, - "long_answer_loss": 0.1632, - "loss": 0.1447, - "short_answer_loss": NaN, - "step": 992, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1759, - "grad_norm": 1.4296875, - "learning_rate": 1.7812715378973495e-05, - "long_answer_loss": 0.1759, - "loss": 0.1479, - "short_answer_loss": NaN, - "step": 993, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1529, - "grad_norm": 1.4375, - "learning_rate": 1.7798700058962807e-05, - "long_answer_loss": 0.1529, - "loss": 0.1548, - "short_answer_loss": NaN, - "step": 994, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1382, - "grad_norm": 1.4140625, - "learning_rate": 1.778467661386377e-05, - "long_answer_loss": 0.1382, - "loss": 0.1413, - "short_answer_loss": NaN, - "step": 995, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1495, - "grad_norm": 1.5390625, - "learning_rate": 1.7770645065180106e-05, - "long_answer_loss": 0.1495, - "loss": 0.1493, - "short_answer_loss": NaN, - "step": 996, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1435, - "grad_norm": 1.390625, - "learning_rate": 1.7756605434427948e-05, - "long_answer_loss": 0.1435, - "loss": 0.1409, - "short_answer_loss": NaN, - "step": 997, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.161, - "grad_norm": 1.453125, - "learning_rate": 1.7742557743135836e-05, - "long_answer_loss": 0.161, - "loss": 0.1435, - "short_answer_loss": NaN, - "step": 998, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1362, - "grad_norm": 1.375, - "learning_rate": 1.7728502012844665e-05, - "long_answer_loss": 0.1362, - "loss": 0.1368, - "short_answer_loss": NaN, - "step": 999, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1227, - "grad_norm": 1.3515625, - "learning_rate": 1.7714438265107643e-05, - "long_answer_loss": 0.1227, - "loss": 0.1407, - "short_answer_loss": NaN, - "step": 1000, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1565, - "grad_norm": 1.4609375, - "learning_rate": 1.7700366521490296e-05, - "long_answer_loss": 0.1565, - "loss": 0.1506, - "short_answer_loss": NaN, - "step": 1001, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1356, - "grad_norm": 1.3984375, - "learning_rate": 1.7686286803570398e-05, - "long_answer_loss": 0.1356, - "loss": 0.1448, - "short_answer_loss": NaN, - "step": 1002, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.142, - "grad_norm": 1.34375, - "learning_rate": 1.767219913293795e-05, - "long_answer_loss": 0.142, - "loss": 0.1409, - "short_answer_loss": NaN, - "step": 1003, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1271, - "grad_norm": 1.5, - "learning_rate": 1.765810353119515e-05, - "long_answer_loss": 0.1271, - "loss": 0.1491, - "short_answer_loss": NaN, - "step": 1004, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1491, - "grad_norm": 1.375, - "learning_rate": 1.7644000019956353e-05, - "long_answer_loss": 0.1491, - "loss": 0.1515, - "short_answer_loss": NaN, - "step": 1005, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1404, - "grad_norm": 1.46875, - "learning_rate": 1.7629888620848055e-05, - "long_answer_loss": 0.1404, - "loss": 0.1449, - "short_answer_loss": NaN, - "step": 1006, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.15, - "grad_norm": 1.4140625, - "learning_rate": 1.761576935550884e-05, - "long_answer_loss": 0.15, - "loss": 0.1476, - "short_answer_loss": NaN, - "step": 1007, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.122, - "grad_norm": 1.421875, - "learning_rate": 1.760164224558935e-05, - "long_answer_loss": 0.122, - "loss": 0.145, - "short_answer_loss": NaN, - "step": 1008, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1541, - "grad_norm": 1.453125, - "learning_rate": 1.7587507312752262e-05, - "long_answer_loss": 0.1541, - "loss": 0.1385, - "short_answer_loss": NaN, - "step": 1009, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1573, - "grad_norm": 1.4921875, - "learning_rate": 1.7573364578672244e-05, - "long_answer_loss": 0.1573, - "loss": 0.1464, - "short_answer_loss": NaN, - "step": 1010, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1624, - "grad_norm": 1.359375, - "learning_rate": 1.755921406503593e-05, - "long_answer_loss": 0.1624, - "loss": 0.1454, - "short_answer_loss": NaN, - "step": 1011, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1365, - "grad_norm": 1.3125, - "learning_rate": 1.754505579354188e-05, - "long_answer_loss": 0.1365, - "loss": 0.139, - "short_answer_loss": NaN, - "step": 1012, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1558, - "grad_norm": 1.5546875, - "learning_rate": 1.7530889785900555e-05, - "long_answer_loss": 0.1558, - "loss": 0.1455, - "short_answer_loss": NaN, - "step": 1013, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1672, - "grad_norm": 1.4453125, - "learning_rate": 1.7516716063834278e-05, - "long_answer_loss": 0.1672, - "loss": 0.1458, - "short_answer_loss": NaN, - "step": 1014, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1342, - "grad_norm": 1.4921875, - "learning_rate": 1.7502534649077197e-05, - "long_answer_loss": 0.1342, - "loss": 0.1416, - "short_answer_loss": NaN, - "step": 1015, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1341, - "grad_norm": 1.3828125, - "learning_rate": 1.748834556337526e-05, - "long_answer_loss": 0.1341, - "loss": 0.1462, - "short_answer_loss": NaN, - "step": 1016, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1512, - "grad_norm": 1.34375, - "learning_rate": 1.7474148828486176e-05, - "long_answer_loss": 0.1512, - "loss": 0.145, - "short_answer_loss": NaN, - "step": 1017, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1629, - "grad_norm": 1.328125, - "learning_rate": 1.7459944466179377e-05, - "long_answer_loss": 0.1629, - "loss": 0.1442, - "short_answer_loss": NaN, - "step": 1018, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1454, - "grad_norm": 1.4296875, - "learning_rate": 1.744573249823601e-05, - "long_answer_loss": 0.1454, - "loss": 0.1408, - "short_answer_loss": NaN, - "step": 1019, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.135, - "grad_norm": 1.421875, - "learning_rate": 1.7431512946448862e-05, - "long_answer_loss": 0.135, - "loss": 0.137, - "short_answer_loss": NaN, - "step": 1020, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1469, - "grad_norm": 1.6171875, - "learning_rate": 1.741728583262236e-05, - "long_answer_loss": 0.1469, - "loss": 0.1501, - "short_answer_loss": NaN, - "step": 1021, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.132, - "grad_norm": 1.375, - "learning_rate": 1.7403051178572528e-05, - "long_answer_loss": 0.132, - "loss": 0.1334, - "short_answer_loss": NaN, - "step": 1022, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1249, - "grad_norm": 1.4375, - "learning_rate": 1.738880900612695e-05, - "long_answer_loss": 0.1249, - "loss": 0.1416, - "short_answer_loss": NaN, - "step": 1023, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1424, - "grad_norm": 1.6171875, - "learning_rate": 1.7374559337124743e-05, - "long_answer_loss": 0.1424, - "loss": 0.1475, - "short_answer_loss": NaN, - "step": 1024, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1555, - "grad_norm": 1.453125, - "learning_rate": 1.736030219341651e-05, - "long_answer_loss": 0.1555, - "loss": 0.1428, - "short_answer_loss": NaN, - "step": 1025, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1272, - "grad_norm": 1.3984375, - "learning_rate": 1.7346037596864322e-05, - "long_answer_loss": 0.1272, - "loss": 0.1353, - "short_answer_loss": NaN, - "step": 1026, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1666, - "grad_norm": 1.4609375, - "learning_rate": 1.733176556934168e-05, - "long_answer_loss": 0.1666, - "loss": 0.1495, - "short_answer_loss": NaN, - "step": 1027, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1335, - "grad_norm": 1.46875, - "learning_rate": 1.731748613273347e-05, - "long_answer_loss": 0.1335, - "loss": 0.1372, - "short_answer_loss": NaN, - "step": 1028, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1394, - "grad_norm": 1.4609375, - "learning_rate": 1.7303199308935956e-05, - "long_answer_loss": 0.1394, - "loss": 0.1408, - "short_answer_loss": NaN, - "step": 1029, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1159, - "grad_norm": 1.5078125, - "learning_rate": 1.7288905119856717e-05, - "long_answer_loss": 0.1159, - "loss": 0.1358, - "short_answer_loss": NaN, - "step": 1030, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.123, - "grad_norm": 1.4453125, - "learning_rate": 1.7274603587414622e-05, - "long_answer_loss": 0.123, - "loss": 0.1401, - "short_answer_loss": NaN, - "step": 1031, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1443, - "grad_norm": 1.5625, - "learning_rate": 1.726029473353982e-05, - "long_answer_loss": 0.1443, - "loss": 0.1413, - "short_answer_loss": NaN, - "step": 1032, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1265, - "grad_norm": 1.3515625, - "learning_rate": 1.724597858017366e-05, - "long_answer_loss": 0.1265, - "loss": 0.1499, - "short_answer_loss": NaN, - "step": 1033, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1579, - "grad_norm": 1.4375, - "learning_rate": 1.723165514926871e-05, - "long_answer_loss": 0.1579, - "loss": 0.1498, - "short_answer_loss": NaN, - "step": 1034, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.115, - "grad_norm": 1.421875, - "learning_rate": 1.7217324462788676e-05, - "long_answer_loss": 0.115, - "loss": 0.1454, - "short_answer_loss": NaN, - "step": 1035, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1249, - "grad_norm": 1.3984375, - "learning_rate": 1.720298654270841e-05, - "long_answer_loss": 0.1249, - "loss": 0.1408, - "short_answer_loss": NaN, - "step": 1036, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1493, - "grad_norm": 1.375, - "learning_rate": 1.7188641411013833e-05, - "long_answer_loss": 0.1493, - "loss": 0.1411, - "short_answer_loss": NaN, - "step": 1037, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1663, - "grad_norm": 1.375, - "learning_rate": 1.7174289089701944e-05, - "long_answer_loss": 0.1663, - "loss": 0.1463, - "short_answer_loss": NaN, - "step": 1038, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1366, - "grad_norm": 1.3984375, - "learning_rate": 1.7159929600780765e-05, - "long_answer_loss": 0.1366, - "loss": 0.1406, - "short_answer_loss": NaN, - "step": 1039, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1414, - "grad_norm": 1.4375, - "learning_rate": 1.7145562966269294e-05, - "long_answer_loss": 0.1414, - "loss": 0.1497, - "short_answer_loss": NaN, - "step": 1040, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1268, - "grad_norm": 1.359375, - "learning_rate": 1.71311892081975e-05, - "long_answer_loss": 0.1268, - "loss": 0.1374, - "short_answer_loss": NaN, - "step": 1041, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1364, - "grad_norm": 1.4921875, - "learning_rate": 1.7116808348606266e-05, - "long_answer_loss": 0.1364, - "loss": 0.143, - "short_answer_loss": NaN, - "step": 1042, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.13, - "grad_norm": 1.484375, - "learning_rate": 1.7102420409547374e-05, - "long_answer_loss": 0.13, - "loss": 0.1385, - "short_answer_loss": NaN, - "step": 1043, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1638, - "grad_norm": 1.4375, - "learning_rate": 1.7088025413083462e-05, - "long_answer_loss": 0.1638, - "loss": 0.1432, - "short_answer_loss": NaN, - "step": 1044, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1319, - "grad_norm": 1.40625, - "learning_rate": 1.7073623381287976e-05, - "long_answer_loss": 0.1319, - "loss": 0.1352, - "short_answer_loss": NaN, - "step": 1045, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1621, - "grad_norm": 1.4765625, - "learning_rate": 1.7059214336245164e-05, - "long_answer_loss": 0.1621, - "loss": 0.1478, - "short_answer_loss": NaN, - "step": 1046, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1217, - "grad_norm": 1.46875, - "learning_rate": 1.7044798300050025e-05, - "long_answer_loss": 0.1217, - "loss": 0.1409, - "short_answer_loss": NaN, - "step": 1047, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1424, - "grad_norm": 1.5078125, - "learning_rate": 1.703037529480827e-05, - "long_answer_loss": 0.1424, - "loss": 0.137, - "short_answer_loss": NaN, - "step": 1048, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1401, - "grad_norm": 1.4296875, - "learning_rate": 1.7015945342636307e-05, - "long_answer_loss": 0.1401, - "loss": 0.1443, - "short_answer_loss": NaN, - "step": 1049, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1351, - "grad_norm": 1.4921875, - "learning_rate": 1.70015084656612e-05, - "long_answer_loss": 0.1351, - "loss": 0.1469, - "short_answer_loss": NaN, - "step": 1050, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1336, - "grad_norm": 1.375, - "learning_rate": 1.698706468602061e-05, - "long_answer_loss": 0.1336, - "loss": 0.1328, - "short_answer_loss": NaN, - "step": 1051, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1307, - "grad_norm": 1.3671875, - "learning_rate": 1.6972614025862805e-05, - "long_answer_loss": 0.1307, - "loss": 0.1378, - "short_answer_loss": NaN, - "step": 1052, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1622, - "grad_norm": 1.46875, - "learning_rate": 1.6958156507346592e-05, - "long_answer_loss": 0.1622, - "loss": 0.1553, - "short_answer_loss": NaN, - "step": 1053, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.144, - "grad_norm": 1.453125, - "learning_rate": 1.6943692152641303e-05, - "long_answer_loss": 0.144, - "loss": 0.1455, - "short_answer_loss": NaN, - "step": 1054, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1307, - "grad_norm": 1.4140625, - "learning_rate": 1.6929220983926748e-05, - "long_answer_loss": 0.1307, - "loss": 0.1342, - "short_answer_loss": NaN, - "step": 1055, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1492, - "grad_norm": 1.3828125, - "learning_rate": 1.691474302339318e-05, - "long_answer_loss": 0.1492, - "loss": 0.1402, - "short_answer_loss": NaN, - "step": 1056, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1517, - "grad_norm": 1.34375, - "learning_rate": 1.690025829324127e-05, - "long_answer_loss": 0.1517, - "loss": 0.1461, - "short_answer_loss": NaN, - "step": 1057, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1439, - "grad_norm": 1.359375, - "learning_rate": 1.6885766815682087e-05, - "long_answer_loss": 0.1439, - "loss": 0.1465, - "short_answer_loss": NaN, - "step": 1058, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1183, - "grad_norm": 1.4140625, - "learning_rate": 1.6871268612937013e-05, - "long_answer_loss": 0.1183, - "loss": 0.1372, - "short_answer_loss": NaN, - "step": 1059, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1304, - "grad_norm": 1.3671875, - "learning_rate": 1.6856763707237776e-05, - "long_answer_loss": 0.1304, - "loss": 0.141, - "short_answer_loss": NaN, - "step": 1060, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1255, - "grad_norm": 1.5234375, - "learning_rate": 1.6842252120826358e-05, - "long_answer_loss": 0.1255, - "loss": 0.1449, - "short_answer_loss": NaN, - "step": 1061, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1385, - "grad_norm": 1.5234375, - "learning_rate": 1.6827733875954994e-05, - "long_answer_loss": 0.1385, - "loss": 0.1469, - "short_answer_loss": NaN, - "step": 1062, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1537, - "grad_norm": 1.359375, - "learning_rate": 1.6813208994886135e-05, - "long_answer_loss": 0.1537, - "loss": 0.1416, - "short_answer_loss": NaN, - "step": 1063, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1223, - "grad_norm": 1.4375, - "learning_rate": 1.6798677499892397e-05, - "long_answer_loss": 0.1223, - "loss": 0.1493, - "short_answer_loss": NaN, - "step": 1064, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1459, - "grad_norm": 1.4609375, - "learning_rate": 1.678413941325655e-05, - "long_answer_loss": 0.1459, - "loss": 0.1456, - "short_answer_loss": NaN, - "step": 1065, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1361, - "grad_norm": 1.46875, - "learning_rate": 1.6769594757271463e-05, - "long_answer_loss": 0.1361, - "loss": 0.1458, - "short_answer_loss": NaN, - "step": 1066, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1082, - "grad_norm": 1.46875, - "learning_rate": 1.6755043554240077e-05, - "long_answer_loss": 0.1082, - "loss": 0.1447, - "short_answer_loss": NaN, - "step": 1067, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1744, - "grad_norm": 1.4140625, - "learning_rate": 1.674048582647538e-05, - "long_answer_loss": 0.1744, - "loss": 0.1476, - "short_answer_loss": NaN, - "step": 1068, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1397, - "grad_norm": 1.6015625, - "learning_rate": 1.672592159630036e-05, - "long_answer_loss": 0.1397, - "loss": 0.1443, - "short_answer_loss": NaN, - "step": 1069, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1424, - "grad_norm": 1.53125, - "learning_rate": 1.6711350886047977e-05, - "long_answer_loss": 0.1424, - "loss": 0.1371, - "short_answer_loss": NaN, - "step": 1070, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1403, - "grad_norm": 1.3203125, - "learning_rate": 1.6696773718061128e-05, - "long_answer_loss": 0.1403, - "loss": 0.1335, - "short_answer_loss": NaN, - "step": 1071, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1599, - "grad_norm": 1.4609375, - "learning_rate": 1.6682190114692615e-05, - "long_answer_loss": 0.1599, - "loss": 0.1405, - "short_answer_loss": NaN, - "step": 1072, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.127, - "grad_norm": 1.3984375, - "learning_rate": 1.66676000983051e-05, - "long_answer_loss": 0.127, - "loss": 0.1411, - "short_answer_loss": NaN, - "step": 1073, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1255, - "grad_norm": 1.34375, - "learning_rate": 1.665300369127108e-05, - "long_answer_loss": 0.1255, - "loss": 0.1324, - "short_answer_loss": NaN, - "step": 1074, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1474, - "grad_norm": 1.4375, - "learning_rate": 1.6638400915972867e-05, - "long_answer_loss": 0.1474, - "loss": 0.1442, - "short_answer_loss": NaN, - "step": 1075, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1499, - "grad_norm": 1.4140625, - "learning_rate": 1.6623791794802518e-05, - "long_answer_loss": 0.1499, - "loss": 0.1438, - "short_answer_loss": NaN, - "step": 1076, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1433, - "grad_norm": 1.390625, - "learning_rate": 1.6609176350161836e-05, - "long_answer_loss": 0.1433, - "loss": 0.1397, - "short_answer_loss": NaN, - "step": 1077, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1669, - "grad_norm": 1.4609375, - "learning_rate": 1.659455460446231e-05, - "long_answer_loss": 0.1669, - "loss": 0.1459, - "short_answer_loss": NaN, - "step": 1078, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.135, - "grad_norm": 1.4453125, - "learning_rate": 1.6579926580125095e-05, - "long_answer_loss": 0.135, - "loss": 0.1417, - "short_answer_loss": NaN, - "step": 1079, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1372, - "grad_norm": 1.4375, - "learning_rate": 1.656529229958097e-05, - "long_answer_loss": 0.1372, - "loss": 0.1428, - "short_answer_loss": NaN, - "step": 1080, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1415, - "grad_norm": 1.3828125, - "learning_rate": 1.6550651785270323e-05, - "long_answer_loss": 0.1415, - "loss": 0.1378, - "short_answer_loss": NaN, - "step": 1081, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1349, - "grad_norm": 1.4765625, - "learning_rate": 1.653600505964308e-05, - "long_answer_loss": 0.1349, - "loss": 0.1386, - "short_answer_loss": NaN, - "step": 1082, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1367, - "grad_norm": 1.4921875, - "learning_rate": 1.65213521451587e-05, - "long_answer_loss": 0.1367, - "loss": 0.1478, - "short_answer_loss": NaN, - "step": 1083, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1538, - "grad_norm": 1.5, - "learning_rate": 1.650669306428613e-05, - "long_answer_loss": 0.1538, - "loss": 0.1439, - "short_answer_loss": NaN, - "step": 1084, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1326, - "grad_norm": 1.3984375, - "learning_rate": 1.6492027839503788e-05, - "long_answer_loss": 0.1326, - "loss": 0.1413, - "short_answer_loss": NaN, - "step": 1085, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1394, - "grad_norm": 1.421875, - "learning_rate": 1.647735649329949e-05, - "long_answer_loss": 0.1394, - "loss": 0.1388, - "short_answer_loss": NaN, - "step": 1086, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1488, - "grad_norm": 1.390625, - "learning_rate": 1.646267904817045e-05, - "long_answer_loss": 0.1488, - "loss": 0.1431, - "short_answer_loss": NaN, - "step": 1087, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.137, - "grad_norm": 1.4453125, - "learning_rate": 1.644799552662323e-05, - "long_answer_loss": 0.137, - "loss": 0.1355, - "short_answer_loss": NaN, - "step": 1088, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1189, - "grad_norm": 1.359375, - "learning_rate": 1.643330595117372e-05, - "long_answer_loss": 0.1189, - "loss": 0.1385, - "short_answer_loss": NaN, - "step": 1089, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1385, - "grad_norm": 1.421875, - "learning_rate": 1.6418610344347085e-05, - "long_answer_loss": 0.1385, - "loss": 0.1416, - "short_answer_loss": NaN, - "step": 1090, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.121, - "grad_norm": 1.46875, - "learning_rate": 1.640390872867774e-05, - "long_answer_loss": 0.121, - "loss": 0.1361, - "short_answer_loss": NaN, - "step": 1091, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1373, - "grad_norm": 1.390625, - "learning_rate": 1.6389201126709307e-05, - "long_answer_loss": 0.1373, - "loss": 0.135, - "short_answer_loss": NaN, - "step": 1092, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.132, - "grad_norm": 1.421875, - "learning_rate": 1.63744875609946e-05, - "long_answer_loss": 0.132, - "loss": 0.1382, - "short_answer_loss": NaN, - "step": 1093, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1161, - "grad_norm": 1.375, - "learning_rate": 1.6359768054095574e-05, - "long_answer_loss": 0.1161, - "loss": 0.1361, - "short_answer_loss": NaN, - "step": 1094, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1453, - "grad_norm": 1.375, - "learning_rate": 1.6345042628583284e-05, - "long_answer_loss": 0.1453, - "loss": 0.1388, - "short_answer_loss": NaN, - "step": 1095, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1506, - "grad_norm": 1.4375, - "learning_rate": 1.6330311307037875e-05, - "long_answer_loss": 0.1506, - "loss": 0.1393, - "short_answer_loss": NaN, - "step": 1096, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1316, - "grad_norm": 1.4375, - "learning_rate": 1.6315574112048523e-05, - "long_answer_loss": 0.1316, - "loss": 0.1449, - "short_answer_loss": NaN, - "step": 1097, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1235, - "grad_norm": 1.375, - "learning_rate": 1.630083106621342e-05, - "long_answer_loss": 0.1235, - "loss": 0.1381, - "short_answer_loss": NaN, - "step": 1098, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1347, - "grad_norm": 1.453125, - "learning_rate": 1.628608219213972e-05, - "long_answer_loss": 0.1347, - "loss": 0.1365, - "short_answer_loss": NaN, - "step": 1099, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1537, - "grad_norm": 1.453125, - "learning_rate": 1.6271327512443517e-05, - "long_answer_loss": 0.1537, - "loss": 0.1459, - "short_answer_loss": NaN, - "step": 1100, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1464, - "grad_norm": 1.390625, - "learning_rate": 1.6256567049749815e-05, - "long_answer_loss": 0.1464, - "loss": 0.1397, - "short_answer_loss": NaN, - "step": 1101, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1543, - "grad_norm": 1.421875, - "learning_rate": 1.6241800826692472e-05, - "long_answer_loss": 0.1543, - "loss": 0.1424, - "short_answer_loss": NaN, - "step": 1102, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1348, - "grad_norm": 1.5546875, - "learning_rate": 1.6227028865914188e-05, - "long_answer_loss": 0.1348, - "loss": 0.1382, - "short_answer_loss": NaN, - "step": 1103, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1496, - "grad_norm": 1.4375, - "learning_rate": 1.621225119006646e-05, - "long_answer_loss": 0.1496, - "loss": 0.146, - "short_answer_loss": NaN, - "step": 1104, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1386, - "grad_norm": 1.421875, - "learning_rate": 1.619746782180955e-05, - "long_answer_loss": 0.1386, - "loss": 0.1393, - "short_answer_loss": NaN, - "step": 1105, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1136, - "grad_norm": 1.453125, - "learning_rate": 1.6182678783812444e-05, - "long_answer_loss": 0.1136, - "loss": 0.1429, - "short_answer_loss": NaN, - "step": 1106, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1458, - "grad_norm": 1.3359375, - "learning_rate": 1.6167884098752835e-05, - "long_answer_loss": 0.1458, - "loss": 0.1399, - "short_answer_loss": NaN, - "step": 1107, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1425, - "grad_norm": 1.4609375, - "learning_rate": 1.6153083789317047e-05, - "long_answer_loss": 0.1425, - "loss": 0.1443, - "short_answer_loss": NaN, - "step": 1108, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1617, - "grad_norm": 1.375, - "learning_rate": 1.613827787820006e-05, - "long_answer_loss": 0.1617, - "loss": 0.1401, - "short_answer_loss": NaN, - "step": 1109, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1546, - "grad_norm": 1.578125, - "learning_rate": 1.612346638810543e-05, - "long_answer_loss": 0.1546, - "loss": 0.1324, - "short_answer_loss": NaN, - "step": 1110, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1539, - "grad_norm": 1.421875, - "learning_rate": 1.6108649341745262e-05, - "long_answer_loss": 0.1539, - "loss": 0.1411, - "short_answer_loss": NaN, - "step": 1111, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1427, - "grad_norm": 1.4609375, - "learning_rate": 1.6093826761840196e-05, - "long_answer_loss": 0.1427, - "loss": 0.1475, - "short_answer_loss": NaN, - "step": 1112, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.148, - "grad_norm": 1.453125, - "learning_rate": 1.607899867111934e-05, - "long_answer_loss": 0.148, - "loss": 0.1361, - "short_answer_loss": NaN, - "step": 1113, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1191, - "grad_norm": 1.359375, - "learning_rate": 1.6064165092320264e-05, - "long_answer_loss": 0.1191, - "loss": 0.1328, - "short_answer_loss": NaN, - "step": 1114, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1792, - "grad_norm": 1.46875, - "learning_rate": 1.6049326048188955e-05, - "long_answer_loss": 0.1792, - "loss": 0.144, - "short_answer_loss": NaN, - "step": 1115, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1313, - "grad_norm": 1.453125, - "learning_rate": 1.6034481561479765e-05, - "long_answer_loss": 0.1313, - "loss": 0.1393, - "short_answer_loss": NaN, - "step": 1116, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1359, - "grad_norm": 1.4609375, - "learning_rate": 1.6019631654955412e-05, - "long_answer_loss": 0.1359, - "loss": 0.144, - "short_answer_loss": NaN, - "step": 1117, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1277, - "grad_norm": 1.3671875, - "learning_rate": 1.6004776351386913e-05, - "long_answer_loss": 0.1277, - "loss": 0.135, - "short_answer_loss": NaN, - "step": 1118, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1505, - "grad_norm": 1.40625, - "learning_rate": 1.5989915673553564e-05, - "long_answer_loss": 0.1505, - "loss": 0.1391, - "short_answer_loss": NaN, - "step": 1119, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1302, - "grad_norm": 1.453125, - "learning_rate": 1.59750496442429e-05, - "long_answer_loss": 0.1302, - "loss": 0.1447, - "short_answer_loss": NaN, - "step": 1120, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1207, - "grad_norm": 1.3359375, - "learning_rate": 1.5960178286250668e-05, - "long_answer_loss": 0.1207, - "loss": 0.1341, - "short_answer_loss": NaN, - "step": 1121, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1697, - "grad_norm": 1.4375, - "learning_rate": 1.5945301622380772e-05, - "long_answer_loss": 0.1697, - "loss": 0.1441, - "short_answer_loss": NaN, - "step": 1122, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1466, - "grad_norm": 1.359375, - "learning_rate": 1.5930419675445273e-05, - "long_answer_loss": 0.1466, - "loss": 0.1319, - "short_answer_loss": NaN, - "step": 1123, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1492, - "grad_norm": 1.5078125, - "learning_rate": 1.5915532468264314e-05, - "long_answer_loss": 0.1492, - "loss": 0.1359, - "short_answer_loss": NaN, - "step": 1124, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1341, - "grad_norm": 1.3984375, - "learning_rate": 1.5900640023666108e-05, - "long_answer_loss": 0.1341, - "loss": 0.1454, - "short_answer_loss": NaN, - "step": 1125, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1246, - "grad_norm": 1.375, - "learning_rate": 1.5885742364486915e-05, - "long_answer_loss": 0.1246, - "loss": 0.1365, - "short_answer_loss": NaN, - "step": 1126, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1449, - "grad_norm": 1.4453125, - "learning_rate": 1.5870839513570967e-05, - "long_answer_loss": 0.1449, - "loss": 0.1393, - "short_answer_loss": NaN, - "step": 1127, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.149, - "grad_norm": 1.4296875, - "learning_rate": 1.5855931493770477e-05, - "long_answer_loss": 0.149, - "loss": 0.1423, - "short_answer_loss": NaN, - "step": 1128, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1357, - "grad_norm": 1.4453125, - "learning_rate": 1.5841018327945576e-05, - "long_answer_loss": 0.1357, - "loss": 0.145, - "short_answer_loss": NaN, - "step": 1129, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1314, - "grad_norm": 1.4609375, - "learning_rate": 1.5826100038964282e-05, - "long_answer_loss": 0.1314, - "loss": 0.1373, - "short_answer_loss": NaN, - "step": 1130, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1553, - "grad_norm": 1.390625, - "learning_rate": 1.581117664970247e-05, - "long_answer_loss": 0.1553, - "loss": 0.1332, - "short_answer_loss": NaN, - "step": 1131, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1194, - "grad_norm": 1.46875, - "learning_rate": 1.5796248183043848e-05, - "long_answer_loss": 0.1194, - "loss": 0.1375, - "short_answer_loss": NaN, - "step": 1132, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1377, - "grad_norm": 1.4140625, - "learning_rate": 1.5781314661879896e-05, - "long_answer_loss": 0.1377, - "loss": 0.1301, - "short_answer_loss": NaN, - "step": 1133, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.153, - "grad_norm": 1.515625, - "learning_rate": 1.5766376109109847e-05, - "long_answer_loss": 0.153, - "loss": 0.1465, - "short_answer_loss": NaN, - "step": 1134, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1211, - "grad_norm": 1.4765625, - "learning_rate": 1.5751432547640655e-05, - "long_answer_loss": 0.1211, - "loss": 0.1371, - "short_answer_loss": NaN, - "step": 1135, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1608, - "grad_norm": 1.453125, - "learning_rate": 1.573648400038695e-05, - "long_answer_loss": 0.1608, - "loss": 0.1442, - "short_answer_loss": NaN, - "step": 1136, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1268, - "grad_norm": 1.40625, - "learning_rate": 1.572153049027101e-05, - "long_answer_loss": 0.1268, - "loss": 0.1373, - "short_answer_loss": NaN, - "step": 1137, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1505, - "grad_norm": 1.4609375, - "learning_rate": 1.5706572040222715e-05, - "long_answer_loss": 0.1505, - "loss": 0.1437, - "short_answer_loss": NaN, - "step": 1138, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1376, - "grad_norm": 1.484375, - "learning_rate": 1.5691608673179532e-05, - "long_answer_loss": 0.1376, - "loss": 0.1398, - "short_answer_loss": NaN, - "step": 1139, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1378, - "grad_norm": 1.46875, - "learning_rate": 1.5676640412086463e-05, - "long_answer_loss": 0.1378, - "loss": 0.1319, - "short_answer_loss": NaN, - "step": 1140, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1355, - "grad_norm": 1.34375, - "learning_rate": 1.566166727989601e-05, - "long_answer_loss": 0.1355, - "loss": 0.129, - "short_answer_loss": NaN, - "step": 1141, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1417, - "grad_norm": 1.4765625, - "learning_rate": 1.564668929956815e-05, - "long_answer_loss": 0.1417, - "loss": 0.1411, - "short_answer_loss": NaN, - "step": 1142, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1259, - "grad_norm": 1.453125, - "learning_rate": 1.5631706494070298e-05, - "long_answer_loss": 0.1259, - "loss": 0.1447, - "short_answer_loss": NaN, - "step": 1143, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1341, - "grad_norm": 1.390625, - "learning_rate": 1.5616718886377253e-05, - "long_answer_loss": 0.1341, - "loss": 0.1326, - "short_answer_loss": NaN, - "step": 1144, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1329, - "grad_norm": 1.4296875, - "learning_rate": 1.5601726499471193e-05, - "long_answer_loss": 0.1329, - "loss": 0.131, - "short_answer_loss": NaN, - "step": 1145, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.118, - "grad_norm": 1.359375, - "learning_rate": 1.558672935634161e-05, - "long_answer_loss": 0.118, - "loss": 0.1346, - "short_answer_loss": NaN, - "step": 1146, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1564, - "grad_norm": 1.390625, - "learning_rate": 1.557172747998531e-05, - "long_answer_loss": 0.1564, - "loss": 0.1375, - "short_answer_loss": NaN, - "step": 1147, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1356, - "grad_norm": 1.5078125, - "learning_rate": 1.555672089340634e-05, - "long_answer_loss": 0.1356, - "loss": 0.1357, - "short_answer_loss": NaN, - "step": 1148, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.137, - "grad_norm": 1.4453125, - "learning_rate": 1.554170961961597e-05, - "long_answer_loss": 0.137, - "loss": 0.1416, - "short_answer_loss": NaN, - "step": 1149, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1249, - "grad_norm": 1.46875, - "learning_rate": 1.5526693681632664e-05, - "long_answer_loss": 0.1249, - "loss": 0.1455, - "short_answer_loss": NaN, - "step": 1150, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1419, - "grad_norm": 1.4453125, - "learning_rate": 1.5511673102482044e-05, - "long_answer_loss": 0.1419, - "loss": 0.1373, - "short_answer_loss": NaN, - "step": 1151, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1412, - "grad_norm": 1.6328125, - "learning_rate": 1.549664790519683e-05, - "long_answer_loss": 0.1412, - "loss": 0.1469, - "short_answer_loss": NaN, - "step": 1152, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1286, - "grad_norm": 1.4140625, - "learning_rate": 1.5481618112816844e-05, - "long_answer_loss": 0.1286, - "loss": 0.136, - "short_answer_loss": NaN, - "step": 1153, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1332, - "grad_norm": 1.40625, - "learning_rate": 1.546658374838894e-05, - "long_answer_loss": 0.1332, - "loss": 0.1317, - "short_answer_loss": NaN, - "step": 1154, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1415, - "grad_norm": 1.4140625, - "learning_rate": 1.545154483496698e-05, - "long_answer_loss": 0.1415, - "loss": 0.1436, - "short_answer_loss": NaN, - "step": 1155, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1331, - "grad_norm": 1.515625, - "learning_rate": 1.543650139561182e-05, - "long_answer_loss": 0.1331, - "loss": 0.1448, - "short_answer_loss": NaN, - "step": 1156, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1594, - "grad_norm": 1.453125, - "learning_rate": 1.542145345339124e-05, - "long_answer_loss": 0.1594, - "loss": 0.1387, - "short_answer_loss": NaN, - "step": 1157, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1549, - "grad_norm": 1.40625, - "learning_rate": 1.540640103137993e-05, - "long_answer_loss": 0.1549, - "loss": 0.1451, - "short_answer_loss": NaN, - "step": 1158, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1575, - "grad_norm": 1.390625, - "learning_rate": 1.539134415265945e-05, - "long_answer_loss": 0.1575, - "loss": 0.142, - "short_answer_loss": NaN, - "step": 1159, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1338, - "grad_norm": 1.4609375, - "learning_rate": 1.5376282840318196e-05, - "long_answer_loss": 0.1338, - "loss": 0.1372, - "short_answer_loss": NaN, - "step": 1160, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1499, - "grad_norm": 1.375, - "learning_rate": 1.5361217117451355e-05, - "long_answer_loss": 0.1499, - "loss": 0.1331, - "short_answer_loss": NaN, - "step": 1161, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1488, - "grad_norm": 1.5078125, - "learning_rate": 1.534614700716088e-05, - "long_answer_loss": 0.1488, - "loss": 0.1432, - "short_answer_loss": NaN, - "step": 1162, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.172, - "grad_norm": 1.453125, - "learning_rate": 1.5331072532555462e-05, - "long_answer_loss": 0.172, - "loss": 0.1418, - "short_answer_loss": NaN, - "step": 1163, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1356, - "grad_norm": 1.40625, - "learning_rate": 1.5315993716750472e-05, - "long_answer_loss": 0.1356, - "loss": 0.1302, - "short_answer_loss": NaN, - "step": 1164, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1372, - "grad_norm": 1.4140625, - "learning_rate": 1.5300910582867933e-05, - "long_answer_loss": 0.1372, - "loss": 0.1361, - "short_answer_loss": NaN, - "step": 1165, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1568, - "grad_norm": 1.46875, - "learning_rate": 1.528582315403651e-05, - "long_answer_loss": 0.1568, - "loss": 0.1498, - "short_answer_loss": NaN, - "step": 1166, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1185, - "grad_norm": 1.4453125, - "learning_rate": 1.527073145339144e-05, - "long_answer_loss": 0.1185, - "loss": 0.1337, - "short_answer_loss": NaN, - "step": 1167, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1493, - "grad_norm": 1.4765625, - "learning_rate": 1.5255635504074503e-05, - "long_answer_loss": 0.1493, - "loss": 0.1374, - "short_answer_loss": NaN, - "step": 1168, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1183, - "grad_norm": 1.4375, - "learning_rate": 1.5240535329234012e-05, - "long_answer_loss": 0.1183, - "loss": 0.1348, - "short_answer_loss": NaN, - "step": 1169, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1362, - "grad_norm": 1.3203125, - "learning_rate": 1.522543095202475e-05, - "long_answer_loss": 0.1362, - "loss": 0.1335, - "short_answer_loss": NaN, - "step": 1170, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1614, - "grad_norm": 1.5, - "learning_rate": 1.5210322395607945e-05, - "long_answer_loss": 0.1614, - "loss": 0.1402, - "short_answer_loss": NaN, - "step": 1171, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1417, - "grad_norm": 1.515625, - "learning_rate": 1.519520968315123e-05, - "long_answer_loss": 0.1417, - "loss": 0.1339, - "short_answer_loss": NaN, - "step": 1172, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1462, - "grad_norm": 1.3984375, - "learning_rate": 1.5180092837828618e-05, - "long_answer_loss": 0.1462, - "loss": 0.1358, - "short_answer_loss": NaN, - "step": 1173, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1245, - "grad_norm": 1.5390625, - "learning_rate": 1.5164971882820456e-05, - "long_answer_loss": 0.1245, - "loss": 0.1403, - "short_answer_loss": NaN, - "step": 1174, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1299, - "grad_norm": 1.3671875, - "learning_rate": 1.5149846841313389e-05, - "long_answer_loss": 0.1299, - "loss": 0.1352, - "short_answer_loss": NaN, - "step": 1175, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.152, - "grad_norm": 1.375, - "learning_rate": 1.513471773650033e-05, - "long_answer_loss": 0.152, - "loss": 0.1356, - "short_answer_loss": NaN, - "step": 1176, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1594, - "grad_norm": 1.484375, - "learning_rate": 1.5119584591580429e-05, - "long_answer_loss": 0.1594, - "loss": 0.148, - "short_answer_loss": NaN, - "step": 1177, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1353, - "grad_norm": 1.390625, - "learning_rate": 1.5104447429759024e-05, - "long_answer_loss": 0.1353, - "loss": 0.1365, - "short_answer_loss": NaN, - "step": 1178, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1544, - "grad_norm": 1.390625, - "learning_rate": 1.5089306274247616e-05, - "long_answer_loss": 0.1544, - "loss": 0.1402, - "short_answer_loss": NaN, - "step": 1179, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1402, - "grad_norm": 1.3984375, - "learning_rate": 1.507416114826383e-05, - "long_answer_loss": 0.1402, - "loss": 0.1405, - "short_answer_loss": NaN, - "step": 1180, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1442, - "grad_norm": 1.4140625, - "learning_rate": 1.5059012075031378e-05, - "long_answer_loss": 0.1442, - "loss": 0.1393, - "short_answer_loss": NaN, - "step": 1181, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1276, - "grad_norm": 1.453125, - "learning_rate": 1.5043859077780026e-05, - "long_answer_loss": 0.1276, - "loss": 0.1354, - "short_answer_loss": NaN, - "step": 1182, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1279, - "grad_norm": 1.390625, - "learning_rate": 1.5028702179745554e-05, - "long_answer_loss": 0.1279, - "loss": 0.1331, - "short_answer_loss": NaN, - "step": 1183, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1547, - "grad_norm": 1.3515625, - "learning_rate": 1.501354140416973e-05, - "long_answer_loss": 0.1547, - "loss": 0.1367, - "short_answer_loss": NaN, - "step": 1184, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1252, - "grad_norm": 1.3828125, - "learning_rate": 1.4998376774300257e-05, - "long_answer_loss": 0.1252, - "loss": 0.1347, - "short_answer_loss": NaN, - "step": 1185, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1316, - "grad_norm": 1.484375, - "learning_rate": 1.498320831339076e-05, - "long_answer_loss": 0.1316, - "loss": 0.1353, - "short_answer_loss": NaN, - "step": 1186, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1248, - "grad_norm": 1.390625, - "learning_rate": 1.4968036044700729e-05, - "long_answer_loss": 0.1248, - "loss": 0.138, - "short_answer_loss": NaN, - "step": 1187, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.164, - "grad_norm": 1.453125, - "learning_rate": 1.4952859991495504e-05, - "long_answer_loss": 0.164, - "loss": 0.1331, - "short_answer_loss": NaN, - "step": 1188, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1244, - "grad_norm": 1.3984375, - "learning_rate": 1.4937680177046218e-05, - "long_answer_loss": 0.1244, - "loss": 0.1313, - "short_answer_loss": NaN, - "step": 1189, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1033, - "grad_norm": 1.4375, - "learning_rate": 1.4922496624629775e-05, - "long_answer_loss": 0.1033, - "loss": 0.1357, - "short_answer_loss": NaN, - "step": 1190, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1276, - "grad_norm": 1.515625, - "learning_rate": 1.4907309357528812e-05, - "long_answer_loss": 0.1276, - "loss": 0.1375, - "short_answer_loss": NaN, - "step": 1191, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1301, - "grad_norm": 1.59375, - "learning_rate": 1.489211839903166e-05, - "long_answer_loss": 0.1301, - "loss": 0.1302, - "short_answer_loss": NaN, - "step": 1192, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1377, - "grad_norm": 1.484375, - "learning_rate": 1.487692377243231e-05, - "long_answer_loss": 0.1377, - "loss": 0.1379, - "short_answer_loss": NaN, - "step": 1193, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1347, - "grad_norm": 1.671875, - "learning_rate": 1.4861725501030389e-05, - "long_answer_loss": 0.1347, - "loss": 0.1357, - "short_answer_loss": NaN, - "step": 1194, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.144, - "grad_norm": 1.375, - "learning_rate": 1.4846523608131088e-05, - "long_answer_loss": 0.144, - "loss": 0.1278, - "short_answer_loss": NaN, - "step": 1195, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.125, - "grad_norm": 1.46875, - "learning_rate": 1.4831318117045177e-05, - "long_answer_loss": 0.125, - "loss": 0.1355, - "short_answer_loss": NaN, - "step": 1196, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1339, - "grad_norm": 1.4375, - "learning_rate": 1.4816109051088931e-05, - "long_answer_loss": 0.1339, - "loss": 0.1412, - "short_answer_loss": NaN, - "step": 1197, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1622, - "grad_norm": 1.4921875, - "learning_rate": 1.4800896433584107e-05, - "long_answer_loss": 0.1622, - "loss": 0.1453, - "short_answer_loss": NaN, - "step": 1198, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1194, - "grad_norm": 1.3828125, - "learning_rate": 1.4785680287857911e-05, - "long_answer_loss": 0.1194, - "loss": 0.1314, - "short_answer_loss": NaN, - "step": 1199, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1294, - "grad_norm": 1.5234375, - "learning_rate": 1.4770460637242955e-05, - "long_answer_loss": 0.1294, - "loss": 0.133, - "short_answer_loss": NaN, - "step": 1200, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.13, - "grad_norm": 1.4296875, - "learning_rate": 1.4755237505077236e-05, - "long_answer_loss": 0.13, - "loss": 0.1367, - "short_answer_loss": NaN, - "step": 1201, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1223, - "grad_norm": 1.46875, - "learning_rate": 1.4740010914704071e-05, - "long_answer_loss": 0.1223, - "loss": 0.1364, - "short_answer_loss": NaN, - "step": 1202, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1293, - "grad_norm": 1.421875, - "learning_rate": 1.47247808894721e-05, - "long_answer_loss": 0.1293, - "loss": 0.1396, - "short_answer_loss": NaN, - "step": 1203, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1281, - "grad_norm": 1.375, - "learning_rate": 1.4709547452735223e-05, - "long_answer_loss": 0.1281, - "loss": 0.1289, - "short_answer_loss": NaN, - "step": 1204, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1258, - "grad_norm": 1.3984375, - "learning_rate": 1.4694310627852559e-05, - "long_answer_loss": 0.1258, - "loss": 0.1401, - "short_answer_loss": NaN, - "step": 1205, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1266, - "grad_norm": 1.5390625, - "learning_rate": 1.467907043818844e-05, - "long_answer_loss": 0.1266, - "loss": 0.1384, - "short_answer_loss": NaN, - "step": 1206, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1235, - "grad_norm": 1.40625, - "learning_rate": 1.4663826907112348e-05, - "long_answer_loss": 0.1235, - "loss": 0.1303, - "short_answer_loss": NaN, - "step": 1207, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1438, - "grad_norm": 1.4765625, - "learning_rate": 1.464858005799889e-05, - "long_answer_loss": 0.1438, - "loss": 0.1363, - "short_answer_loss": NaN, - "step": 1208, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1449, - "grad_norm": 1.6171875, - "learning_rate": 1.4633329914227761e-05, - "long_answer_loss": 0.1449, - "loss": 0.1393, - "short_answer_loss": NaN, - "step": 1209, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1329, - "grad_norm": 1.5859375, - "learning_rate": 1.4618076499183713e-05, - "long_answer_loss": 0.1329, - "loss": 0.1371, - "short_answer_loss": NaN, - "step": 1210, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1368, - "grad_norm": 1.5, - "learning_rate": 1.4602819836256507e-05, - "long_answer_loss": 0.1368, - "loss": 0.1397, - "short_answer_loss": NaN, - "step": 1211, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1345, - "grad_norm": 1.4765625, - "learning_rate": 1.4587559948840892e-05, - "long_answer_loss": 0.1345, - "loss": 0.1374, - "short_answer_loss": NaN, - "step": 1212, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1303, - "grad_norm": 1.3828125, - "learning_rate": 1.4572296860336552e-05, - "long_answer_loss": 0.1303, - "loss": 0.1303, - "short_answer_loss": NaN, - "step": 1213, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1356, - "grad_norm": 1.515625, - "learning_rate": 1.4557030594148086e-05, - "long_answer_loss": 0.1356, - "loss": 0.138, - "short_answer_loss": NaN, - "step": 1214, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1285, - "grad_norm": 1.484375, - "learning_rate": 1.4541761173684965e-05, - "long_answer_loss": 0.1285, - "loss": 0.1302, - "short_answer_loss": NaN, - "step": 1215, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1382, - "grad_norm": 1.515625, - "learning_rate": 1.4526488622361493e-05, - "long_answer_loss": 0.1382, - "loss": 0.1375, - "short_answer_loss": NaN, - "step": 1216, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1162, - "grad_norm": 1.390625, - "learning_rate": 1.4511212963596779e-05, - "long_answer_loss": 0.1162, - "loss": 0.1376, - "short_answer_loss": NaN, - "step": 1217, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1353, - "grad_norm": 1.4140625, - "learning_rate": 1.44959342208147e-05, - "long_answer_loss": 0.1353, - "loss": 0.1363, - "short_answer_loss": NaN, - "step": 1218, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.132, - "grad_norm": 1.4609375, - "learning_rate": 1.4480652417443854e-05, - "long_answer_loss": 0.132, - "loss": 0.1388, - "short_answer_loss": NaN, - "step": 1219, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1203, - "grad_norm": 1.484375, - "learning_rate": 1.446536757691754e-05, - "long_answer_loss": 0.1203, - "loss": 0.1386, - "short_answer_loss": NaN, - "step": 1220, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1492, - "grad_norm": 1.5, - "learning_rate": 1.4450079722673706e-05, - "long_answer_loss": 0.1492, - "loss": 0.1381, - "short_answer_loss": NaN, - "step": 1221, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1228, - "grad_norm": 1.4765625, - "learning_rate": 1.4434788878154928e-05, - "long_answer_loss": 0.1228, - "loss": 0.1335, - "short_answer_loss": NaN, - "step": 1222, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1201, - "grad_norm": 1.4375, - "learning_rate": 1.4419495066808364e-05, - "long_answer_loss": 0.1201, - "loss": 0.1344, - "short_answer_loss": NaN, - "step": 1223, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1342, - "grad_norm": 1.53125, - "learning_rate": 1.4404198312085723e-05, - "long_answer_loss": 0.1342, - "loss": 0.1393, - "short_answer_loss": NaN, - "step": 1224, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.13, - "grad_norm": 1.4140625, - "learning_rate": 1.438889863744323e-05, - "long_answer_loss": 0.13, - "loss": 0.1313, - "short_answer_loss": NaN, - "step": 1225, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1214, - "grad_norm": 1.4375, - "learning_rate": 1.4373596066341577e-05, - "long_answer_loss": 0.1214, - "loss": 0.1381, - "short_answer_loss": NaN, - "step": 1226, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1288, - "grad_norm": 1.5078125, - "learning_rate": 1.435829062224591e-05, - "long_answer_loss": 0.1288, - "loss": 0.1359, - "short_answer_loss": NaN, - "step": 1227, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1405, - "grad_norm": 1.5, - "learning_rate": 1.4342982328625774e-05, - "long_answer_loss": 0.1405, - "loss": 0.1336, - "short_answer_loss": NaN, - "step": 1228, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1062, - "grad_norm": 1.453125, - "learning_rate": 1.4327671208955082e-05, - "long_answer_loss": 0.1062, - "loss": 0.1299, - "short_answer_loss": NaN, - "step": 1229, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1304, - "grad_norm": 1.46875, - "learning_rate": 1.4312357286712085e-05, - "long_answer_loss": 0.1304, - "loss": 0.1324, - "short_answer_loss": NaN, - "step": 1230, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1071, - "grad_norm": 1.390625, - "learning_rate": 1.4297040585379332e-05, - "long_answer_loss": 0.1071, - "loss": 0.1306, - "short_answer_loss": NaN, - "step": 1231, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.129, - "grad_norm": 1.3515625, - "learning_rate": 1.4281721128443625e-05, - "long_answer_loss": 0.129, - "loss": 0.1285, - "short_answer_loss": NaN, - "step": 1232, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.135, - "grad_norm": 1.59375, - "learning_rate": 1.4266398939396006e-05, - "long_answer_loss": 0.135, - "loss": 0.1428, - "short_answer_loss": NaN, - "step": 1233, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1537, - "grad_norm": 1.4921875, - "learning_rate": 1.4251074041731694e-05, - "long_answer_loss": 0.1537, - "loss": 0.1393, - "short_answer_loss": NaN, - "step": 1234, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1304, - "grad_norm": 1.421875, - "learning_rate": 1.4235746458950061e-05, - "long_answer_loss": 0.1304, - "loss": 0.1339, - "short_answer_loss": NaN, - "step": 1235, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1146, - "grad_norm": 1.40625, - "learning_rate": 1.422041621455461e-05, - "long_answer_loss": 0.1146, - "loss": 0.132, - "short_answer_loss": NaN, - "step": 1236, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1235, - "grad_norm": 1.4296875, - "learning_rate": 1.4205083332052906e-05, - "long_answer_loss": 0.1235, - "loss": 0.1316, - "short_answer_loss": NaN, - "step": 1237, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1267, - "grad_norm": 1.4296875, - "learning_rate": 1.4189747834956576e-05, - "long_answer_loss": 0.1267, - "loss": 0.1321, - "short_answer_loss": NaN, - "step": 1238, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1219, - "grad_norm": 1.484375, - "learning_rate": 1.4174409746781247e-05, - "long_answer_loss": 0.1219, - "loss": 0.1335, - "short_answer_loss": NaN, - "step": 1239, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1359, - "grad_norm": 1.5234375, - "learning_rate": 1.4159069091046526e-05, - "long_answer_loss": 0.1359, - "loss": 0.1457, - "short_answer_loss": NaN, - "step": 1240, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1002, - "grad_norm": 1.484375, - "learning_rate": 1.4143725891275946e-05, - "long_answer_loss": 0.1002, - "loss": 0.1367, - "short_answer_loss": NaN, - "step": 1241, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1268, - "grad_norm": 1.4140625, - "learning_rate": 1.412838017099696e-05, - "long_answer_loss": 0.1268, - "loss": 0.1263, - "short_answer_loss": NaN, - "step": 1242, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1261, - "grad_norm": 1.484375, - "learning_rate": 1.411303195374086e-05, - "long_answer_loss": 0.1261, - "loss": 0.1423, - "short_answer_loss": NaN, - "step": 1243, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1258, - "grad_norm": 1.4140625, - "learning_rate": 1.4097681263042789e-05, - "long_answer_loss": 0.1258, - "loss": 0.1346, - "short_answer_loss": NaN, - "step": 1244, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1439, - "grad_norm": 1.4453125, - "learning_rate": 1.4082328122441676e-05, - "long_answer_loss": 0.1439, - "loss": 0.1404, - "short_answer_loss": NaN, - "step": 1245, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1692, - "grad_norm": 1.484375, - "learning_rate": 1.4066972555480201e-05, - "long_answer_loss": 0.1692, - "loss": 0.1408, - "short_answer_loss": NaN, - "step": 1246, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1202, - "grad_norm": 1.3828125, - "learning_rate": 1.405161458570477e-05, - "long_answer_loss": 0.1202, - "loss": 0.1311, - "short_answer_loss": NaN, - "step": 1247, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1195, - "grad_norm": 1.421875, - "learning_rate": 1.4036254236665472e-05, - "long_answer_loss": 0.1195, - "loss": 0.137, - "short_answer_loss": NaN, - "step": 1248, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1264, - "grad_norm": 1.4375, - "learning_rate": 1.4020891531916047e-05, - "long_answer_loss": 0.1264, - "loss": 0.1485, - "short_answer_loss": NaN, - "step": 1249, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1213, - "grad_norm": 1.3515625, - "learning_rate": 1.4005526495013848e-05, - "long_answer_loss": 0.1213, - "loss": 0.1246, - "short_answer_loss": NaN, - "step": 1250, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1144, - "grad_norm": 1.375, - "learning_rate": 1.3990159149519797e-05, - "long_answer_loss": 0.1144, - "loss": 0.1242, - "short_answer_loss": NaN, - "step": 1251, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1169, - "grad_norm": 1.40625, - "learning_rate": 1.397478951899836e-05, - "long_answer_loss": 0.1169, - "loss": 0.1403, - "short_answer_loss": NaN, - "step": 1252, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1254, - "grad_norm": 1.4375, - "learning_rate": 1.3959417627017507e-05, - "long_answer_loss": 0.1254, - "loss": 0.1304, - "short_answer_loss": NaN, - "step": 1253, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1331, - "grad_norm": 1.390625, - "learning_rate": 1.3944043497148682e-05, - "long_answer_loss": 0.1331, - "loss": 0.132, - "short_answer_loss": NaN, - "step": 1254, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1532, - "grad_norm": 1.453125, - "learning_rate": 1.3928667152966748e-05, - "long_answer_loss": 0.1532, - "loss": 0.1364, - "short_answer_loss": NaN, - "step": 1255, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1598, - "grad_norm": 1.484375, - "learning_rate": 1.3913288618049975e-05, - "long_answer_loss": 0.1598, - "loss": 0.1348, - "short_answer_loss": NaN, - "step": 1256, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1317, - "grad_norm": 1.390625, - "learning_rate": 1.3897907915979984e-05, - "long_answer_loss": 0.1317, - "loss": 0.1392, - "short_answer_loss": NaN, - "step": 1257, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1521, - "grad_norm": 1.4609375, - "learning_rate": 1.3882525070341725e-05, - "long_answer_loss": 0.1521, - "loss": 0.1376, - "short_answer_loss": NaN, - "step": 1258, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1554, - "grad_norm": 1.4296875, - "learning_rate": 1.3867140104723433e-05, - "long_answer_loss": 0.1554, - "loss": 0.1407, - "short_answer_loss": NaN, - "step": 1259, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1281, - "grad_norm": 1.421875, - "learning_rate": 1.385175304271659e-05, - "long_answer_loss": 0.1281, - "loss": 0.1437, - "short_answer_loss": NaN, - "step": 1260, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1242, - "grad_norm": 1.421875, - "learning_rate": 1.3836363907915894e-05, - "long_answer_loss": 0.1242, - "loss": 0.1281, - "short_answer_loss": NaN, - "step": 1261, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1099, - "grad_norm": 1.390625, - "learning_rate": 1.3820972723919231e-05, - "long_answer_loss": 0.1099, - "loss": 0.1326, - "short_answer_loss": NaN, - "step": 1262, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1166, - "grad_norm": 1.3515625, - "learning_rate": 1.3805579514327616e-05, - "long_answer_loss": 0.1166, - "loss": 0.1307, - "short_answer_loss": NaN, - "step": 1263, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1346, - "grad_norm": 1.4921875, - "learning_rate": 1.379018430274518e-05, - "long_answer_loss": 0.1346, - "loss": 0.1366, - "short_answer_loss": NaN, - "step": 1264, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1454, - "grad_norm": 1.375, - "learning_rate": 1.3774787112779117e-05, - "long_answer_loss": 0.1454, - "loss": 0.1294, - "short_answer_loss": NaN, - "step": 1265, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1384, - "grad_norm": 1.4921875, - "learning_rate": 1.3759387968039658e-05, - "long_answer_loss": 0.1384, - "loss": 0.1391, - "short_answer_loss": NaN, - "step": 1266, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.134, - "grad_norm": 1.40625, - "learning_rate": 1.374398689214003e-05, - "long_answer_loss": 0.134, - "loss": 0.1397, - "short_answer_loss": NaN, - "step": 1267, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1302, - "grad_norm": 1.5234375, - "learning_rate": 1.3728583908696418e-05, - "long_answer_loss": 0.1302, - "loss": 0.142, - "short_answer_loss": NaN, - "step": 1268, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1454, - "grad_norm": 1.453125, - "learning_rate": 1.3713179041327946e-05, - "long_answer_loss": 0.1454, - "loss": 0.1338, - "short_answer_loss": NaN, - "step": 1269, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1291, - "grad_norm": 1.484375, - "learning_rate": 1.3697772313656607e-05, - "long_answer_loss": 0.1291, - "loss": 0.1334, - "short_answer_loss": NaN, - "step": 1270, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1658, - "grad_norm": 1.4375, - "learning_rate": 1.3682363749307261e-05, - "long_answer_loss": 0.1658, - "loss": 0.1341, - "short_answer_loss": NaN, - "step": 1271, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1131, - "grad_norm": 1.34375, - "learning_rate": 1.3666953371907584e-05, - "long_answer_loss": 0.1131, - "loss": 0.1291, - "short_answer_loss": NaN, - "step": 1272, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1271, - "grad_norm": 1.3671875, - "learning_rate": 1.3651541205088022e-05, - "long_answer_loss": 0.1271, - "loss": 0.131, - "short_answer_loss": NaN, - "step": 1273, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1428, - "grad_norm": 1.453125, - "learning_rate": 1.3636127272481772e-05, - "long_answer_loss": 0.1428, - "loss": 0.1287, - "short_answer_loss": NaN, - "step": 1274, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1256, - "grad_norm": 1.375, - "learning_rate": 1.3620711597724739e-05, - "long_answer_loss": 0.1256, - "loss": 0.1322, - "short_answer_loss": NaN, - "step": 1275, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.14, - "grad_norm": 1.4453125, - "learning_rate": 1.3605294204455502e-05, - "long_answer_loss": 0.14, - "loss": 0.14, - "short_answer_loss": NaN, - "step": 1276, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1428, - "grad_norm": 1.3359375, - "learning_rate": 1.3589875116315259e-05, - "long_answer_loss": 0.1428, - "loss": 0.1293, - "short_answer_loss": NaN, - "step": 1277, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1218, - "grad_norm": 1.4765625, - "learning_rate": 1.3574454356947833e-05, - "long_answer_loss": 0.1218, - "loss": 0.1305, - "short_answer_loss": NaN, - "step": 1278, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1426, - "grad_norm": 1.328125, - "learning_rate": 1.3559031949999587e-05, - "long_answer_loss": 0.1426, - "loss": 0.129, - "short_answer_loss": NaN, - "step": 1279, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1252, - "grad_norm": 1.46875, - "learning_rate": 1.3543607919119425e-05, - "long_answer_loss": 0.1252, - "loss": 0.1349, - "short_answer_loss": NaN, - "step": 1280, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.142, - "grad_norm": 1.4453125, - "learning_rate": 1.3528182287958733e-05, - "long_answer_loss": 0.142, - "loss": 0.1457, - "short_answer_loss": NaN, - "step": 1281, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1327, - "grad_norm": 1.5, - "learning_rate": 1.3512755080171349e-05, - "long_answer_loss": 0.1327, - "loss": 0.1335, - "short_answer_loss": NaN, - "step": 1282, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1111, - "grad_norm": 1.46875, - "learning_rate": 1.3497326319413539e-05, - "long_answer_loss": 0.1111, - "loss": 0.1291, - "short_answer_loss": NaN, - "step": 1283, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1178, - "grad_norm": 1.3515625, - "learning_rate": 1.3481896029343943e-05, - "long_answer_loss": 0.1178, - "loss": 0.1198, - "short_answer_loss": NaN, - "step": 1284, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1229, - "grad_norm": 1.46875, - "learning_rate": 1.3466464233623546e-05, - "long_answer_loss": 0.1229, - "loss": 0.1341, - "short_answer_loss": NaN, - "step": 1285, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.124, - "grad_norm": 1.453125, - "learning_rate": 1.345103095591565e-05, - "long_answer_loss": 0.124, - "loss": 0.1314, - "short_answer_loss": NaN, - "step": 1286, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1297, - "grad_norm": 1.390625, - "learning_rate": 1.343559621988581e-05, - "long_answer_loss": 0.1297, - "loss": 0.1293, - "short_answer_loss": NaN, - "step": 1287, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1322, - "grad_norm": 1.5, - "learning_rate": 1.3420160049201841e-05, - "long_answer_loss": 0.1322, - "loss": 0.1362, - "short_answer_loss": NaN, - "step": 1288, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1261, - "grad_norm": 1.4609375, - "learning_rate": 1.340472246753374e-05, - "long_answer_loss": 0.1261, - "loss": 0.1346, - "short_answer_loss": NaN, - "step": 1289, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1178, - "grad_norm": 1.4453125, - "learning_rate": 1.3389283498553678e-05, - "long_answer_loss": 0.1178, - "loss": 0.1348, - "short_answer_loss": NaN, - "step": 1290, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1238, - "grad_norm": 1.5, - "learning_rate": 1.3373843165935945e-05, - "long_answer_loss": 0.1238, - "loss": 0.1397, - "short_answer_loss": NaN, - "step": 1291, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1396, - "grad_norm": 1.4375, - "learning_rate": 1.3358401493356934e-05, - "long_answer_loss": 0.1396, - "loss": 0.133, - "short_answer_loss": NaN, - "step": 1292, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1232, - "grad_norm": 1.4296875, - "learning_rate": 1.3342958504495083e-05, - "long_answer_loss": 0.1232, - "loss": 0.1307, - "short_answer_loss": NaN, - "step": 1293, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1483, - "grad_norm": 1.4296875, - "learning_rate": 1.3327514223030845e-05, - "long_answer_loss": 0.1483, - "loss": 0.1354, - "short_answer_loss": NaN, - "step": 1294, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1183, - "grad_norm": 1.453125, - "learning_rate": 1.3312068672646671e-05, - "long_answer_loss": 0.1183, - "loss": 0.1431, - "short_answer_loss": NaN, - "step": 1295, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1229, - "grad_norm": 1.359375, - "learning_rate": 1.3296621877026938e-05, - "long_answer_loss": 0.1229, - "loss": 0.1384, - "short_answer_loss": NaN, - "step": 1296, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1218, - "grad_norm": 1.4296875, - "learning_rate": 1.3281173859857951e-05, - "long_answer_loss": 0.1218, - "loss": 0.1323, - "short_answer_loss": NaN, - "step": 1297, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1291, - "grad_norm": 1.421875, - "learning_rate": 1.3265724644827873e-05, - "long_answer_loss": 0.1291, - "loss": 0.1328, - "short_answer_loss": NaN, - "step": 1298, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1104, - "grad_norm": 1.5078125, - "learning_rate": 1.325027425562671e-05, - "long_answer_loss": 0.1104, - "loss": 0.1295, - "short_answer_loss": NaN, - "step": 1299, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1227, - "grad_norm": 1.34375, - "learning_rate": 1.3234822715946272e-05, - "long_answer_loss": 0.1227, - "loss": 0.1301, - "short_answer_loss": NaN, - "step": 1300, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.131, - "grad_norm": 1.421875, - "learning_rate": 1.3219370049480128e-05, - "long_answer_loss": 0.131, - "loss": 0.1362, - "short_answer_loss": NaN, - "step": 1301, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1161, - "grad_norm": 1.4140625, - "learning_rate": 1.3203916279923579e-05, - "long_answer_loss": 0.1161, - "loss": 0.1242, - "short_answer_loss": NaN, - "step": 1302, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1381, - "grad_norm": 1.4296875, - "learning_rate": 1.3188461430973612e-05, - "long_answer_loss": 0.1381, - "loss": 0.1407, - "short_answer_loss": NaN, - "step": 1303, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.146, - "grad_norm": 1.4921875, - "learning_rate": 1.3173005526328875e-05, - "long_answer_loss": 0.146, - "loss": 0.1309, - "short_answer_loss": NaN, - "step": 1304, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1337, - "grad_norm": 1.4453125, - "learning_rate": 1.3157548589689625e-05, - "long_answer_loss": 0.1337, - "loss": 0.1295, - "short_answer_loss": NaN, - "step": 1305, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1376, - "grad_norm": 1.3515625, - "learning_rate": 1.3142090644757719e-05, - "long_answer_loss": 0.1376, - "loss": 0.125, - "short_answer_loss": NaN, - "step": 1306, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1283, - "grad_norm": 1.3671875, - "learning_rate": 1.3126631715236546e-05, - "long_answer_loss": 0.1283, - "loss": 0.1245, - "short_answer_loss": NaN, - "step": 1307, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1443, - "grad_norm": 1.359375, - "learning_rate": 1.3111171824831004e-05, - "long_answer_loss": 0.1443, - "loss": 0.1283, - "short_answer_loss": NaN, - "step": 1308, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.0887, - "grad_norm": 1.28125, - "learning_rate": 1.3095710997247474e-05, - "long_answer_loss": 0.0887, - "loss": 0.0991, - "short_answer_loss": NaN, - "step": 1309, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.0883, - "grad_norm": 1.2265625, - "learning_rate": 1.3080249256193766e-05, - "long_answer_loss": 0.0883, - "loss": 0.0919, - "short_answer_loss": NaN, - "step": 1310, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.089, - "grad_norm": 1.1640625, - "learning_rate": 1.3064786625379096e-05, - "long_answer_loss": 0.089, - "loss": 0.0808, - "short_answer_loss": NaN, - "step": 1311, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.0936, - "grad_norm": 1.2265625, - "learning_rate": 1.3049323128514041e-05, - "long_answer_loss": 0.0936, - "loss": 0.0863, - "short_answer_loss": NaN, - "step": 1312, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.0967, - "grad_norm": 1.3125, - "learning_rate": 1.3033858789310504e-05, - "long_answer_loss": 0.0967, - "loss": 0.093, - "short_answer_loss": NaN, - "step": 1313, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.0807, - "grad_norm": 1.3046875, - "learning_rate": 1.3018393631481686e-05, - "long_answer_loss": 0.0807, - "loss": 0.0824, - "short_answer_loss": NaN, - "step": 1314, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0766, - "grad_norm": 1.3984375, - "learning_rate": 1.3002927678742044e-05, - "long_answer_loss": 0.0766, - "loss": 0.0875, - "short_answer_loss": NaN, - "step": 1315, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.086, - "grad_norm": 1.3984375, - "learning_rate": 1.298746095480724e-05, - "long_answer_loss": 0.086, - "loss": 0.0831, - "short_answer_loss": NaN, - "step": 1316, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.1022, - "grad_norm": 1.5546875, - "learning_rate": 1.297199348339414e-05, - "long_answer_loss": 0.1022, - "loss": 0.087, - "short_answer_loss": NaN, - "step": 1317, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0888, - "grad_norm": 1.5234375, - "learning_rate": 1.2956525288220738e-05, - "long_answer_loss": 0.0888, - "loss": 0.0874, - "short_answer_loss": NaN, - "step": 1318, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.071, - "grad_norm": 1.578125, - "learning_rate": 1.2941056393006144e-05, - "long_answer_loss": 0.071, - "loss": 0.0926, - "short_answer_loss": NaN, - "step": 1319, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0929, - "grad_norm": 1.609375, - "learning_rate": 1.2925586821470542e-05, - "long_answer_loss": 0.0929, - "loss": 0.0822, - "short_answer_loss": NaN, - "step": 1320, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.07, - "grad_norm": 1.46875, - "learning_rate": 1.2910116597335157e-05, - "long_answer_loss": 0.07, - "loss": 0.0808, - "short_answer_loss": NaN, - "step": 1321, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0737, - "grad_norm": 1.5703125, - "learning_rate": 1.2894645744322203e-05, - "long_answer_loss": 0.0737, - "loss": 0.0858, - "short_answer_loss": NaN, - "step": 1322, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0703, - "grad_norm": 1.4375, - "learning_rate": 1.2879174286154874e-05, - "long_answer_loss": 0.0703, - "loss": 0.0835, - "short_answer_loss": NaN, - "step": 1323, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.1101, - "grad_norm": 1.578125, - "learning_rate": 1.2863702246557283e-05, - "long_answer_loss": 0.1101, - "loss": 0.0935, - "short_answer_loss": NaN, - "step": 1324, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0851, - "grad_norm": 1.4453125, - "learning_rate": 1.2848229649254435e-05, - "long_answer_loss": 0.0851, - "loss": 0.0848, - "short_answer_loss": NaN, - "step": 1325, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0851, - "grad_norm": 1.46875, - "learning_rate": 1.2832756517972185e-05, - "long_answer_loss": 0.0851, - "loss": 0.0859, - "short_answer_loss": NaN, - "step": 1326, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0851, - "grad_norm": 1.3671875, - "learning_rate": 1.2817282876437223e-05, - "long_answer_loss": 0.0851, - "loss": 0.0809, - "short_answer_loss": NaN, - "step": 1327, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0889, - "grad_norm": 1.390625, - "learning_rate": 1.2801808748377e-05, - "long_answer_loss": 0.0889, - "loss": 0.0892, - "short_answer_loss": NaN, - "step": 1328, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.1083, - "grad_norm": 1.3984375, - "learning_rate": 1.2786334157519733e-05, - "long_answer_loss": 0.1083, - "loss": 0.0855, - "short_answer_loss": NaN, - "step": 1329, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.086, - "grad_norm": 1.359375, - "learning_rate": 1.2770859127594334e-05, - "long_answer_loss": 0.086, - "loss": 0.0908, - "short_answer_loss": NaN, - "step": 1330, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0867, - "grad_norm": 1.28125, - "learning_rate": 1.2755383682330394e-05, - "long_answer_loss": 0.0867, - "loss": 0.0827, - "short_answer_loss": NaN, - "step": 1331, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.1012, - "grad_norm": 1.53125, - "learning_rate": 1.2739907845458146e-05, - "long_answer_loss": 0.1012, - "loss": 0.0903, - "short_answer_loss": NaN, - "step": 1332, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0957, - "grad_norm": 1.34375, - "learning_rate": 1.2724431640708418e-05, - "long_answer_loss": 0.0957, - "loss": 0.0819, - "short_answer_loss": NaN, - "step": 1333, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0999, - "grad_norm": 1.4140625, - "learning_rate": 1.2708955091812593e-05, - "long_answer_loss": 0.0999, - "loss": 0.0926, - "short_answer_loss": NaN, - "step": 1334, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0748, - "grad_norm": 1.3828125, - "learning_rate": 1.2693478222502604e-05, - "long_answer_loss": 0.0748, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 1335, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0813, - "grad_norm": 1.3203125, - "learning_rate": 1.2678001056510854e-05, - "long_answer_loss": 0.0813, - "loss": 0.0821, - "short_answer_loss": NaN, - "step": 1336, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0686, - "grad_norm": 1.515625, - "learning_rate": 1.2662523617570213e-05, - "long_answer_loss": 0.0686, - "loss": 0.0884, - "short_answer_loss": NaN, - "step": 1337, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.1061, - "grad_norm": 1.390625, - "learning_rate": 1.2647045929413966e-05, - "long_answer_loss": 0.1061, - "loss": 0.0875, - "short_answer_loss": NaN, - "step": 1338, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.106, - "grad_norm": 1.3984375, - "learning_rate": 1.2631568015775777e-05, - "long_answer_loss": 0.106, - "loss": 0.0869, - "short_answer_loss": NaN, - "step": 1339, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.1007, - "grad_norm": 1.4609375, - "learning_rate": 1.2616089900389663e-05, - "long_answer_loss": 0.1007, - "loss": 0.0858, - "short_answer_loss": NaN, - "step": 1340, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0857, - "grad_norm": 1.4453125, - "learning_rate": 1.2600611606989945e-05, - "long_answer_loss": 0.0857, - "loss": 0.0858, - "short_answer_loss": NaN, - "step": 1341, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0922, - "grad_norm": 1.375, - "learning_rate": 1.2585133159311217e-05, - "long_answer_loss": 0.0922, - "loss": 0.0852, - "short_answer_loss": NaN, - "step": 1342, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0946, - "grad_norm": 1.3984375, - "learning_rate": 1.256965458108831e-05, - "long_answer_loss": 0.0946, - "loss": 0.0846, - "short_answer_loss": NaN, - "step": 1343, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.092, - "grad_norm": 1.3984375, - "learning_rate": 1.2554175896056259e-05, - "long_answer_loss": 0.092, - "loss": 0.0858, - "short_answer_loss": NaN, - "step": 1344, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0707, - "grad_norm": 1.2578125, - "learning_rate": 1.2538697127950258e-05, - "long_answer_loss": 0.0707, - "loss": 0.0788, - "short_answer_loss": NaN, - "step": 1345, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0801, - "grad_norm": 1.3515625, - "learning_rate": 1.252321830050563e-05, - "long_answer_loss": 0.0801, - "loss": 0.0839, - "short_answer_loss": NaN, - "step": 1346, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0828, - "grad_norm": 1.390625, - "learning_rate": 1.2507739437457795e-05, - "long_answer_loss": 0.0828, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 1347, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.1096, - "grad_norm": 1.4921875, - "learning_rate": 1.249226056254221e-05, - "long_answer_loss": 0.1096, - "loss": 0.0884, - "short_answer_loss": NaN, - "step": 1348, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0804, - "grad_norm": 1.3984375, - "learning_rate": 1.2476781699494372e-05, - "long_answer_loss": 0.0804, - "loss": 0.0852, - "short_answer_loss": NaN, - "step": 1349, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.083, - "grad_norm": 1.484375, - "learning_rate": 1.2461302872049741e-05, - "long_answer_loss": 0.083, - "loss": 0.0818, - "short_answer_loss": NaN, - "step": 1350, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0825, - "grad_norm": 1.328125, - "learning_rate": 1.2445824103943744e-05, - "long_answer_loss": 0.0825, - "loss": 0.0838, - "short_answer_loss": NaN, - "step": 1351, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.083, - "grad_norm": 1.390625, - "learning_rate": 1.243034541891169e-05, - "long_answer_loss": 0.083, - "loss": 0.0853, - "short_answer_loss": NaN, - "step": 1352, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0684, - "grad_norm": 1.359375, - "learning_rate": 1.2414866840688786e-05, - "long_answer_loss": 0.0684, - "loss": 0.0807, - "short_answer_loss": NaN, - "step": 1353, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0827, - "grad_norm": 1.4453125, - "learning_rate": 1.239938839301006e-05, - "long_answer_loss": 0.0827, - "loss": 0.0833, - "short_answer_loss": NaN, - "step": 1354, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0866, - "grad_norm": 1.4375, - "learning_rate": 1.238391009961034e-05, - "long_answer_loss": 0.0866, - "loss": 0.0867, - "short_answer_loss": NaN, - "step": 1355, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0857, - "grad_norm": 1.3046875, - "learning_rate": 1.2368431984224226e-05, - "long_answer_loss": 0.0857, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 1356, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0837, - "grad_norm": 1.4765625, - "learning_rate": 1.2352954070586036e-05, - "long_answer_loss": 0.0837, - "loss": 0.084, - "short_answer_loss": NaN, - "step": 1357, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0797, - "grad_norm": 1.34375, - "learning_rate": 1.2337476382429791e-05, - "long_answer_loss": 0.0797, - "loss": 0.0845, - "short_answer_loss": NaN, - "step": 1358, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0851, - "grad_norm": 1.390625, - "learning_rate": 1.2321998943489147e-05, - "long_answer_loss": 0.0851, - "loss": 0.0808, - "short_answer_loss": NaN, - "step": 1359, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0974, - "grad_norm": 1.4140625, - "learning_rate": 1.23065217774974e-05, - "long_answer_loss": 0.0974, - "loss": 0.0881, - "short_answer_loss": NaN, - "step": 1360, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0969, - "grad_norm": 1.2890625, - "learning_rate": 1.2291044908187405e-05, - "long_answer_loss": 0.0969, - "loss": 0.0832, - "short_answer_loss": NaN, - "step": 1361, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0731, - "grad_norm": 1.421875, - "learning_rate": 1.2275568359291587e-05, - "long_answer_loss": 0.0731, - "loss": 0.0834, - "short_answer_loss": NaN, - "step": 1362, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.1057, - "grad_norm": 1.5078125, - "learning_rate": 1.2260092154541857e-05, - "long_answer_loss": 0.1057, - "loss": 0.0897, - "short_answer_loss": NaN, - "step": 1363, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0856, - "grad_norm": 1.484375, - "learning_rate": 1.2244616317669607e-05, - "long_answer_loss": 0.0856, - "loss": 0.0881, - "short_answer_loss": NaN, - "step": 1364, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0663, - "grad_norm": 1.359375, - "learning_rate": 1.2229140872405672e-05, - "long_answer_loss": 0.0663, - "loss": 0.0864, - "short_answer_loss": NaN, - "step": 1365, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0932, - "grad_norm": 1.359375, - "learning_rate": 1.2213665842480271e-05, - "long_answer_loss": 0.0932, - "loss": 0.0829, - "short_answer_loss": NaN, - "step": 1366, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0804, - "grad_norm": 1.296875, - "learning_rate": 1.2198191251623006e-05, - "long_answer_loss": 0.0804, - "loss": 0.0845, - "short_answer_loss": NaN, - "step": 1367, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0857, - "grad_norm": 1.328125, - "learning_rate": 1.218271712356278e-05, - "long_answer_loss": 0.0857, - "loss": 0.0814, - "short_answer_loss": NaN, - "step": 1368, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0953, - "grad_norm": 1.4296875, - "learning_rate": 1.2167243482027816e-05, - "long_answer_loss": 0.0953, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 1369, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0867, - "grad_norm": 1.609375, - "learning_rate": 1.2151770350745568e-05, - "long_answer_loss": 0.0867, - "loss": 0.0822, - "short_answer_loss": NaN, - "step": 1370, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0701, - "grad_norm": 1.3359375, - "learning_rate": 1.2136297753442721e-05, - "long_answer_loss": 0.0701, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1371, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.081, - "grad_norm": 1.3984375, - "learning_rate": 1.2120825713845125e-05, - "long_answer_loss": 0.081, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 1372, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0932, - "grad_norm": 1.453125, - "learning_rate": 1.2105354255677798e-05, - "long_answer_loss": 0.0932, - "loss": 0.0843, - "short_answer_loss": NaN, - "step": 1373, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0708, - "grad_norm": 1.4375, - "learning_rate": 1.2089883402664851e-05, - "long_answer_loss": 0.0708, - "loss": 0.0843, - "short_answer_loss": NaN, - "step": 1374, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.08, - "grad_norm": 1.34375, - "learning_rate": 1.2074413178529461e-05, - "long_answer_loss": 0.08, - "loss": 0.0828, - "short_answer_loss": NaN, - "step": 1375, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0834, - "grad_norm": 1.375, - "learning_rate": 1.2058943606993861e-05, - "long_answer_loss": 0.0834, - "loss": 0.0835, - "short_answer_loss": NaN, - "step": 1376, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.081, - "grad_norm": 1.4453125, - "learning_rate": 1.2043474711779263e-05, - "long_answer_loss": 0.081, - "loss": 0.0861, - "short_answer_loss": NaN, - "step": 1377, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0948, - "grad_norm": 1.4609375, - "learning_rate": 1.2028006516605863e-05, - "long_answer_loss": 0.0948, - "loss": 0.0867, - "short_answer_loss": NaN, - "step": 1378, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.1134, - "grad_norm": 1.4140625, - "learning_rate": 1.2012539045192759e-05, - "long_answer_loss": 0.1134, - "loss": 0.079, - "short_answer_loss": NaN, - "step": 1379, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0703, - "grad_norm": 1.46875, - "learning_rate": 1.199707232125796e-05, - "long_answer_loss": 0.0703, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 1380, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.1042, - "grad_norm": 1.4375, - "learning_rate": 1.1981606368518313e-05, - "long_answer_loss": 0.1042, - "loss": 0.0879, - "short_answer_loss": NaN, - "step": 1381, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0727, - "grad_norm": 1.421875, - "learning_rate": 1.1966141210689497e-05, - "long_answer_loss": 0.0727, - "loss": 0.0842, - "short_answer_loss": NaN, - "step": 1382, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0665, - "grad_norm": 1.4375, - "learning_rate": 1.195067687148596e-05, - "long_answer_loss": 0.0665, - "loss": 0.0764, - "short_answer_loss": NaN, - "step": 1383, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0811, - "grad_norm": 1.4296875, - "learning_rate": 1.1935213374620907e-05, - "long_answer_loss": 0.0811, - "loss": 0.0875, - "short_answer_loss": NaN, - "step": 1384, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0858, - "grad_norm": 1.453125, - "learning_rate": 1.1919750743806239e-05, - "long_answer_loss": 0.0858, - "loss": 0.0847, - "short_answer_loss": NaN, - "step": 1385, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0793, - "grad_norm": 1.4296875, - "learning_rate": 1.1904289002752529e-05, - "long_answer_loss": 0.0793, - "loss": 0.0786, - "short_answer_loss": NaN, - "step": 1386, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0769, - "grad_norm": 1.375, - "learning_rate": 1.1888828175169e-05, - "long_answer_loss": 0.0769, - "loss": 0.0839, - "short_answer_loss": NaN, - "step": 1387, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0846, - "grad_norm": 1.296875, - "learning_rate": 1.1873368284763457e-05, - "long_answer_loss": 0.0846, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 1388, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0824, - "grad_norm": 1.4765625, - "learning_rate": 1.1857909355242283e-05, - "long_answer_loss": 0.0824, - "loss": 0.0823, - "short_answer_loss": NaN, - "step": 1389, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0772, - "grad_norm": 1.3984375, - "learning_rate": 1.1842451410310373e-05, - "long_answer_loss": 0.0772, - "loss": 0.0846, - "short_answer_loss": NaN, - "step": 1390, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0789, - "grad_norm": 1.40625, - "learning_rate": 1.182699447367113e-05, - "long_answer_loss": 0.0789, - "loss": 0.082, - "short_answer_loss": NaN, - "step": 1391, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0912, - "grad_norm": 1.40625, - "learning_rate": 1.1811538569026391e-05, - "long_answer_loss": 0.0912, - "loss": 0.0846, - "short_answer_loss": NaN, - "step": 1392, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0715, - "grad_norm": 1.328125, - "learning_rate": 1.1796083720076426e-05, - "long_answer_loss": 0.0715, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 1393, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0857, - "grad_norm": 1.375, - "learning_rate": 1.1780629950519875e-05, - "long_answer_loss": 0.0857, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 1394, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.098, - "grad_norm": 1.3671875, - "learning_rate": 1.1765177284053731e-05, - "long_answer_loss": 0.098, - "loss": 0.0819, - "short_answer_loss": NaN, - "step": 1395, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0813, - "grad_norm": 1.4453125, - "learning_rate": 1.1749725744373295e-05, - "long_answer_loss": 0.0813, - "loss": 0.0824, - "short_answer_loss": NaN, - "step": 1396, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0734, - "grad_norm": 1.328125, - "learning_rate": 1.173427535517213e-05, - "long_answer_loss": 0.0734, - "loss": 0.0822, - "short_answer_loss": NaN, - "step": 1397, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0958, - "grad_norm": 1.3359375, - "learning_rate": 1.1718826140142055e-05, - "long_answer_loss": 0.0958, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 1398, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0984, - "grad_norm": 1.5625, - "learning_rate": 1.170337812297306e-05, - "long_answer_loss": 0.0984, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 1399, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0702, - "grad_norm": 1.4296875, - "learning_rate": 1.1687931327353333e-05, - "long_answer_loss": 0.0702, - "loss": 0.0832, - "short_answer_loss": NaN, - "step": 1400, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0731, - "grad_norm": 1.421875, - "learning_rate": 1.1672485776969156e-05, - "long_answer_loss": 0.0731, - "loss": 0.0833, - "short_answer_loss": NaN, - "step": 1401, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0774, - "grad_norm": 1.453125, - "learning_rate": 1.1657041495504922e-05, - "long_answer_loss": 0.0774, - "loss": 0.0851, - "short_answer_loss": NaN, - "step": 1402, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0871, - "grad_norm": 1.375, - "learning_rate": 1.1641598506643066e-05, - "long_answer_loss": 0.0871, - "loss": 0.0804, - "short_answer_loss": NaN, - "step": 1403, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0885, - "grad_norm": 1.390625, - "learning_rate": 1.1626156834064057e-05, - "long_answer_loss": 0.0885, - "loss": 0.0873, - "short_answer_loss": NaN, - "step": 1404, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0761, - "grad_norm": 1.46875, - "learning_rate": 1.1610716501446328e-05, - "long_answer_loss": 0.0761, - "loss": 0.0845, - "short_answer_loss": NaN, - "step": 1405, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0856, - "grad_norm": 1.421875, - "learning_rate": 1.1595277532466262e-05, - "long_answer_loss": 0.0856, - "loss": 0.0853, - "short_answer_loss": NaN, - "step": 1406, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0693, - "grad_norm": 1.328125, - "learning_rate": 1.1579839950798165e-05, - "long_answer_loss": 0.0693, - "loss": 0.079, - "short_answer_loss": NaN, - "step": 1407, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0878, - "grad_norm": 1.4453125, - "learning_rate": 1.1564403780114192e-05, - "long_answer_loss": 0.0878, - "loss": 0.0861, - "short_answer_loss": NaN, - "step": 1408, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0636, - "grad_norm": 1.3828125, - "learning_rate": 1.1548969044084358e-05, - "long_answer_loss": 0.0636, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 1409, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0819, - "grad_norm": 1.46875, - "learning_rate": 1.1533535766376454e-05, - "long_answer_loss": 0.0819, - "loss": 0.0812, - "short_answer_loss": NaN, - "step": 1410, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0836, - "grad_norm": 1.4609375, - "learning_rate": 1.151810397065606e-05, - "long_answer_loss": 0.0836, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 1411, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0822, - "grad_norm": 1.4453125, - "learning_rate": 1.150267368058646e-05, - "long_answer_loss": 0.0822, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 1412, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0863, - "grad_norm": 1.4765625, - "learning_rate": 1.1487244919828654e-05, - "long_answer_loss": 0.0863, - "loss": 0.0787, - "short_answer_loss": NaN, - "step": 1413, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0829, - "grad_norm": 1.53125, - "learning_rate": 1.1471817712041272e-05, - "long_answer_loss": 0.0829, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 1414, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0806, - "grad_norm": 1.5703125, - "learning_rate": 1.1456392080880578e-05, - "long_answer_loss": 0.0806, - "loss": 0.0871, - "short_answer_loss": NaN, - "step": 1415, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0796, - "grad_norm": 1.546875, - "learning_rate": 1.1440968050000416e-05, - "long_answer_loss": 0.0796, - "loss": 0.0835, - "short_answer_loss": NaN, - "step": 1416, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0786, - "grad_norm": 1.53125, - "learning_rate": 1.1425545643052171e-05, - "long_answer_loss": 0.0786, - "loss": 0.0887, - "short_answer_loss": NaN, - "step": 1417, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0843, - "grad_norm": 1.4375, - "learning_rate": 1.1410124883684744e-05, - "long_answer_loss": 0.0843, - "loss": 0.0801, - "short_answer_loss": NaN, - "step": 1418, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0787, - "grad_norm": 1.390625, - "learning_rate": 1.1394705795544503e-05, - "long_answer_loss": 0.0787, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 1419, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.079, - "grad_norm": 1.390625, - "learning_rate": 1.1379288402275264e-05, - "long_answer_loss": 0.079, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 1420, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.076, - "grad_norm": 1.5546875, - "learning_rate": 1.1363872727518226e-05, - "long_answer_loss": 0.076, - "loss": 0.0867, - "short_answer_loss": NaN, - "step": 1421, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0726, - "grad_norm": 1.5859375, - "learning_rate": 1.134845879491198e-05, - "long_answer_loss": 0.0726, - "loss": 0.0856, - "short_answer_loss": NaN, - "step": 1422, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0805, - "grad_norm": 1.53125, - "learning_rate": 1.1333046628092417e-05, - "long_answer_loss": 0.0805, - "loss": 0.0829, - "short_answer_loss": NaN, - "step": 1423, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0978, - "grad_norm": 1.375, - "learning_rate": 1.131763625069274e-05, - "long_answer_loss": 0.0978, - "loss": 0.081, - "short_answer_loss": NaN, - "step": 1424, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0826, - "grad_norm": 1.4140625, - "learning_rate": 1.1302227686343398e-05, - "long_answer_loss": 0.0826, - "loss": 0.0831, - "short_answer_loss": NaN, - "step": 1425, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.075, - "grad_norm": 1.4375, - "learning_rate": 1.1286820958672057e-05, - "long_answer_loss": 0.075, - "loss": 0.0831, - "short_answer_loss": NaN, - "step": 1426, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0636, - "grad_norm": 1.4609375, - "learning_rate": 1.1271416091303586e-05, - "long_answer_loss": 0.0636, - "loss": 0.0751, - "short_answer_loss": NaN, - "step": 1427, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.092, - "grad_norm": 1.40625, - "learning_rate": 1.1256013107859974e-05, - "long_answer_loss": 0.092, - "loss": 0.0808, - "short_answer_loss": NaN, - "step": 1428, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0747, - "grad_norm": 1.421875, - "learning_rate": 1.1240612031960347e-05, - "long_answer_loss": 0.0747, - "loss": 0.0831, - "short_answer_loss": NaN, - "step": 1429, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0879, - "grad_norm": 1.453125, - "learning_rate": 1.1225212887220886e-05, - "long_answer_loss": 0.0879, - "loss": 0.0781, - "short_answer_loss": NaN, - "step": 1430, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0784, - "grad_norm": 1.4921875, - "learning_rate": 1.1209815697254825e-05, - "long_answer_loss": 0.0784, - "loss": 0.088, - "short_answer_loss": NaN, - "step": 1431, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0798, - "grad_norm": 1.375, - "learning_rate": 1.1194420485672384e-05, - "long_answer_loss": 0.0798, - "loss": 0.0788, - "short_answer_loss": NaN, - "step": 1432, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0801, - "grad_norm": 1.46875, - "learning_rate": 1.1179027276080772e-05, - "long_answer_loss": 0.0801, - "loss": 0.0804, - "short_answer_loss": NaN, - "step": 1433, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0786, - "grad_norm": 1.546875, - "learning_rate": 1.1163636092084105e-05, - "long_answer_loss": 0.0786, - "loss": 0.0817, - "short_answer_loss": NaN, - "step": 1434, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0794, - "grad_norm": 1.3671875, - "learning_rate": 1.1148246957283415e-05, - "long_answer_loss": 0.0794, - "loss": 0.085, - "short_answer_loss": NaN, - "step": 1435, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0681, - "grad_norm": 1.4765625, - "learning_rate": 1.1132859895276574e-05, - "long_answer_loss": 0.0681, - "loss": 0.0845, - "short_answer_loss": NaN, - "step": 1436, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0926, - "grad_norm": 1.5703125, - "learning_rate": 1.1117474929658276e-05, - "long_answer_loss": 0.0926, - "loss": 0.0817, - "short_answer_loss": NaN, - "step": 1437, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0912, - "grad_norm": 1.4765625, - "learning_rate": 1.1102092084020018e-05, - "long_answer_loss": 0.0912, - "loss": 0.081, - "short_answer_loss": NaN, - "step": 1438, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0596, - "grad_norm": 1.390625, - "learning_rate": 1.1086711381950026e-05, - "long_answer_loss": 0.0596, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 1439, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0739, - "grad_norm": 1.515625, - "learning_rate": 1.1071332847033255e-05, - "long_answer_loss": 0.0739, - "loss": 0.0828, - "short_answer_loss": NaN, - "step": 1440, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0889, - "grad_norm": 1.46875, - "learning_rate": 1.105595650285132e-05, - "long_answer_loss": 0.0889, - "loss": 0.0825, - "short_answer_loss": NaN, - "step": 1441, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0858, - "grad_norm": 1.421875, - "learning_rate": 1.1040582372982494e-05, - "long_answer_loss": 0.0858, - "loss": 0.0803, - "short_answer_loss": NaN, - "step": 1442, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0918, - "grad_norm": 1.453125, - "learning_rate": 1.1025210481001642e-05, - "long_answer_loss": 0.0918, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 1443, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0775, - "grad_norm": 1.5078125, - "learning_rate": 1.1009840850480207e-05, - "long_answer_loss": 0.0775, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 1444, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0748, - "grad_norm": 1.4453125, - "learning_rate": 1.0994473504986155e-05, - "long_answer_loss": 0.0748, - "loss": 0.0802, - "short_answer_loss": NaN, - "step": 1445, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0944, - "grad_norm": 1.515625, - "learning_rate": 1.0979108468083956e-05, - "long_answer_loss": 0.0944, - "loss": 0.0855, - "short_answer_loss": NaN, - "step": 1446, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0746, - "grad_norm": 1.390625, - "learning_rate": 1.0963745763334533e-05, - "long_answer_loss": 0.0746, - "loss": 0.0767, - "short_answer_loss": NaN, - "step": 1447, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.073, - "grad_norm": 1.34375, - "learning_rate": 1.0948385414295235e-05, - "long_answer_loss": 0.073, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 1448, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0751, - "grad_norm": 1.5, - "learning_rate": 1.0933027444519805e-05, - "long_answer_loss": 0.0751, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 1449, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0762, - "grad_norm": 1.484375, - "learning_rate": 1.0917671877558327e-05, - "long_answer_loss": 0.0762, - "loss": 0.0865, - "short_answer_loss": NaN, - "step": 1450, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0766, - "grad_norm": 1.4296875, - "learning_rate": 1.0902318736957214e-05, - "long_answer_loss": 0.0766, - "loss": 0.0794, - "short_answer_loss": NaN, - "step": 1451, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0849, - "grad_norm": 1.46875, - "learning_rate": 1.0886968046259141e-05, - "long_answer_loss": 0.0849, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 1452, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0906, - "grad_norm": 1.3515625, - "learning_rate": 1.0871619829003044e-05, - "long_answer_loss": 0.0906, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 1453, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0765, - "grad_norm": 1.375, - "learning_rate": 1.0856274108724052e-05, - "long_answer_loss": 0.0765, - "loss": 0.0821, - "short_answer_loss": NaN, - "step": 1454, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0873, - "grad_norm": 1.328125, - "learning_rate": 1.0840930908953477e-05, - "long_answer_loss": 0.0873, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 1455, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0871, - "grad_norm": 1.484375, - "learning_rate": 1.0825590253218758e-05, - "long_answer_loss": 0.0871, - "loss": 0.0815, - "short_answer_loss": NaN, - "step": 1456, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0745, - "grad_norm": 1.421875, - "learning_rate": 1.0810252165043427e-05, - "long_answer_loss": 0.0745, - "loss": 0.0798, - "short_answer_loss": NaN, - "step": 1457, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0729, - "grad_norm": 1.4375, - "learning_rate": 1.07949166679471e-05, - "long_answer_loss": 0.0729, - "loss": 0.0842, - "short_answer_loss": NaN, - "step": 1458, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0866, - "grad_norm": 1.3671875, - "learning_rate": 1.0779583785445393e-05, - "long_answer_loss": 0.0866, - "loss": 0.0836, - "short_answer_loss": NaN, - "step": 1459, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0715, - "grad_norm": 1.390625, - "learning_rate": 1.0764253541049941e-05, - "long_answer_loss": 0.0715, - "loss": 0.0786, - "short_answer_loss": NaN, - "step": 1460, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.1092, - "grad_norm": 1.4453125, - "learning_rate": 1.074892595826831e-05, - "long_answer_loss": 0.1092, - "loss": 0.0886, - "short_answer_loss": NaN, - "step": 1461, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0715, - "grad_norm": 1.5078125, - "learning_rate": 1.0733601060603999e-05, - "long_answer_loss": 0.0715, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 1462, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0765, - "grad_norm": 1.4140625, - "learning_rate": 1.0718278871556374e-05, - "long_answer_loss": 0.0765, - "loss": 0.0808, - "short_answer_loss": NaN, - "step": 1463, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0692, - "grad_norm": 1.3828125, - "learning_rate": 1.0702959414620673e-05, - "long_answer_loss": 0.0692, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 1464, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0881, - "grad_norm": 1.390625, - "learning_rate": 1.0687642713287916e-05, - "long_answer_loss": 0.0881, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 1465, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.086, - "grad_norm": 1.5234375, - "learning_rate": 1.0672328791044921e-05, - "long_answer_loss": 0.086, - "loss": 0.0837, - "short_answer_loss": NaN, - "step": 1466, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.065, - "grad_norm": 1.390625, - "learning_rate": 1.0657017671374233e-05, - "long_answer_loss": 0.065, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 1467, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0787, - "grad_norm": 1.4140625, - "learning_rate": 1.0641709377754094e-05, - "long_answer_loss": 0.0787, - "loss": 0.0812, - "short_answer_loss": NaN, - "step": 1468, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0766, - "grad_norm": 1.484375, - "learning_rate": 1.0626403933658426e-05, - "long_answer_loss": 0.0766, - "loss": 0.0834, - "short_answer_loss": NaN, - "step": 1469, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0778, - "grad_norm": 1.421875, - "learning_rate": 1.0611101362556773e-05, - "long_answer_loss": 0.0778, - "loss": 0.0834, - "short_answer_loss": NaN, - "step": 1470, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0884, - "grad_norm": 1.4140625, - "learning_rate": 1.059580168791428e-05, - "long_answer_loss": 0.0884, - "loss": 0.082, - "short_answer_loss": NaN, - "step": 1471, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0676, - "grad_norm": 1.4296875, - "learning_rate": 1.0580504933191635e-05, - "long_answer_loss": 0.0676, - "loss": 0.0798, - "short_answer_loss": NaN, - "step": 1472, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0868, - "grad_norm": 1.3515625, - "learning_rate": 1.0565211121845075e-05, - "long_answer_loss": 0.0868, - "loss": 0.0803, - "short_answer_loss": NaN, - "step": 1473, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.075, - "grad_norm": 1.4140625, - "learning_rate": 1.0549920277326293e-05, - "long_answer_loss": 0.075, - "loss": 0.079, - "short_answer_loss": NaN, - "step": 1474, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.08, - "grad_norm": 1.4609375, - "learning_rate": 1.0534632423082462e-05, - "long_answer_loss": 0.08, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 1475, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0609, - "grad_norm": 1.390625, - "learning_rate": 1.051934758255615e-05, - "long_answer_loss": 0.0609, - "loss": 0.0779, - "short_answer_loss": NaN, - "step": 1476, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0834, - "grad_norm": 1.4296875, - "learning_rate": 1.0504065779185302e-05, - "long_answer_loss": 0.0834, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 1477, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0688, - "grad_norm": 1.3671875, - "learning_rate": 1.0488787036403226e-05, - "long_answer_loss": 0.0688, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 1478, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0802, - "grad_norm": 1.40625, - "learning_rate": 1.0473511377638512e-05, - "long_answer_loss": 0.0802, - "loss": 0.0783, - "short_answer_loss": NaN, - "step": 1479, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0994, - "grad_norm": 1.4765625, - "learning_rate": 1.0458238826315041e-05, - "long_answer_loss": 0.0994, - "loss": 0.0807, - "short_answer_loss": NaN, - "step": 1480, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0845, - "grad_norm": 1.375, - "learning_rate": 1.0442969405851917e-05, - "long_answer_loss": 0.0845, - "loss": 0.0852, - "short_answer_loss": NaN, - "step": 1481, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0796, - "grad_norm": 1.3359375, - "learning_rate": 1.0427703139663453e-05, - "long_answer_loss": 0.0796, - "loss": 0.0786, - "short_answer_loss": NaN, - "step": 1482, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0865, - "grad_norm": 1.4296875, - "learning_rate": 1.041244005115911e-05, - "long_answer_loss": 0.0865, - "loss": 0.0782, - "short_answer_loss": NaN, - "step": 1483, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0864, - "grad_norm": 1.40625, - "learning_rate": 1.0397180163743494e-05, - "long_answer_loss": 0.0864, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 1484, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0764, - "grad_norm": 1.4375, - "learning_rate": 1.0381923500816288e-05, - "long_answer_loss": 0.0764, - "loss": 0.0811, - "short_answer_loss": NaN, - "step": 1485, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.07, - "grad_norm": 1.5078125, - "learning_rate": 1.036667008577224e-05, - "long_answer_loss": 0.07, - "loss": 0.0849, - "short_answer_loss": NaN, - "step": 1486, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0768, - "grad_norm": 1.3984375, - "learning_rate": 1.0351419942001115e-05, - "long_answer_loss": 0.0768, - "loss": 0.0784, - "short_answer_loss": NaN, - "step": 1487, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0831, - "grad_norm": 1.4453125, - "learning_rate": 1.0336173092887655e-05, - "long_answer_loss": 0.0831, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 1488, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0695, - "grad_norm": 1.375, - "learning_rate": 1.0320929561811564e-05, - "long_answer_loss": 0.0695, - "loss": 0.0756, - "short_answer_loss": NaN, - "step": 1489, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0723, - "grad_norm": 1.4375, - "learning_rate": 1.0305689372147442e-05, - "long_answer_loss": 0.0723, - "loss": 0.0786, - "short_answer_loss": NaN, - "step": 1490, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0876, - "grad_norm": 1.5234375, - "learning_rate": 1.029045254726478e-05, - "long_answer_loss": 0.0876, - "loss": 0.0837, - "short_answer_loss": NaN, - "step": 1491, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0644, - "grad_norm": 1.5, - "learning_rate": 1.0275219110527898e-05, - "long_answer_loss": 0.0644, - "loss": 0.0803, - "short_answer_loss": NaN, - "step": 1492, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0864, - "grad_norm": 1.546875, - "learning_rate": 1.025998908529593e-05, - "long_answer_loss": 0.0864, - "loss": 0.0799, - "short_answer_loss": NaN, - "step": 1493, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0661, - "grad_norm": 1.390625, - "learning_rate": 1.0244762494922766e-05, - "long_answer_loss": 0.0661, - "loss": 0.0788, - "short_answer_loss": NaN, - "step": 1494, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0757, - "grad_norm": 1.3671875, - "learning_rate": 1.0229539362757046e-05, - "long_answer_loss": 0.0757, - "loss": 0.0787, - "short_answer_loss": NaN, - "step": 1495, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0947, - "grad_norm": 1.4609375, - "learning_rate": 1.021431971214209e-05, - "long_answer_loss": 0.0947, - "loss": 0.0783, - "short_answer_loss": NaN, - "step": 1496, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0871, - "grad_norm": 1.4296875, - "learning_rate": 1.0199103566415896e-05, - "long_answer_loss": 0.0871, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 1497, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0774, - "grad_norm": 1.4296875, - "learning_rate": 1.0183890948911074e-05, - "long_answer_loss": 0.0774, - "loss": 0.0792, - "short_answer_loss": NaN, - "step": 1498, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.073, - "grad_norm": 1.375, - "learning_rate": 1.0168681882954825e-05, - "long_answer_loss": 0.073, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 1499, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0767, - "grad_norm": 1.390625, - "learning_rate": 1.0153476391868917e-05, - "long_answer_loss": 0.0767, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 1500, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0713, - "grad_norm": 1.3671875, - "learning_rate": 1.0138274498969614e-05, - "long_answer_loss": 0.0713, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 1501, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0754, - "grad_norm": 1.3984375, - "learning_rate": 1.012307622756769e-05, - "long_answer_loss": 0.0754, - "loss": 0.0819, - "short_answer_loss": NaN, - "step": 1502, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0941, - "grad_norm": 1.59375, - "learning_rate": 1.010788160096834e-05, - "long_answer_loss": 0.0941, - "loss": 0.0858, - "short_answer_loss": NaN, - "step": 1503, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0829, - "grad_norm": 1.4765625, - "learning_rate": 1.009269064247119e-05, - "long_answer_loss": 0.0829, - "loss": 0.085, - "short_answer_loss": NaN, - "step": 1504, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0668, - "grad_norm": 1.3984375, - "learning_rate": 1.0077503375370226e-05, - "long_answer_loss": 0.0668, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 1505, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.07, - "grad_norm": 1.421875, - "learning_rate": 1.0062319822953787e-05, - "long_answer_loss": 0.07, - "loss": 0.0787, - "short_answer_loss": NaN, - "step": 1506, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0695, - "grad_norm": 1.3671875, - "learning_rate": 1.0047140008504499e-05, - "long_answer_loss": 0.0695, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 1507, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0883, - "grad_norm": 1.484375, - "learning_rate": 1.0031963955299272e-05, - "long_answer_loss": 0.0883, - "loss": 0.0858, - "short_answer_loss": NaN, - "step": 1508, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0757, - "grad_norm": 1.4375, - "learning_rate": 1.0016791686609248e-05, - "long_answer_loss": 0.0757, - "loss": 0.0823, - "short_answer_loss": NaN, - "step": 1509, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0724, - "grad_norm": 1.515625, - "learning_rate": 1.0001623225699747e-05, - "long_answer_loss": 0.0724, - "loss": 0.0838, - "short_answer_loss": NaN, - "step": 1510, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0758, - "grad_norm": 1.3515625, - "learning_rate": 9.986458595830275e-06, - "long_answer_loss": 0.0758, - "loss": 0.0748, - "short_answer_loss": NaN, - "step": 1511, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0864, - "grad_norm": 1.4765625, - "learning_rate": 9.971297820254447e-06, - "long_answer_loss": 0.0864, - "loss": 0.086, - "short_answer_loss": NaN, - "step": 1512, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0776, - "grad_norm": 1.375, - "learning_rate": 9.956140922219975e-06, - "long_answer_loss": 0.0776, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 1513, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0826, - "grad_norm": 1.5390625, - "learning_rate": 9.940987924968623e-06, - "long_answer_loss": 0.0826, - "loss": 0.0811, - "short_answer_loss": NaN, - "step": 1514, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0727, - "grad_norm": 1.4765625, - "learning_rate": 9.925838851736172e-06, - "long_answer_loss": 0.0727, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 1515, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0954, - "grad_norm": 1.3828125, - "learning_rate": 9.910693725752384e-06, - "long_answer_loss": 0.0954, - "loss": 0.078, - "short_answer_loss": NaN, - "step": 1516, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.07, - "grad_norm": 1.609375, - "learning_rate": 9.895552570240979e-06, - "long_answer_loss": 0.07, - "loss": 0.0794, - "short_answer_loss": NaN, - "step": 1517, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0809, - "grad_norm": 1.4140625, - "learning_rate": 9.880415408419577e-06, - "long_answer_loss": 0.0809, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1518, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0741, - "grad_norm": 1.453125, - "learning_rate": 9.865282263499672e-06, - "long_answer_loss": 0.0741, - "loss": 0.0772, - "short_answer_loss": NaN, - "step": 1519, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0694, - "grad_norm": 1.46875, - "learning_rate": 9.850153158686617e-06, - "long_answer_loss": 0.0694, - "loss": 0.0808, - "short_answer_loss": NaN, - "step": 1520, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0843, - "grad_norm": 1.4453125, - "learning_rate": 9.835028117179549e-06, - "long_answer_loss": 0.0843, - "loss": 0.0767, - "short_answer_loss": NaN, - "step": 1521, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0758, - "grad_norm": 1.546875, - "learning_rate": 9.819907162171385e-06, - "long_answer_loss": 0.0758, - "loss": 0.0772, - "short_answer_loss": NaN, - "step": 1522, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0814, - "grad_norm": 1.5390625, - "learning_rate": 9.80479031684877e-06, - "long_answer_loss": 0.0814, - "loss": 0.0849, - "short_answer_loss": NaN, - "step": 1523, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0871, - "grad_norm": 1.546875, - "learning_rate": 9.789677604392058e-06, - "long_answer_loss": 0.0871, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 1524, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0672, - "grad_norm": 1.3671875, - "learning_rate": 9.77456904797525e-06, - "long_answer_loss": 0.0672, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 1525, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0726, - "grad_norm": 1.4765625, - "learning_rate": 9.75946467076599e-06, - "long_answer_loss": 0.0726, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 1526, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0676, - "grad_norm": 1.375, - "learning_rate": 9.7443644959255e-06, - "long_answer_loss": 0.0676, - "loss": 0.0731, - "short_answer_loss": NaN, - "step": 1527, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0762, - "grad_norm": 1.515625, - "learning_rate": 9.729268546608565e-06, - "long_answer_loss": 0.0762, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 1528, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0729, - "grad_norm": 1.5546875, - "learning_rate": 9.714176845963494e-06, - "long_answer_loss": 0.0729, - "loss": 0.0807, - "short_answer_loss": NaN, - "step": 1529, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0747, - "grad_norm": 1.453125, - "learning_rate": 9.69908941713207e-06, - "long_answer_loss": 0.0747, - "loss": 0.077, - "short_answer_loss": NaN, - "step": 1530, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0722, - "grad_norm": 1.421875, - "learning_rate": 9.684006283249536e-06, - "long_answer_loss": 0.0722, - "loss": 0.0786, - "short_answer_loss": NaN, - "step": 1531, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.078, - "grad_norm": 1.3984375, - "learning_rate": 9.668927467444538e-06, - "long_answer_loss": 0.078, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 1532, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0885, - "grad_norm": 1.4140625, - "learning_rate": 9.65385299283912e-06, - "long_answer_loss": 0.0885, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 1533, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0729, - "grad_norm": 1.5078125, - "learning_rate": 9.638782882548645e-06, - "long_answer_loss": 0.0729, - "loss": 0.0824, - "short_answer_loss": NaN, - "step": 1534, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0716, - "grad_norm": 1.4375, - "learning_rate": 9.623717159681805e-06, - "long_answer_loss": 0.0716, - "loss": 0.0798, - "short_answer_loss": NaN, - "step": 1535, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0669, - "grad_norm": 1.4765625, - "learning_rate": 9.60865584734055e-06, - "long_answer_loss": 0.0669, - "loss": 0.0836, - "short_answer_loss": NaN, - "step": 1536, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0793, - "grad_norm": 1.484375, - "learning_rate": 9.593598968620072e-06, - "long_answer_loss": 0.0793, - "loss": 0.0828, - "short_answer_loss": NaN, - "step": 1537, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0736, - "grad_norm": 1.421875, - "learning_rate": 9.578546546608766e-06, - "long_answer_loss": 0.0736, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 1538, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0655, - "grad_norm": 1.546875, - "learning_rate": 9.563498604388183e-06, - "long_answer_loss": 0.0655, - "loss": 0.0792, - "short_answer_loss": NaN, - "step": 1539, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0734, - "grad_norm": 1.390625, - "learning_rate": 9.548455165033023e-06, - "long_answer_loss": 0.0734, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 1540, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0869, - "grad_norm": 1.3671875, - "learning_rate": 9.533416251611064e-06, - "long_answer_loss": 0.0869, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 1541, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0851, - "grad_norm": 1.421875, - "learning_rate": 9.51838188718316e-06, - "long_answer_loss": 0.0851, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 1542, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0894, - "grad_norm": 1.46875, - "learning_rate": 9.50335209480317e-06, - "long_answer_loss": 0.0894, - "loss": 0.0804, - "short_answer_loss": NaN, - "step": 1543, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0673, - "grad_norm": 1.3125, - "learning_rate": 9.48832689751796e-06, - "long_answer_loss": 0.0673, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 1544, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0924, - "grad_norm": 1.53125, - "learning_rate": 9.473306318367334e-06, - "long_answer_loss": 0.0924, - "loss": 0.0839, - "short_answer_loss": NaN, - "step": 1545, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0821, - "grad_norm": 1.5625, - "learning_rate": 9.458290380384033e-06, - "long_answer_loss": 0.0821, - "loss": 0.0752, - "short_answer_loss": NaN, - "step": 1546, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0648, - "grad_norm": 1.3203125, - "learning_rate": 9.443279106593663e-06, - "long_answer_loss": 0.0648, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 1547, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0897, - "grad_norm": 1.4296875, - "learning_rate": 9.428272520014691e-06, - "long_answer_loss": 0.0897, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 1548, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0823, - "grad_norm": 1.4921875, - "learning_rate": 9.413270643658393e-06, - "long_answer_loss": 0.0823, - "loss": 0.0818, - "short_answer_loss": NaN, - "step": 1549, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0818, - "grad_norm": 1.421875, - "learning_rate": 9.398273500528811e-06, - "long_answer_loss": 0.0818, - "loss": 0.0846, - "short_answer_loss": NaN, - "step": 1550, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0732, - "grad_norm": 1.3828125, - "learning_rate": 9.383281113622753e-06, - "long_answer_loss": 0.0732, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 1551, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0822, - "grad_norm": 1.46875, - "learning_rate": 9.368293505929707e-06, - "long_answer_loss": 0.0822, - "loss": 0.0777, - "short_answer_loss": NaN, - "step": 1552, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0738, - "grad_norm": 1.5, - "learning_rate": 9.353310700431852e-06, - "long_answer_loss": 0.0738, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 1553, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0777, - "grad_norm": 1.4609375, - "learning_rate": 9.33833272010399e-06, - "long_answer_loss": 0.0777, - "loss": 0.0828, - "short_answer_loss": NaN, - "step": 1554, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0756, - "grad_norm": 1.3828125, - "learning_rate": 9.323359587913542e-06, - "long_answer_loss": 0.0756, - "loss": 0.0796, - "short_answer_loss": NaN, - "step": 1555, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0692, - "grad_norm": 1.5390625, - "learning_rate": 9.308391326820467e-06, - "long_answer_loss": 0.0692, - "loss": 0.0838, - "short_answer_loss": NaN, - "step": 1556, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0729, - "grad_norm": 1.3671875, - "learning_rate": 9.293427959777288e-06, - "long_answer_loss": 0.0729, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 1557, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0667, - "grad_norm": 1.421875, - "learning_rate": 9.278469509728996e-06, - "long_answer_loss": 0.0667, - "loss": 0.0783, - "short_answer_loss": NaN, - "step": 1558, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.1008, - "grad_norm": 1.5234375, - "learning_rate": 9.263515999613054e-06, - "long_answer_loss": 0.1008, - "loss": 0.0837, - "short_answer_loss": NaN, - "step": 1559, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0704, - "grad_norm": 1.3671875, - "learning_rate": 9.248567452359351e-06, - "long_answer_loss": 0.0704, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 1560, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0713, - "grad_norm": 1.4453125, - "learning_rate": 9.233623890890155e-06, - "long_answer_loss": 0.0713, - "loss": 0.0774, - "short_answer_loss": NaN, - "step": 1561, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0754, - "grad_norm": 1.515625, - "learning_rate": 9.218685338120109e-06, - "long_answer_loss": 0.0754, - "loss": 0.0841, - "short_answer_loss": NaN, - "step": 1562, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0767, - "grad_norm": 1.4296875, - "learning_rate": 9.203751816956152e-06, - "long_answer_loss": 0.0767, - "loss": 0.0827, - "short_answer_loss": NaN, - "step": 1563, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0826, - "grad_norm": 1.4140625, - "learning_rate": 9.188823350297532e-06, - "long_answer_loss": 0.0826, - "loss": 0.0837, - "short_answer_loss": NaN, - "step": 1564, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0763, - "grad_norm": 1.3828125, - "learning_rate": 9.173899961035722e-06, - "long_answer_loss": 0.0763, - "loss": 0.0751, - "short_answer_loss": NaN, - "step": 1565, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0696, - "grad_norm": 1.3671875, - "learning_rate": 9.158981672054427e-06, - "long_answer_loss": 0.0696, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 1566, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.085, - "grad_norm": 1.375, - "learning_rate": 9.144068506229524e-06, - "long_answer_loss": 0.085, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 1567, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0872, - "grad_norm": 1.46875, - "learning_rate": 9.129160486429037e-06, - "long_answer_loss": 0.0872, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 1568, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.077, - "grad_norm": 1.375, - "learning_rate": 9.114257635513093e-06, - "long_answer_loss": 0.077, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 1569, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0749, - "grad_norm": 1.421875, - "learning_rate": 9.099359976333893e-06, - "long_answer_loss": 0.0749, - "loss": 0.0784, - "short_answer_loss": NaN, - "step": 1570, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0914, - "grad_norm": 1.4140625, - "learning_rate": 9.084467531735694e-06, - "long_answer_loss": 0.0914, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 1571, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0904, - "grad_norm": 1.4140625, - "learning_rate": 9.06958032455473e-06, - "long_answer_loss": 0.0904, - "loss": 0.0787, - "short_answer_loss": NaN, - "step": 1572, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0669, - "grad_norm": 1.40625, - "learning_rate": 9.054698377619227e-06, - "long_answer_loss": 0.0669, - "loss": 0.0796, - "short_answer_loss": NaN, - "step": 1573, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0885, - "grad_norm": 1.3671875, - "learning_rate": 9.039821713749335e-06, - "long_answer_loss": 0.0885, - "loss": 0.0784, - "short_answer_loss": NaN, - "step": 1574, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0885, - "grad_norm": 1.40625, - "learning_rate": 9.024950355757101e-06, - "long_answer_loss": 0.0885, - "loss": 0.0807, - "short_answer_loss": NaN, - "step": 1575, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0827, - "grad_norm": 1.359375, - "learning_rate": 9.010084326446435e-06, - "long_answer_loss": 0.0827, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 1576, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0735, - "grad_norm": 1.40625, - "learning_rate": 8.995223648613088e-06, - "long_answer_loss": 0.0735, - "loss": 0.0796, - "short_answer_loss": NaN, - "step": 1577, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0607, - "grad_norm": 1.359375, - "learning_rate": 8.980368345044587e-06, - "long_answer_loss": 0.0607, - "loss": 0.0742, - "short_answer_loss": NaN, - "step": 1578, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0662, - "grad_norm": 1.3828125, - "learning_rate": 8.965518438520238e-06, - "long_answer_loss": 0.0662, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 1579, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0684, - "grad_norm": 1.4453125, - "learning_rate": 8.950673951811053e-06, - "long_answer_loss": 0.0684, - "loss": 0.084, - "short_answer_loss": NaN, - "step": 1580, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0806, - "grad_norm": 1.375, - "learning_rate": 8.93583490767974e-06, - "long_answer_loss": 0.0806, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 1581, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0884, - "grad_norm": 1.5390625, - "learning_rate": 8.921001328880665e-06, - "long_answer_loss": 0.0884, - "loss": 0.0819, - "short_answer_loss": NaN, - "step": 1582, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0896, - "grad_norm": 1.4140625, - "learning_rate": 8.906173238159807e-06, - "long_answer_loss": 0.0896, - "loss": 0.0798, - "short_answer_loss": NaN, - "step": 1583, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0696, - "grad_norm": 1.4296875, - "learning_rate": 8.89135065825474e-06, - "long_answer_loss": 0.0696, - "loss": 0.0786, - "short_answer_loss": NaN, - "step": 1584, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0711, - "grad_norm": 1.375, - "learning_rate": 8.87653361189457e-06, - "long_answer_loss": 0.0711, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 1585, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0657, - "grad_norm": 1.484375, - "learning_rate": 8.861722121799942e-06, - "long_answer_loss": 0.0657, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 1586, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0796, - "grad_norm": 1.578125, - "learning_rate": 8.846916210682951e-06, - "long_answer_loss": 0.0796, - "loss": 0.0807, - "short_answer_loss": NaN, - "step": 1587, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0682, - "grad_norm": 1.3984375, - "learning_rate": 8.83211590124717e-06, - "long_answer_loss": 0.0682, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 1588, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.074, - "grad_norm": 1.421875, - "learning_rate": 8.817321216187557e-06, - "long_answer_loss": 0.074, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 1589, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0709, - "grad_norm": 1.390625, - "learning_rate": 8.802532178190453e-06, - "long_answer_loss": 0.0709, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 1590, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.077, - "grad_norm": 1.453125, - "learning_rate": 8.787748809933546e-06, - "long_answer_loss": 0.077, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 1591, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0866, - "grad_norm": 1.453125, - "learning_rate": 8.772971134085817e-06, - "long_answer_loss": 0.0866, - "loss": 0.079, - "short_answer_loss": NaN, - "step": 1592, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0719, - "grad_norm": 1.40625, - "learning_rate": 8.758199173307535e-06, - "long_answer_loss": 0.0719, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 1593, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0687, - "grad_norm": 1.390625, - "learning_rate": 8.743432950250188e-06, - "long_answer_loss": 0.0687, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 1594, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0776, - "grad_norm": 1.546875, - "learning_rate": 8.728672487556486e-06, - "long_answer_loss": 0.0776, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 1595, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.066, - "grad_norm": 1.3984375, - "learning_rate": 8.713917807860284e-06, - "long_answer_loss": 0.066, - "loss": 0.0775, - "short_answer_loss": NaN, - "step": 1596, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0795, - "grad_norm": 1.40625, - "learning_rate": 8.699168933786584e-06, - "long_answer_loss": 0.0795, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 1597, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0724, - "grad_norm": 1.4921875, - "learning_rate": 8.684425887951477e-06, - "long_answer_loss": 0.0724, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 1598, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0805, - "grad_norm": 1.4296875, - "learning_rate": 8.669688692962128e-06, - "long_answer_loss": 0.0805, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 1599, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0714, - "grad_norm": 1.375, - "learning_rate": 8.654957371416722e-06, - "long_answer_loss": 0.0714, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 1600, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.071, - "grad_norm": 1.4140625, - "learning_rate": 8.640231945904429e-06, - "long_answer_loss": 0.071, - "loss": 0.0756, - "short_answer_loss": NaN, - "step": 1601, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0719, - "grad_norm": 1.4453125, - "learning_rate": 8.625512439005401e-06, - "long_answer_loss": 0.0719, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 1602, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0836, - "grad_norm": 1.4609375, - "learning_rate": 8.610798873290694e-06, - "long_answer_loss": 0.0836, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 1603, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0761, - "grad_norm": 1.375, - "learning_rate": 8.596091271322262e-06, - "long_answer_loss": 0.0761, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 1604, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0805, - "grad_norm": 1.4453125, - "learning_rate": 8.581389655652914e-06, - "long_answer_loss": 0.0805, - "loss": 0.0775, - "short_answer_loss": NaN, - "step": 1605, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0727, - "grad_norm": 1.375, - "learning_rate": 8.566694048826282e-06, - "long_answer_loss": 0.0727, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 1606, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0762, - "grad_norm": 1.3515625, - "learning_rate": 8.55200447337677e-06, - "long_answer_loss": 0.0762, - "loss": 0.078, - "short_answer_loss": NaN, - "step": 1607, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0932, - "grad_norm": 1.375, - "learning_rate": 8.537320951829556e-06, - "long_answer_loss": 0.0932, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 1608, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0736, - "grad_norm": 1.3828125, - "learning_rate": 8.522643506700511e-06, - "long_answer_loss": 0.0736, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 1609, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.092, - "grad_norm": 1.4140625, - "learning_rate": 8.507972160496213e-06, - "long_answer_loss": 0.092, - "loss": 0.0802, - "short_answer_loss": NaN, - "step": 1610, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0819, - "grad_norm": 1.484375, - "learning_rate": 8.493306935713872e-06, - "long_answer_loss": 0.0819, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 1611, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.081, - "grad_norm": 1.3515625, - "learning_rate": 8.478647854841304e-06, - "long_answer_loss": 0.081, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 1612, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0652, - "grad_norm": 1.3984375, - "learning_rate": 8.463994940356926e-06, - "long_answer_loss": 0.0652, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 1613, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0663, - "grad_norm": 1.4296875, - "learning_rate": 8.449348214729678e-06, - "long_answer_loss": 0.0663, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 1614, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0695, - "grad_norm": 1.390625, - "learning_rate": 8.434707700419028e-06, - "long_answer_loss": 0.0695, - "loss": 0.0764, - "short_answer_loss": NaN, - "step": 1615, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0718, - "grad_norm": 1.4453125, - "learning_rate": 8.420073419874905e-06, - "long_answer_loss": 0.0718, - "loss": 0.0789, - "short_answer_loss": NaN, - "step": 1616, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0677, - "grad_norm": 1.46875, - "learning_rate": 8.405445395537692e-06, - "long_answer_loss": 0.0677, - "loss": 0.0803, - "short_answer_loss": NaN, - "step": 1617, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0772, - "grad_norm": 1.734375, - "learning_rate": 8.390823649838164e-06, - "long_answer_loss": 0.0772, - "loss": 0.0892, - "short_answer_loss": NaN, - "step": 1618, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0532, - "grad_norm": 1.4140625, - "learning_rate": 8.376208205197484e-06, - "long_answer_loss": 0.0532, - "loss": 0.0782, - "short_answer_loss": NaN, - "step": 1619, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0623, - "grad_norm": 1.40625, - "learning_rate": 8.361599084027136e-06, - "long_answer_loss": 0.0623, - "loss": 0.0765, - "short_answer_loss": NaN, - "step": 1620, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0779, - "grad_norm": 1.3984375, - "learning_rate": 8.346996308728922e-06, - "long_answer_loss": 0.0779, - "loss": 0.0771, - "short_answer_loss": NaN, - "step": 1621, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0653, - "grad_norm": 1.4296875, - "learning_rate": 8.33239990169491e-06, - "long_answer_loss": 0.0653, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 1622, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0766, - "grad_norm": 1.484375, - "learning_rate": 8.31780988530739e-06, - "long_answer_loss": 0.0766, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 1623, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0784, - "grad_norm": 1.46875, - "learning_rate": 8.303226281938875e-06, - "long_answer_loss": 0.0784, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 1624, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0739, - "grad_norm": 1.4453125, - "learning_rate": 8.288649113952025e-06, - "long_answer_loss": 0.0739, - "loss": 0.0788, - "short_answer_loss": NaN, - "step": 1625, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0635, - "grad_norm": 1.453125, - "learning_rate": 8.274078403699642e-06, - "long_answer_loss": 0.0635, - "loss": 0.0781, - "short_answer_loss": NaN, - "step": 1626, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0746, - "grad_norm": 1.46875, - "learning_rate": 8.25951417352462e-06, - "long_answer_loss": 0.0746, - "loss": 0.0766, - "short_answer_loss": NaN, - "step": 1627, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0599, - "grad_norm": 1.3125, - "learning_rate": 8.244956445759928e-06, - "long_answer_loss": 0.0599, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 1628, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0597, - "grad_norm": 1.4453125, - "learning_rate": 8.23040524272854e-06, - "long_answer_loss": 0.0597, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 1629, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0814, - "grad_norm": 1.5, - "learning_rate": 8.21586058674345e-06, - "long_answer_loss": 0.0814, - "loss": 0.0798, - "short_answer_loss": NaN, - "step": 1630, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0666, - "grad_norm": 1.4453125, - "learning_rate": 8.201322500107606e-06, - "long_answer_loss": 0.0666, - "loss": 0.0756, - "short_answer_loss": NaN, - "step": 1631, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0777, - "grad_norm": 1.4296875, - "learning_rate": 8.186791005113866e-06, - "long_answer_loss": 0.0777, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 1632, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0824, - "grad_norm": 1.515625, - "learning_rate": 8.172266124045009e-06, - "long_answer_loss": 0.0824, - "loss": 0.0788, - "short_answer_loss": NaN, - "step": 1633, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0808, - "grad_norm": 1.4609375, - "learning_rate": 8.157747879173646e-06, - "long_answer_loss": 0.0808, - "loss": 0.0817, - "short_answer_loss": NaN, - "step": 1634, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0852, - "grad_norm": 1.390625, - "learning_rate": 8.143236292762229e-06, - "long_answer_loss": 0.0852, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1635, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0769, - "grad_norm": 1.3515625, - "learning_rate": 8.128731387062986e-06, - "long_answer_loss": 0.0769, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 1636, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0694, - "grad_norm": 1.453125, - "learning_rate": 8.114233184317918e-06, - "long_answer_loss": 0.0694, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 1637, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0749, - "grad_norm": 1.328125, - "learning_rate": 8.099741706758726e-06, - "long_answer_loss": 0.0749, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 1638, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0646, - "grad_norm": 1.421875, - "learning_rate": 8.085256976606825e-06, - "long_answer_loss": 0.0646, - "loss": 0.0785, - "short_answer_loss": NaN, - "step": 1639, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0623, - "grad_norm": 1.3359375, - "learning_rate": 8.070779016073256e-06, - "long_answer_loss": 0.0623, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 1640, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0639, - "grad_norm": 1.34375, - "learning_rate": 8.056307847358701e-06, - "long_answer_loss": 0.0639, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 1641, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0624, - "grad_norm": 1.390625, - "learning_rate": 8.041843492653411e-06, - "long_answer_loss": 0.0624, - "loss": 0.0751, - "short_answer_loss": NaN, - "step": 1642, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0897, - "grad_norm": 1.390625, - "learning_rate": 8.0273859741372e-06, - "long_answer_loss": 0.0897, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 1643, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.084, - "grad_norm": 1.3671875, - "learning_rate": 8.012935313979398e-06, - "long_answer_loss": 0.084, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 1644, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0779, - "grad_norm": 1.4453125, - "learning_rate": 7.998491534338807e-06, - "long_answer_loss": 0.0779, - "loss": 0.0798, - "short_answer_loss": NaN, - "step": 1645, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0888, - "grad_norm": 1.4375, - "learning_rate": 7.984054657363696e-06, - "long_answer_loss": 0.0888, - "loss": 0.0772, - "short_answer_loss": NaN, - "step": 1646, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0887, - "grad_norm": 1.421875, - "learning_rate": 7.96962470519173e-06, - "long_answer_loss": 0.0887, - "loss": 0.0779, - "short_answer_loss": NaN, - "step": 1647, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0645, - "grad_norm": 1.453125, - "learning_rate": 7.95520169994998e-06, - "long_answer_loss": 0.0645, - "loss": 0.0767, - "short_answer_loss": NaN, - "step": 1648, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0737, - "grad_norm": 1.4140625, - "learning_rate": 7.940785663754837e-06, - "long_answer_loss": 0.0737, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 1649, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0875, - "grad_norm": 1.4140625, - "learning_rate": 7.926376618712027e-06, - "long_answer_loss": 0.0875, - "loss": 0.0799, - "short_answer_loss": NaN, - "step": 1650, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0746, - "grad_norm": 1.390625, - "learning_rate": 7.911974586916543e-06, - "long_answer_loss": 0.0746, - "loss": 0.0756, - "short_answer_loss": NaN, - "step": 1651, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0704, - "grad_norm": 1.3984375, - "learning_rate": 7.897579590452625e-06, - "long_answer_loss": 0.0704, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 1652, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0743, - "grad_norm": 1.5703125, - "learning_rate": 7.883191651393737e-06, - "long_answer_loss": 0.0743, - "loss": 0.0784, - "short_answer_loss": NaN, - "step": 1653, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0652, - "grad_norm": 1.4375, - "learning_rate": 7.868810791802503e-06, - "long_answer_loss": 0.0652, - "loss": 0.0739, - "short_answer_loss": NaN, - "step": 1654, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0738, - "grad_norm": 1.3671875, - "learning_rate": 7.85443703373071e-06, - "long_answer_loss": 0.0738, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 1655, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0888, - "grad_norm": 1.4375, - "learning_rate": 7.84007039921924e-06, - "long_answer_loss": 0.0888, - "loss": 0.0819, - "short_answer_loss": NaN, - "step": 1656, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0782, - "grad_norm": 1.390625, - "learning_rate": 7.82571091029806e-06, - "long_answer_loss": 0.0782, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 1657, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0793, - "grad_norm": 1.3828125, - "learning_rate": 7.811358588986167e-06, - "long_answer_loss": 0.0793, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 1658, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0771, - "grad_norm": 1.3984375, - "learning_rate": 7.797013457291596e-06, - "long_answer_loss": 0.0771, - "loss": 0.0792, - "short_answer_loss": NaN, - "step": 1659, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0918, - "grad_norm": 1.3828125, - "learning_rate": 7.782675537211323e-06, - "long_answer_loss": 0.0918, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1660, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0704, - "grad_norm": 1.40625, - "learning_rate": 7.768344850731293e-06, - "long_answer_loss": 0.0704, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1661, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0665, - "grad_norm": 1.3125, - "learning_rate": 7.754021419826344e-06, - "long_answer_loss": 0.0665, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 1662, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0666, - "grad_norm": 1.421875, - "learning_rate": 7.739705266460182e-06, - "long_answer_loss": 0.0666, - "loss": 0.0785, - "short_answer_loss": NaN, - "step": 1663, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0852, - "grad_norm": 1.4609375, - "learning_rate": 7.725396412585378e-06, - "long_answer_loss": 0.0852, - "loss": 0.0787, - "short_answer_loss": NaN, - "step": 1664, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0709, - "grad_norm": 1.390625, - "learning_rate": 7.711094880143286e-06, - "long_answer_loss": 0.0709, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 1665, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0749, - "grad_norm": 1.4765625, - "learning_rate": 7.696800691064047e-06, - "long_answer_loss": 0.0749, - "loss": 0.0786, - "short_answer_loss": NaN, - "step": 1666, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0613, - "grad_norm": 1.375, - "learning_rate": 7.682513867266528e-06, - "long_answer_loss": 0.0613, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 1667, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0689, - "grad_norm": 1.4453125, - "learning_rate": 7.668234430658325e-06, - "long_answer_loss": 0.0689, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 1668, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0632, - "grad_norm": 1.390625, - "learning_rate": 7.653962403135678e-06, - "long_answer_loss": 0.0632, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 1669, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.084, - "grad_norm": 1.3203125, - "learning_rate": 7.639697806583493e-06, - "long_answer_loss": 0.084, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 1670, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0795, - "grad_norm": 1.3671875, - "learning_rate": 7.625440662875258e-06, - "long_answer_loss": 0.0795, - "loss": 0.0778, - "short_answer_loss": NaN, - "step": 1671, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.1012, - "grad_norm": 1.4375, - "learning_rate": 7.611190993873052e-06, - "long_answer_loss": 0.1012, - "loss": 0.0809, - "short_answer_loss": NaN, - "step": 1672, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0862, - "grad_norm": 1.4375, - "learning_rate": 7.596948821427477e-06, - "long_answer_loss": 0.0862, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 1673, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0783, - "grad_norm": 1.40625, - "learning_rate": 7.582714167377644e-06, - "long_answer_loss": 0.0783, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 1674, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0754, - "grad_norm": 1.3515625, - "learning_rate": 7.568487053551146e-06, - "long_answer_loss": 0.0754, - "loss": 0.0756, - "short_answer_loss": NaN, - "step": 1675, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0815, - "grad_norm": 1.3671875, - "learning_rate": 7.554267501763993e-06, - "long_answer_loss": 0.0815, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 1676, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0601, - "grad_norm": 1.40625, - "learning_rate": 7.540055533820625e-06, - "long_answer_loss": 0.0601, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 1677, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0544, - "grad_norm": 1.3125, - "learning_rate": 7.525851171513828e-06, - "long_answer_loss": 0.0544, - "loss": 0.0683, - "short_answer_loss": NaN, - "step": 1678, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0844, - "grad_norm": 1.4921875, - "learning_rate": 7.51165443662474e-06, - "long_answer_loss": 0.0844, - "loss": 0.0792, - "short_answer_loss": NaN, - "step": 1679, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0779, - "grad_norm": 1.4296875, - "learning_rate": 7.497465350922802e-06, - "long_answer_loss": 0.0779, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 1680, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0749, - "grad_norm": 1.484375, - "learning_rate": 7.483283936165725e-06, - "long_answer_loss": 0.0749, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1681, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0689, - "grad_norm": 1.4609375, - "learning_rate": 7.469110214099448e-06, - "long_answer_loss": 0.0689, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 1682, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0852, - "grad_norm": 1.453125, - "learning_rate": 7.454944206458123e-06, - "long_answer_loss": 0.0852, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 1683, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0885, - "grad_norm": 1.4453125, - "learning_rate": 7.440785934964077e-06, - "long_answer_loss": 0.0885, - "loss": 0.082, - "short_answer_loss": NaN, - "step": 1684, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0694, - "grad_norm": 1.53125, - "learning_rate": 7.42663542132776e-06, - "long_answer_loss": 0.0694, - "loss": 0.0811, - "short_answer_loss": NaN, - "step": 1685, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0914, - "grad_norm": 1.4921875, - "learning_rate": 7.412492687247744e-06, - "long_answer_loss": 0.0914, - "loss": 0.0827, - "short_answer_loss": NaN, - "step": 1686, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0654, - "grad_norm": 1.40625, - "learning_rate": 7.398357754410653e-06, - "long_answer_loss": 0.0654, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 1687, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0701, - "grad_norm": 1.546875, - "learning_rate": 7.384230644491163e-06, - "long_answer_loss": 0.0701, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 1688, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0637, - "grad_norm": 1.3984375, - "learning_rate": 7.370111379151943e-06, - "long_answer_loss": 0.0637, - "loss": 0.0724, - "short_answer_loss": NaN, - "step": 1689, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0687, - "grad_norm": 1.390625, - "learning_rate": 7.355999980043648e-06, - "long_answer_loss": 0.0687, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 1690, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0837, - "grad_norm": 1.4140625, - "learning_rate": 7.341896468804853e-06, - "long_answer_loss": 0.0837, - "loss": 0.0775, - "short_answer_loss": NaN, - "step": 1691, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0973, - "grad_norm": 1.40625, - "learning_rate": 7.327800867062054e-06, - "long_answer_loss": 0.0973, - "loss": 0.0789, - "short_answer_loss": NaN, - "step": 1692, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0448, - "grad_norm": 1.421875, - "learning_rate": 7.313713196429606e-06, - "long_answer_loss": 0.0448, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 1693, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0771, - "grad_norm": 1.4140625, - "learning_rate": 7.2996334785097055e-06, - "long_answer_loss": 0.0771, - "loss": 0.0783, - "short_answer_loss": NaN, - "step": 1694, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0806, - "grad_norm": 1.4453125, - "learning_rate": 7.285561734892357e-06, - "long_answer_loss": 0.0806, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 1695, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0876, - "grad_norm": 1.40625, - "learning_rate": 7.27149798715534e-06, - "long_answer_loss": 0.0876, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 1696, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0584, - "grad_norm": 1.4296875, - "learning_rate": 7.2574422568641635e-06, - "long_answer_loss": 0.0584, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 1697, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0629, - "grad_norm": 1.421875, - "learning_rate": 7.243394565572051e-06, - "long_answer_loss": 0.0629, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1698, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0778, - "grad_norm": 1.4140625, - "learning_rate": 7.2293549348199e-06, - "long_answer_loss": 0.0778, - "loss": 0.0739, - "short_answer_loss": NaN, - "step": 1699, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0836, - "grad_norm": 1.3515625, - "learning_rate": 7.21532338613623e-06, - "long_answer_loss": 0.0836, - "loss": 0.0766, - "short_answer_loss": NaN, - "step": 1700, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0734, - "grad_norm": 1.453125, - "learning_rate": 7.201299941037199e-06, - "long_answer_loss": 0.0734, - "loss": 0.0827, - "short_answer_loss": NaN, - "step": 1701, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0595, - "grad_norm": 1.3359375, - "learning_rate": 7.187284621026508e-06, - "long_answer_loss": 0.0595, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 1702, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0779, - "grad_norm": 1.3515625, - "learning_rate": 7.173277447595414e-06, - "long_answer_loss": 0.0779, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 1703, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0778, - "grad_norm": 1.3828125, - "learning_rate": 7.159278442222683e-06, - "long_answer_loss": 0.0778, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 1704, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0756, - "grad_norm": 1.3046875, - "learning_rate": 7.14528762637455e-06, - "long_answer_loss": 0.0756, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 1705, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0649, - "grad_norm": 1.3671875, - "learning_rate": 7.131305021504697e-06, - "long_answer_loss": 0.0649, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 1706, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0722, - "grad_norm": 1.4453125, - "learning_rate": 7.117330649054213e-06, - "long_answer_loss": 0.0722, - "loss": 0.0783, - "short_answer_loss": NaN, - "step": 1707, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0833, - "grad_norm": 1.4375, - "learning_rate": 7.103364530451567e-06, - "long_answer_loss": 0.0833, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 1708, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0821, - "grad_norm": 1.5625, - "learning_rate": 7.089406687112554e-06, - "long_answer_loss": 0.0821, - "loss": 0.08, - "short_answer_loss": NaN, - "step": 1709, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0648, - "grad_norm": 1.390625, - "learning_rate": 7.075457140440312e-06, - "long_answer_loss": 0.0648, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 1710, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0715, - "grad_norm": 1.3359375, - "learning_rate": 7.06151591182522e-06, - "long_answer_loss": 0.0715, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 1711, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0784, - "grad_norm": 1.3828125, - "learning_rate": 7.047583022644938e-06, - "long_answer_loss": 0.0784, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 1712, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0715, - "grad_norm": 1.4453125, - "learning_rate": 7.033658494264309e-06, - "long_answer_loss": 0.0715, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 1713, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0836, - "grad_norm": 1.4765625, - "learning_rate": 7.0197423480353685e-06, - "long_answer_loss": 0.0836, - "loss": 0.0766, - "short_answer_loss": NaN, - "step": 1714, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0661, - "grad_norm": 1.4140625, - "learning_rate": 7.005834605297303e-06, - "long_answer_loss": 0.0661, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 1715, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0971, - "grad_norm": 1.515625, - "learning_rate": 6.9919352873763915e-06, - "long_answer_loss": 0.0971, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 1716, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0748, - "grad_norm": 1.4453125, - "learning_rate": 6.978044415586032e-06, - "long_answer_loss": 0.0748, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 1717, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0727, - "grad_norm": 1.40625, - "learning_rate": 6.9641620112266284e-06, - "long_answer_loss": 0.0727, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 1718, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0756, - "grad_norm": 1.4296875, - "learning_rate": 6.9502880955856385e-06, - "long_answer_loss": 0.0756, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 1719, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0645, - "grad_norm": 1.4296875, - "learning_rate": 6.936422689937475e-06, - "long_answer_loss": 0.0645, - "loss": 0.0801, - "short_answer_loss": NaN, - "step": 1720, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0733, - "grad_norm": 1.359375, - "learning_rate": 6.9225658155435146e-06, - "long_answer_loss": 0.0733, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 1721, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0721, - "grad_norm": 1.3984375, - "learning_rate": 6.9087174936520505e-06, - "long_answer_loss": 0.0721, - "loss": 0.077, - "short_answer_loss": NaN, - "step": 1722, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0769, - "grad_norm": 1.484375, - "learning_rate": 6.89487774549826e-06, - "long_answer_loss": 0.0769, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 1723, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0686, - "grad_norm": 1.5, - "learning_rate": 6.88104659230418e-06, - "long_answer_loss": 0.0686, - "loss": 0.0775, - "short_answer_loss": NaN, - "step": 1724, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0856, - "grad_norm": 1.4453125, - "learning_rate": 6.867224055278648e-06, - "long_answer_loss": 0.0856, - "loss": 0.0792, - "short_answer_loss": NaN, - "step": 1725, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0745, - "grad_norm": 1.34375, - "learning_rate": 6.853410155617321e-06, - "long_answer_loss": 0.0745, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 1726, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0689, - "grad_norm": 1.4375, - "learning_rate": 6.839604914502577e-06, - "long_answer_loss": 0.0689, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 1727, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0613, - "grad_norm": 1.375, - "learning_rate": 6.825808353103542e-06, - "long_answer_loss": 0.0613, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 1728, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0746, - "grad_norm": 1.390625, - "learning_rate": 6.812020492576024e-06, - "long_answer_loss": 0.0746, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 1729, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0736, - "grad_norm": 1.4765625, - "learning_rate": 6.798241354062484e-06, - "long_answer_loss": 0.0736, - "loss": 0.0767, - "short_answer_loss": NaN, - "step": 1730, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0878, - "grad_norm": 1.46875, - "learning_rate": 6.784470958692018e-06, - "long_answer_loss": 0.0878, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 1731, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0788, - "grad_norm": 1.3671875, - "learning_rate": 6.77070932758031e-06, - "long_answer_loss": 0.0788, - "loss": 0.0756, - "short_answer_loss": NaN, - "step": 1732, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0836, - "grad_norm": 1.421875, - "learning_rate": 6.75695648182961e-06, - "long_answer_loss": 0.0836, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 1733, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.076, - "grad_norm": 1.3828125, - "learning_rate": 6.743212442528673e-06, - "long_answer_loss": 0.076, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 1734, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0818, - "grad_norm": 1.34375, - "learning_rate": 6.729477230752796e-06, - "long_answer_loss": 0.0818, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 1735, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0648, - "grad_norm": 1.640625, - "learning_rate": 6.715750867563692e-06, - "long_answer_loss": 0.0648, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 1736, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0825, - "grad_norm": 1.484375, - "learning_rate": 6.7020333740095305e-06, - "long_answer_loss": 0.0825, - "loss": 0.0778, - "short_answer_loss": NaN, - "step": 1737, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0636, - "grad_norm": 1.3125, - "learning_rate": 6.688324771124881e-06, - "long_answer_loss": 0.0636, - "loss": 0.0689, - "short_answer_loss": NaN, - "step": 1738, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0849, - "grad_norm": 1.4765625, - "learning_rate": 6.67462507993067e-06, - "long_answer_loss": 0.0849, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 1739, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0857, - "grad_norm": 1.4765625, - "learning_rate": 6.660934321434166e-06, - "long_answer_loss": 0.0857, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 1740, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0784, - "grad_norm": 1.4140625, - "learning_rate": 6.647252516628936e-06, - "long_answer_loss": 0.0784, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 1741, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0727, - "grad_norm": 1.4609375, - "learning_rate": 6.63357968649482e-06, - "long_answer_loss": 0.0727, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 1742, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.054, - "grad_norm": 1.3515625, - "learning_rate": 6.619915851997899e-06, - "long_answer_loss": 0.054, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 1743, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0541, - "grad_norm": 1.40625, - "learning_rate": 6.606261034090446e-06, - "long_answer_loss": 0.0541, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 1744, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0677, - "grad_norm": 1.5390625, - "learning_rate": 6.592615253710922e-06, - "long_answer_loss": 0.0677, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 1745, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.079, - "grad_norm": 1.4765625, - "learning_rate": 6.5789785317839275e-06, - "long_answer_loss": 0.079, - "loss": 0.0775, - "short_answer_loss": NaN, - "step": 1746, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0649, - "grad_norm": 1.4296875, - "learning_rate": 6.5653508892201675e-06, - "long_answer_loss": 0.0649, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 1747, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0674, - "grad_norm": 1.3125, - "learning_rate": 6.551732346916431e-06, - "long_answer_loss": 0.0674, - "loss": 0.0649, - "short_answer_loss": NaN, - "step": 1748, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0663, - "grad_norm": 1.4140625, - "learning_rate": 6.538122925755549e-06, - "long_answer_loss": 0.0663, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 1749, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0682, - "grad_norm": 1.4375, - "learning_rate": 6.524522646606362e-06, - "long_answer_loss": 0.0682, - "loss": 0.0763, - "short_answer_loss": NaN, - "step": 1750, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0738, - "grad_norm": 1.484375, - "learning_rate": 6.5109315303237026e-06, - "long_answer_loss": 0.0738, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1751, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0788, - "grad_norm": 1.421875, - "learning_rate": 6.4973495977483475e-06, - "long_answer_loss": 0.0788, - "loss": 0.0742, - "short_answer_loss": NaN, - "step": 1752, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0776, - "grad_norm": 1.4375, - "learning_rate": 6.4837768697069755e-06, - "long_answer_loss": 0.0776, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 1753, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0822, - "grad_norm": 1.390625, - "learning_rate": 6.470213367012187e-06, - "long_answer_loss": 0.0822, - "loss": 0.0786, - "short_answer_loss": NaN, - "step": 1754, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0679, - "grad_norm": 1.4140625, - "learning_rate": 6.456659110462402e-06, - "long_answer_loss": 0.0679, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 1755, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0776, - "grad_norm": 1.4453125, - "learning_rate": 6.443114120841874e-06, - "long_answer_loss": 0.0776, - "loss": 0.0828, - "short_answer_loss": NaN, - "step": 1756, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0642, - "grad_norm": 1.453125, - "learning_rate": 6.429578418920653e-06, - "long_answer_loss": 0.0642, - "loss": 0.0752, - "short_answer_loss": NaN, - "step": 1757, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0631, - "grad_norm": 1.3203125, - "learning_rate": 6.41605202545454e-06, - "long_answer_loss": 0.0631, - "loss": 0.0674, - "short_answer_loss": NaN, - "step": 1758, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0583, - "grad_norm": 1.4296875, - "learning_rate": 6.402534961185069e-06, - "long_answer_loss": 0.0583, - "loss": 0.0767, - "short_answer_loss": NaN, - "step": 1759, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0731, - "grad_norm": 1.4140625, - "learning_rate": 6.389027246839452e-06, - "long_answer_loss": 0.0731, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 1760, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0752, - "grad_norm": 1.484375, - "learning_rate": 6.37552890313059e-06, - "long_answer_loss": 0.0752, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 1761, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0836, - "grad_norm": 1.453125, - "learning_rate": 6.362039950756983e-06, - "long_answer_loss": 0.0836, - "loss": 0.0783, - "short_answer_loss": NaN, - "step": 1762, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.074, - "grad_norm": 1.3046875, - "learning_rate": 6.348560410402768e-06, - "long_answer_loss": 0.074, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 1763, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0748, - "grad_norm": 1.3515625, - "learning_rate": 6.3350903027376135e-06, - "long_answer_loss": 0.0748, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 1764, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.08, - "grad_norm": 1.4296875, - "learning_rate": 6.321629648416743e-06, - "long_answer_loss": 0.08, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 1765, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0757, - "grad_norm": 1.5078125, - "learning_rate": 6.308178468080886e-06, - "long_answer_loss": 0.0757, - "loss": 0.0748, - "short_answer_loss": NaN, - "step": 1766, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0829, - "grad_norm": 1.3671875, - "learning_rate": 6.294736782356231e-06, - "long_answer_loss": 0.0829, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 1767, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0741, - "grad_norm": 1.40625, - "learning_rate": 6.281304611854427e-06, - "long_answer_loss": 0.0741, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 1768, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0715, - "grad_norm": 1.4296875, - "learning_rate": 6.2678819771725015e-06, - "long_answer_loss": 0.0715, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 1769, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0707, - "grad_norm": 1.375, - "learning_rate": 6.2544688988929e-06, - "long_answer_loss": 0.0707, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 1770, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0735, - "grad_norm": 1.421875, - "learning_rate": 6.241065397583374e-06, - "long_answer_loss": 0.0735, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 1771, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0703, - "grad_norm": 1.484375, - "learning_rate": 6.227671493797027e-06, - "long_answer_loss": 0.0703, - "loss": 0.0777, - "short_answer_loss": NaN, - "step": 1772, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0697, - "grad_norm": 1.359375, - "learning_rate": 6.214287208072211e-06, - "long_answer_loss": 0.0697, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 1773, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0742, - "grad_norm": 1.4375, - "learning_rate": 6.200912560932554e-06, - "long_answer_loss": 0.0742, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 1774, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0803, - "grad_norm": 1.4453125, - "learning_rate": 6.187547572886897e-06, - "long_answer_loss": 0.0803, - "loss": 0.0789, - "short_answer_loss": NaN, - "step": 1775, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0582, - "grad_norm": 1.296875, - "learning_rate": 6.174192264429256e-06, - "long_answer_loss": 0.0582, - "loss": 0.067, - "short_answer_loss": NaN, - "step": 1776, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0649, - "grad_norm": 1.3359375, - "learning_rate": 6.160846656038835e-06, - "long_answer_loss": 0.0649, - "loss": 0.0689, - "short_answer_loss": NaN, - "step": 1777, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0749, - "grad_norm": 1.4296875, - "learning_rate": 6.147510768179924e-06, - "long_answer_loss": 0.0749, - "loss": 0.0772, - "short_answer_loss": NaN, - "step": 1778, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0704, - "grad_norm": 1.40625, - "learning_rate": 6.134184621301952e-06, - "long_answer_loss": 0.0704, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 1779, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.081, - "grad_norm": 1.390625, - "learning_rate": 6.120868235839369e-06, - "long_answer_loss": 0.081, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 1780, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0757, - "grad_norm": 1.53125, - "learning_rate": 6.107561632211683e-06, - "long_answer_loss": 0.0757, - "loss": 0.078, - "short_answer_loss": NaN, - "step": 1781, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0741, - "grad_norm": 1.3359375, - "learning_rate": 6.094264830823395e-06, - "long_answer_loss": 0.0741, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 1782, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0723, - "grad_norm": 1.375, - "learning_rate": 6.0809778520639734e-06, - "long_answer_loss": 0.0723, - "loss": 0.0748, - "short_answer_loss": NaN, - "step": 1783, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0763, - "grad_norm": 1.4140625, - "learning_rate": 6.067700716307827e-06, - "long_answer_loss": 0.0763, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 1784, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0613, - "grad_norm": 1.484375, - "learning_rate": 6.05443344391427e-06, - "long_answer_loss": 0.0613, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 1785, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0635, - "grad_norm": 1.453125, - "learning_rate": 6.041176055227498e-06, - "long_answer_loss": 0.0635, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 1786, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0855, - "grad_norm": 1.421875, - "learning_rate": 6.027928570576528e-06, - "long_answer_loss": 0.0855, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 1787, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0857, - "grad_norm": 1.40625, - "learning_rate": 6.014691010275231e-06, - "long_answer_loss": 0.0857, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 1788, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0804, - "grad_norm": 1.4609375, - "learning_rate": 6.001463394622217e-06, - "long_answer_loss": 0.0804, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 1789, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0486, - "grad_norm": 1.390625, - "learning_rate": 5.988245743900874e-06, - "long_answer_loss": 0.0486, - "loss": 0.0699, - "short_answer_loss": NaN, - "step": 1790, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0874, - "grad_norm": 1.34375, - "learning_rate": 5.975038078379299e-06, - "long_answer_loss": 0.0874, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 1791, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0758, - "grad_norm": 1.453125, - "learning_rate": 5.96184041831028e-06, - "long_answer_loss": 0.0758, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 1792, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0746, - "grad_norm": 1.421875, - "learning_rate": 5.948652783931266e-06, - "long_answer_loss": 0.0746, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 1793, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0804, - "grad_norm": 1.4453125, - "learning_rate": 5.935475195464326e-06, - "long_answer_loss": 0.0804, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 1794, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0702, - "grad_norm": 1.6796875, - "learning_rate": 5.922307673116132e-06, - "long_answer_loss": 0.0702, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 1795, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0683, - "grad_norm": 1.3828125, - "learning_rate": 5.909150237077908e-06, - "long_answer_loss": 0.0683, - "loss": 0.0748, - "short_answer_loss": NaN, - "step": 1796, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0718, - "grad_norm": 1.390625, - "learning_rate": 5.896002907525424e-06, - "long_answer_loss": 0.0718, - "loss": 0.0727, - "short_answer_loss": NaN, - "step": 1797, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0745, - "grad_norm": 1.5234375, - "learning_rate": 5.8828657046189474e-06, - "long_answer_loss": 0.0745, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 1798, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0631, - "grad_norm": 1.4375, - "learning_rate": 5.86973864850322e-06, - "long_answer_loss": 0.0631, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 1799, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0641, - "grad_norm": 1.3984375, - "learning_rate": 5.856621759307421e-06, - "long_answer_loss": 0.0641, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 1800, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0829, - "grad_norm": 1.3515625, - "learning_rate": 5.843515057145139e-06, - "long_answer_loss": 0.0829, - "loss": 0.0743, - "short_answer_loss": NaN, - "step": 1801, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0813, - "grad_norm": 1.4296875, - "learning_rate": 5.830418562114348e-06, - "long_answer_loss": 0.0813, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 1802, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0873, - "grad_norm": 1.453125, - "learning_rate": 5.8173322942973634e-06, - "long_answer_loss": 0.0873, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 1803, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0811, - "grad_norm": 1.5859375, - "learning_rate": 5.804256273760819e-06, - "long_answer_loss": 0.0811, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1804, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0725, - "grad_norm": 1.4921875, - "learning_rate": 5.791190520555645e-06, - "long_answer_loss": 0.0725, - "loss": 0.0752, - "short_answer_loss": NaN, - "step": 1805, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.088, - "grad_norm": 1.390625, - "learning_rate": 5.778135054717008e-06, - "long_answer_loss": 0.088, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 1806, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0912, - "grad_norm": 1.390625, - "learning_rate": 5.7650898962643165e-06, - "long_answer_loss": 0.0912, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 1807, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0726, - "grad_norm": 1.3828125, - "learning_rate": 5.752055065201167e-06, - "long_answer_loss": 0.0726, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 1808, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0637, - "grad_norm": 1.421875, - "learning_rate": 5.739030581515324e-06, - "long_answer_loss": 0.0637, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 1809, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0958, - "grad_norm": 1.40625, - "learning_rate": 5.726016465178681e-06, - "long_answer_loss": 0.0958, - "loss": 0.0752, - "short_answer_loss": NaN, - "step": 1810, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.07, - "grad_norm": 1.3671875, - "learning_rate": 5.7130127361472345e-06, - "long_answer_loss": 0.07, - "loss": 0.0724, - "short_answer_loss": NaN, - "step": 1811, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0828, - "grad_norm": 1.40625, - "learning_rate": 5.700019414361059e-06, - "long_answer_loss": 0.0828, - "loss": 0.0777, - "short_answer_loss": NaN, - "step": 1812, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0673, - "grad_norm": 1.4140625, - "learning_rate": 5.687036519744251e-06, - "long_answer_loss": 0.0673, - "loss": 0.0714, - "short_answer_loss": NaN, - "step": 1813, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0633, - "grad_norm": 1.34375, - "learning_rate": 5.674064072204953e-06, - "long_answer_loss": 0.0633, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 1814, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.084, - "grad_norm": 1.390625, - "learning_rate": 5.661102091635251e-06, - "long_answer_loss": 0.084, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 1815, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0624, - "grad_norm": 1.3359375, - "learning_rate": 5.648150597911203e-06, - "long_answer_loss": 0.0624, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 1816, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0526, - "grad_norm": 1.3828125, - "learning_rate": 5.635209610892779e-06, - "long_answer_loss": 0.0526, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 1817, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0765, - "grad_norm": 1.3828125, - "learning_rate": 5.622279150423839e-06, - "long_answer_loss": 0.0765, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 1818, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0944, - "grad_norm": 1.375, - "learning_rate": 5.609359236332107e-06, - "long_answer_loss": 0.0944, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 1819, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0833, - "grad_norm": 1.4140625, - "learning_rate": 5.596449888429116e-06, - "long_answer_loss": 0.0833, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 1820, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0754, - "grad_norm": 1.390625, - "learning_rate": 5.5835511265102265e-06, - "long_answer_loss": 0.0754, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 1821, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.059, - "grad_norm": 1.390625, - "learning_rate": 5.570662970354536e-06, - "long_answer_loss": 0.059, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 1822, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0896, - "grad_norm": 1.453125, - "learning_rate": 5.557785439724908e-06, - "long_answer_loss": 0.0896, - "loss": 0.0792, - "short_answer_loss": NaN, - "step": 1823, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0861, - "grad_norm": 1.4921875, - "learning_rate": 5.544918554367879e-06, - "long_answer_loss": 0.0861, - "loss": 0.0766, - "short_answer_loss": NaN, - "step": 1824, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0703, - "grad_norm": 1.421875, - "learning_rate": 5.532062334013703e-06, - "long_answer_loss": 0.0703, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 1825, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0631, - "grad_norm": 1.4609375, - "learning_rate": 5.5192167983762425e-06, - "long_answer_loss": 0.0631, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 1826, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0906, - "grad_norm": 1.453125, - "learning_rate": 5.5063819671529935e-06, - "long_answer_loss": 0.0906, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 1827, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0906, - "grad_norm": 1.515625, - "learning_rate": 5.493557860025042e-06, - "long_answer_loss": 0.0906, - "loss": 0.0806, - "short_answer_loss": NaN, - "step": 1828, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0662, - "grad_norm": 1.53125, - "learning_rate": 5.48074449665701e-06, - "long_answer_loss": 0.0662, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 1829, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.075, - "grad_norm": 1.46875, - "learning_rate": 5.467941896697075e-06, - "long_answer_loss": 0.075, - "loss": 0.0777, - "short_answer_loss": NaN, - "step": 1830, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0731, - "grad_norm": 1.4375, - "learning_rate": 5.455150079776876e-06, - "long_answer_loss": 0.0731, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 1831, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0772, - "grad_norm": 1.46875, - "learning_rate": 5.442369065511552e-06, - "long_answer_loss": 0.0772, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 1832, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0597, - "grad_norm": 1.5859375, - "learning_rate": 5.429598873499643e-06, - "long_answer_loss": 0.0597, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 1833, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0769, - "grad_norm": 1.4140625, - "learning_rate": 5.416839523323118e-06, - "long_answer_loss": 0.0769, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 1834, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.088, - "grad_norm": 1.4609375, - "learning_rate": 5.404091034547311e-06, - "long_answer_loss": 0.088, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 1835, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0951, - "grad_norm": 1.453125, - "learning_rate": 5.391353426720904e-06, - "long_answer_loss": 0.0951, - "loss": 0.0743, - "short_answer_loss": NaN, - "step": 1836, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0712, - "grad_norm": 1.5390625, - "learning_rate": 5.378626719375895e-06, - "long_answer_loss": 0.0712, - "loss": 0.0751, - "short_answer_loss": NaN, - "step": 1837, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0658, - "grad_norm": 1.65625, - "learning_rate": 5.3659109320275565e-06, - "long_answer_loss": 0.0658, - "loss": 0.0772, - "short_answer_loss": NaN, - "step": 1838, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0735, - "grad_norm": 1.34375, - "learning_rate": 5.353206084174439e-06, - "long_answer_loss": 0.0735, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 1839, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.074, - "grad_norm": 1.359375, - "learning_rate": 5.340512195298291e-06, - "long_answer_loss": 0.074, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 1840, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.072, - "grad_norm": 1.3828125, - "learning_rate": 5.327829284864076e-06, - "long_answer_loss": 0.072, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 1841, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0797, - "grad_norm": 1.328125, - "learning_rate": 5.315157372319915e-06, - "long_answer_loss": 0.0797, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 1842, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0609, - "grad_norm": 1.40625, - "learning_rate": 5.302496477097067e-06, - "long_answer_loss": 0.0609, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 1843, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.1022, - "grad_norm": 1.328125, - "learning_rate": 5.2898466186098934e-06, - "long_answer_loss": 0.1022, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 1844, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0842, - "grad_norm": 1.4296875, - "learning_rate": 5.277207816255838e-06, - "long_answer_loss": 0.0842, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 1845, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0753, - "grad_norm": 1.3046875, - "learning_rate": 5.264580089415391e-06, - "long_answer_loss": 0.0753, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 1846, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0705, - "grad_norm": 1.3515625, - "learning_rate": 5.25196345745204e-06, - "long_answer_loss": 0.0705, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 1847, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0687, - "grad_norm": 1.4453125, - "learning_rate": 5.239357939712296e-06, - "long_answer_loss": 0.0687, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 1848, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0665, - "grad_norm": 1.3984375, - "learning_rate": 5.226763555525592e-06, - "long_answer_loss": 0.0665, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 1849, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0673, - "grad_norm": 1.390625, - "learning_rate": 5.214180324204307e-06, - "long_answer_loss": 0.0673, - "loss": 0.0717, - "short_answer_loss": NaN, - "step": 1850, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0614, - "grad_norm": 1.3828125, - "learning_rate": 5.201608265043717e-06, - "long_answer_loss": 0.0614, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 1851, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0737, - "grad_norm": 1.375, - "learning_rate": 5.189047397321961e-06, - "long_answer_loss": 0.0737, - "loss": 0.0779, - "short_answer_loss": NaN, - "step": 1852, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0672, - "grad_norm": 1.5078125, - "learning_rate": 5.176497740300021e-06, - "long_answer_loss": 0.0672, - "loss": 0.0796, - "short_answer_loss": NaN, - "step": 1853, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0734, - "grad_norm": 1.421875, - "learning_rate": 5.1639593132216864e-06, - "long_answer_loss": 0.0734, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 1854, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0751, - "grad_norm": 1.296875, - "learning_rate": 5.151432135313529e-06, - "long_answer_loss": 0.0751, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 1855, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0676, - "grad_norm": 1.4609375, - "learning_rate": 5.138916225784871e-06, - "long_answer_loss": 0.0676, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 1856, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0746, - "grad_norm": 1.3671875, - "learning_rate": 5.126411603827748e-06, - "long_answer_loss": 0.0746, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 1857, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0684, - "grad_norm": 1.3984375, - "learning_rate": 5.113918288616894e-06, - "long_answer_loss": 0.0684, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 1858, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0682, - "grad_norm": 1.3671875, - "learning_rate": 5.101436299309706e-06, - "long_answer_loss": 0.0682, - "loss": 0.0659, - "short_answer_loss": NaN, - "step": 1859, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0737, - "grad_norm": 1.4140625, - "learning_rate": 5.088965655046213e-06, - "long_answer_loss": 0.0737, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 1860, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0797, - "grad_norm": 1.5234375, - "learning_rate": 5.076506374949043e-06, - "long_answer_loss": 0.0797, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 1861, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0679, - "grad_norm": 1.453125, - "learning_rate": 5.0640584781234016e-06, - "long_answer_loss": 0.0679, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 1862, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0815, - "grad_norm": 1.3984375, - "learning_rate": 5.051621983657042e-06, - "long_answer_loss": 0.0815, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 1863, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0682, - "grad_norm": 1.3125, - "learning_rate": 5.039196910620224e-06, - "long_answer_loss": 0.0682, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 1864, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0706, - "grad_norm": 1.421875, - "learning_rate": 5.026783278065708e-06, - "long_answer_loss": 0.0706, - "loss": 0.0743, - "short_answer_loss": NaN, - "step": 1865, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0842, - "grad_norm": 1.4453125, - "learning_rate": 5.01438110502869e-06, - "long_answer_loss": 0.0842, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 1866, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0686, - "grad_norm": 1.4609375, - "learning_rate": 5.00199041052682e-06, - "long_answer_loss": 0.0686, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 1867, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0551, - "grad_norm": 1.390625, - "learning_rate": 4.989611213560123e-06, - "long_answer_loss": 0.0551, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 1868, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0633, - "grad_norm": 1.4921875, - "learning_rate": 4.977243533111008e-06, - "long_answer_loss": 0.0633, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 1869, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0895, - "grad_norm": 1.3828125, - "learning_rate": 4.9648873881442185e-06, - "long_answer_loss": 0.0895, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 1870, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0688, - "grad_norm": 1.3671875, - "learning_rate": 4.9525427976068124e-06, - "long_answer_loss": 0.0688, - "loss": 0.077, - "short_answer_loss": NaN, - "step": 1871, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0936, - "grad_norm": 1.46875, - "learning_rate": 4.940209780428133e-06, - "long_answer_loss": 0.0936, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 1872, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0523, - "grad_norm": 1.484375, - "learning_rate": 4.927888355519758e-06, - "long_answer_loss": 0.0523, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 1873, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0758, - "grad_norm": 1.3828125, - "learning_rate": 4.915578541775523e-06, - "long_answer_loss": 0.0758, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 1874, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0731, - "grad_norm": 1.359375, - "learning_rate": 4.90328035807142e-06, - "long_answer_loss": 0.0731, - "loss": 0.0686, - "short_answer_loss": NaN, - "step": 1875, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.074, - "grad_norm": 1.3671875, - "learning_rate": 4.890993823265647e-06, - "long_answer_loss": 0.074, - "loss": 0.068, - "short_answer_loss": NaN, - "step": 1876, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0781, - "grad_norm": 1.3125, - "learning_rate": 4.878718956198504e-06, - "long_answer_loss": 0.0781, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 1877, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0788, - "grad_norm": 1.28125, - "learning_rate": 4.866455775692421e-06, - "long_answer_loss": 0.0788, - "loss": 0.068, - "short_answer_loss": NaN, - "step": 1878, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0767, - "grad_norm": 1.359375, - "learning_rate": 4.854204300551901e-06, - "long_answer_loss": 0.0767, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 1879, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0585, - "grad_norm": 1.3984375, - "learning_rate": 4.841964549563499e-06, - "long_answer_loss": 0.0585, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 1880, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0577, - "grad_norm": 1.40625, - "learning_rate": 4.8297365414957955e-06, - "long_answer_loss": 0.0577, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 1881, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0705, - "grad_norm": 1.390625, - "learning_rate": 4.817520295099348e-06, - "long_answer_loss": 0.0705, - "loss": 0.0766, - "short_answer_loss": NaN, - "step": 1882, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0583, - "grad_norm": 1.46875, - "learning_rate": 4.805315829106708e-06, - "long_answer_loss": 0.0583, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 1883, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0639, - "grad_norm": 1.421875, - "learning_rate": 4.793123162232328e-06, - "long_answer_loss": 0.0639, - "loss": 0.0687, - "short_answer_loss": NaN, - "step": 1884, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0731, - "grad_norm": 1.3828125, - "learning_rate": 4.780942313172602e-06, - "long_answer_loss": 0.0731, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1885, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0624, - "grad_norm": 1.4921875, - "learning_rate": 4.768773300605775e-06, - "long_answer_loss": 0.0624, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 1886, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0717, - "grad_norm": 1.5078125, - "learning_rate": 4.756616143191956e-06, - "long_answer_loss": 0.0717, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 1887, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0768, - "grad_norm": 1.4140625, - "learning_rate": 4.744470859573075e-06, - "long_answer_loss": 0.0768, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 1888, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0806, - "grad_norm": 1.34375, - "learning_rate": 4.732337468372838e-06, - "long_answer_loss": 0.0806, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 1889, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0893, - "grad_norm": 1.3671875, - "learning_rate": 4.720215988196746e-06, - "long_answer_loss": 0.0893, - "loss": 0.0784, - "short_answer_loss": NaN, - "step": 1890, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0811, - "grad_norm": 1.3671875, - "learning_rate": 4.708106437632003e-06, - "long_answer_loss": 0.0811, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 1891, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0611, - "grad_norm": 1.453125, - "learning_rate": 4.6960088352475475e-06, - "long_answer_loss": 0.0611, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 1892, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0724, - "grad_norm": 1.3984375, - "learning_rate": 4.683923199593974e-06, - "long_answer_loss": 0.0724, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 1893, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0715, - "grad_norm": 1.3828125, - "learning_rate": 4.671849549203541e-06, - "long_answer_loss": 0.0715, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 1894, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0695, - "grad_norm": 1.375, - "learning_rate": 4.659787902590125e-06, - "long_answer_loss": 0.0695, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 1895, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0751, - "grad_norm": 1.4375, - "learning_rate": 4.647738278249193e-06, - "long_answer_loss": 0.0751, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 1896, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0703, - "grad_norm": 1.5390625, - "learning_rate": 4.635700694657781e-06, - "long_answer_loss": 0.0703, - "loss": 0.0784, - "short_answer_loss": NaN, - "step": 1897, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.074, - "grad_norm": 1.5234375, - "learning_rate": 4.62367517027446e-06, - "long_answer_loss": 0.074, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 1898, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0738, - "grad_norm": 1.421875, - "learning_rate": 4.6116617235393105e-06, - "long_answer_loss": 0.0738, - "loss": 0.0783, - "short_answer_loss": NaN, - "step": 1899, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0733, - "grad_norm": 1.3828125, - "learning_rate": 4.599660372873883e-06, - "long_answer_loss": 0.0733, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 1900, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.061, - "grad_norm": 1.3984375, - "learning_rate": 4.587671136681203e-06, - "long_answer_loss": 0.061, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 1901, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0717, - "grad_norm": 1.4140625, - "learning_rate": 4.575694033345691e-06, - "long_answer_loss": 0.0717, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 1902, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0633, - "grad_norm": 1.390625, - "learning_rate": 4.563729081233184e-06, - "long_answer_loss": 0.0633, - "loss": 0.0674, - "short_answer_loss": NaN, - "step": 1903, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0738, - "grad_norm": 1.4375, - "learning_rate": 4.551776298690875e-06, - "long_answer_loss": 0.0738, - "loss": 0.0743, - "short_answer_loss": NaN, - "step": 1904, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0762, - "grad_norm": 1.4296875, - "learning_rate": 4.539835704047304e-06, - "long_answer_loss": 0.0762, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1905, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0719, - "grad_norm": 1.390625, - "learning_rate": 4.527907315612315e-06, - "long_answer_loss": 0.0719, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 1906, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0759, - "grad_norm": 1.40625, - "learning_rate": 4.515991151677038e-06, - "long_answer_loss": 0.0759, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 1907, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0911, - "grad_norm": 1.34375, - "learning_rate": 4.504087230513862e-06, - "long_answer_loss": 0.0911, - "loss": 0.0763, - "short_answer_loss": NaN, - "step": 1908, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0789, - "grad_norm": 1.34375, - "learning_rate": 4.492195570376391e-06, - "long_answer_loss": 0.0789, - "loss": 0.0666, - "short_answer_loss": NaN, - "step": 1909, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0783, - "grad_norm": 1.453125, - "learning_rate": 4.480316189499436e-06, - "long_answer_loss": 0.0783, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 1910, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.084, - "grad_norm": 1.375, - "learning_rate": 4.468449106098983e-06, - "long_answer_loss": 0.084, - "loss": 0.0727, - "short_answer_loss": NaN, - "step": 1911, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0628, - "grad_norm": 1.4296875, - "learning_rate": 4.456594338372151e-06, - "long_answer_loss": 0.0628, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 1912, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0656, - "grad_norm": 1.390625, - "learning_rate": 4.4447519044971815e-06, - "long_answer_loss": 0.0656, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 1913, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0656, - "grad_norm": 1.4609375, - "learning_rate": 4.4329218226333995e-06, - "long_answer_loss": 0.0656, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 1914, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0662, - "grad_norm": 1.4609375, - "learning_rate": 4.421104110921191e-06, - "long_answer_loss": 0.0662, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 1915, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0976, - "grad_norm": 1.4765625, - "learning_rate": 4.4092987874819704e-06, - "long_answer_loss": 0.0976, - "loss": 0.0774, - "short_answer_loss": NaN, - "step": 1916, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0638, - "grad_norm": 1.3984375, - "learning_rate": 4.397505870418162e-06, - "long_answer_loss": 0.0638, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 1917, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.088, - "grad_norm": 1.328125, - "learning_rate": 4.385725377813163e-06, - "long_answer_loss": 0.088, - "loss": 0.0771, - "short_answer_loss": NaN, - "step": 1918, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0714, - "grad_norm": 1.4375, - "learning_rate": 4.3739573277313095e-06, - "long_answer_loss": 0.0714, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 1919, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0754, - "grad_norm": 1.3828125, - "learning_rate": 4.3622017382178735e-06, - "long_answer_loss": 0.0754, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 1920, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0719, - "grad_norm": 1.4375, - "learning_rate": 4.35045862729901e-06, - "long_answer_loss": 0.0719, - "loss": 0.0764, - "short_answer_loss": NaN, - "step": 1921, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0689, - "grad_norm": 1.3984375, - "learning_rate": 4.338728012981743e-06, - "long_answer_loss": 0.0689, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 1922, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.066, - "grad_norm": 1.375, - "learning_rate": 4.327009913253934e-06, - "long_answer_loss": 0.066, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 1923, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.054, - "grad_norm": 1.484375, - "learning_rate": 4.3153043460842504e-06, - "long_answer_loss": 0.054, - "loss": 0.0706, - "short_answer_loss": NaN, - "step": 1924, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0484, - "grad_norm": 1.4921875, - "learning_rate": 4.303611329422154e-06, - "long_answer_loss": 0.0484, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 1925, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0765, - "grad_norm": 1.34375, - "learning_rate": 4.2919308811978364e-06, - "long_answer_loss": 0.0765, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 1926, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0781, - "grad_norm": 1.359375, - "learning_rate": 4.28026301932225e-06, - "long_answer_loss": 0.0781, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 1927, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0638, - "grad_norm": 1.4296875, - "learning_rate": 4.268607761687019e-06, - "long_answer_loss": 0.0638, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 1928, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0806, - "grad_norm": 1.390625, - "learning_rate": 4.256965126164454e-06, - "long_answer_loss": 0.0806, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 1929, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.056, - "grad_norm": 1.359375, - "learning_rate": 4.245335130607508e-06, - "long_answer_loss": 0.056, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 1930, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0765, - "grad_norm": 1.359375, - "learning_rate": 4.233717792849754e-06, - "long_answer_loss": 0.0765, - "loss": 0.0677, - "short_answer_loss": NaN, - "step": 1931, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0683, - "grad_norm": 1.3984375, - "learning_rate": 4.222113130705352e-06, - "long_answer_loss": 0.0683, - "loss": 0.0683, - "short_answer_loss": NaN, - "step": 1932, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0865, - "grad_norm": 1.4921875, - "learning_rate": 4.210521161969018e-06, - "long_answer_loss": 0.0865, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 1933, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0747, - "grad_norm": 1.3359375, - "learning_rate": 4.198941904416027e-06, - "long_answer_loss": 0.0747, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 1934, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0633, - "grad_norm": 1.3984375, - "learning_rate": 4.18737537580213e-06, - "long_answer_loss": 0.0633, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 1935, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0704, - "grad_norm": 1.3671875, - "learning_rate": 4.175821593863595e-06, - "long_answer_loss": 0.0704, - "loss": 0.0677, - "short_answer_loss": NaN, - "step": 1936, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0515, - "grad_norm": 1.3515625, - "learning_rate": 4.164280576317106e-06, - "long_answer_loss": 0.0515, - "loss": 0.0686, - "short_answer_loss": NaN, - "step": 1937, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0735, - "grad_norm": 1.3671875, - "learning_rate": 4.152752340859814e-06, - "long_answer_loss": 0.0735, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 1938, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0694, - "grad_norm": 1.421875, - "learning_rate": 4.1412369051692336e-06, - "long_answer_loss": 0.0694, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 1939, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0722, - "grad_norm": 1.4140625, - "learning_rate": 4.129734286903275e-06, - "long_answer_loss": 0.0722, - "loss": 0.069, - "short_answer_loss": NaN, - "step": 1940, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.055, - "grad_norm": 1.4453125, - "learning_rate": 4.118244503700189e-06, - "long_answer_loss": 0.055, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 1941, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0641, - "grad_norm": 1.3515625, - "learning_rate": 4.106767573178531e-06, - "long_answer_loss": 0.0641, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 1942, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0684, - "grad_norm": 1.3359375, - "learning_rate": 4.095303512937176e-06, - "long_answer_loss": 0.0684, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 1943, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0839, - "grad_norm": 1.4609375, - "learning_rate": 4.083852340555233e-06, - "long_answer_loss": 0.0839, - "loss": 0.0756, - "short_answer_loss": NaN, - "step": 1944, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0574, - "grad_norm": 1.34375, - "learning_rate": 4.072414073592076e-06, - "long_answer_loss": 0.0574, - "loss": 0.068, - "short_answer_loss": NaN, - "step": 1945, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0798, - "grad_norm": 1.4296875, - "learning_rate": 4.060988729587267e-06, - "long_answer_loss": 0.0798, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 1946, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0727, - "grad_norm": 1.515625, - "learning_rate": 4.0495763260605654e-06, - "long_answer_loss": 0.0727, - "loss": 0.0763, - "short_answer_loss": NaN, - "step": 1947, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0754, - "grad_norm": 1.453125, - "learning_rate": 4.038176880511883e-06, - "long_answer_loss": 0.0754, - "loss": 0.0731, - "short_answer_loss": NaN, - "step": 1948, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0821, - "grad_norm": 1.3828125, - "learning_rate": 4.026790410421262e-06, - "long_answer_loss": 0.0821, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 1949, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0685, - "grad_norm": 1.4296875, - "learning_rate": 4.015416933248853e-06, - "long_answer_loss": 0.0685, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 1950, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0678, - "grad_norm": 1.328125, - "learning_rate": 4.0040564664348665e-06, - "long_answer_loss": 0.0678, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 1951, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0633, - "grad_norm": 1.359375, - "learning_rate": 3.992709027399588e-06, - "long_answer_loss": 0.0633, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 1952, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0734, - "grad_norm": 1.3125, - "learning_rate": 3.9813746335433025e-06, - "long_answer_loss": 0.0734, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 1953, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0726, - "grad_norm": 1.4453125, - "learning_rate": 3.970053302246307e-06, - "long_answer_loss": 0.0726, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 1954, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0718, - "grad_norm": 1.34375, - "learning_rate": 3.958745050868861e-06, - "long_answer_loss": 0.0718, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 1955, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0936, - "grad_norm": 1.4765625, - "learning_rate": 3.947449896751167e-06, - "long_answer_loss": 0.0936, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 1956, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0763, - "grad_norm": 1.390625, - "learning_rate": 3.936167857213349e-06, - "long_answer_loss": 0.0763, - "loss": 0.0782, - "short_answer_loss": NaN, - "step": 1957, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0744, - "grad_norm": 1.390625, - "learning_rate": 3.924898949555415e-06, - "long_answer_loss": 0.0744, - "loss": 0.0789, - "short_answer_loss": NaN, - "step": 1958, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0628, - "grad_norm": 1.3671875, - "learning_rate": 3.9136431910572465e-06, - "long_answer_loss": 0.0628, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 1959, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.06, - "grad_norm": 1.3515625, - "learning_rate": 3.90240059897854e-06, - "long_answer_loss": 0.06, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 1960, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0645, - "grad_norm": 1.453125, - "learning_rate": 3.891171190558833e-06, - "long_answer_loss": 0.0645, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 1961, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0723, - "grad_norm": 1.3359375, - "learning_rate": 3.879954983017421e-06, - "long_answer_loss": 0.0723, - "loss": 0.0686, - "short_answer_loss": NaN, - "step": 1962, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.08, - "grad_norm": 1.3828125, - "learning_rate": 3.868751993553368e-06, - "long_answer_loss": 0.08, - "loss": 0.0714, - "short_answer_loss": NaN, - "step": 1963, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0609, - "grad_norm": 1.3984375, - "learning_rate": 3.8575622393454735e-06, - "long_answer_loss": 0.0609, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 1964, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0603, - "grad_norm": 1.3359375, - "learning_rate": 3.846385737552231e-06, - "long_answer_loss": 0.0603, - "loss": 0.0675, - "short_answer_loss": NaN, - "step": 1965, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0765, - "grad_norm": 1.3984375, - "learning_rate": 3.835222505311822e-06, - "long_answer_loss": 0.0765, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 1966, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0832, - "grad_norm": 1.4453125, - "learning_rate": 3.824072559742076e-06, - "long_answer_loss": 0.0832, - "loss": 0.0748, - "short_answer_loss": NaN, - "step": 1967, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0736, - "grad_norm": 1.53125, - "learning_rate": 3.8129359179404494e-06, - "long_answer_loss": 0.0736, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 1968, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0815, - "grad_norm": 1.5078125, - "learning_rate": 3.801812596984003e-06, - "long_answer_loss": 0.0815, - "loss": 0.0765, - "short_answer_loss": NaN, - "step": 1969, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0847, - "grad_norm": 1.40625, - "learning_rate": 3.790702613929356e-06, - "long_answer_loss": 0.0847, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 1970, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0691, - "grad_norm": 1.453125, - "learning_rate": 3.7796059858126927e-06, - "long_answer_loss": 0.0691, - "loss": 0.0743, - "short_answer_loss": NaN, - "step": 1971, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0592, - "grad_norm": 1.3828125, - "learning_rate": 3.768522729649711e-06, - "long_answer_loss": 0.0592, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 1972, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0794, - "grad_norm": 1.3828125, - "learning_rate": 3.7574528624356036e-06, - "long_answer_loss": 0.0794, - "loss": 0.0727, - "short_answer_loss": NaN, - "step": 1973, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0508, - "grad_norm": 1.328125, - "learning_rate": 3.746396401145036e-06, - "long_answer_loss": 0.0508, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 1974, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0747, - "grad_norm": 1.4375, - "learning_rate": 3.735353362732112e-06, - "long_answer_loss": 0.0747, - "loss": 0.077, - "short_answer_loss": NaN, - "step": 1975, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0796, - "grad_norm": 1.4453125, - "learning_rate": 3.724323764130358e-06, - "long_answer_loss": 0.0796, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 1976, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.07, - "grad_norm": 1.34375, - "learning_rate": 3.713307622252686e-06, - "long_answer_loss": 0.07, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 1977, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0648, - "grad_norm": 1.4609375, - "learning_rate": 3.702304953991383e-06, - "long_answer_loss": 0.0648, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 1978, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0669, - "grad_norm": 1.40625, - "learning_rate": 3.6913157762180544e-06, - "long_answer_loss": 0.0669, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 1979, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.07, - "grad_norm": 1.4296875, - "learning_rate": 3.68034010578365e-06, - "long_answer_loss": 0.07, - "loss": 0.0763, - "short_answer_loss": NaN, - "step": 1980, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0745, - "grad_norm": 1.3515625, - "learning_rate": 3.669377959518379e-06, - "long_answer_loss": 0.0745, - "loss": 0.0763, - "short_answer_loss": NaN, - "step": 1981, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0661, - "grad_norm": 1.3828125, - "learning_rate": 3.658429354231728e-06, - "long_answer_loss": 0.0661, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 1982, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0659, - "grad_norm": 1.484375, - "learning_rate": 3.647494306712415e-06, - "long_answer_loss": 0.0659, - "loss": 0.076, - "short_answer_loss": NaN, - "step": 1983, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0723, - "grad_norm": 1.3671875, - "learning_rate": 3.63657283372837e-06, - "long_answer_loss": 0.0723, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 1984, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0747, - "grad_norm": 1.3671875, - "learning_rate": 3.625664952026711e-06, - "long_answer_loss": 0.0747, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 1985, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0923, - "grad_norm": 1.4375, - "learning_rate": 3.614770678333698e-06, - "long_answer_loss": 0.0923, - "loss": 0.0748, - "short_answer_loss": NaN, - "step": 1986, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0648, - "grad_norm": 1.359375, - "learning_rate": 3.6038900293547536e-06, - "long_answer_loss": 0.0648, - "loss": 0.0676, - "short_answer_loss": NaN, - "step": 1987, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0694, - "grad_norm": 1.484375, - "learning_rate": 3.593023021774375e-06, - "long_answer_loss": 0.0694, - "loss": 0.077, - "short_answer_loss": NaN, - "step": 1988, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0708, - "grad_norm": 1.421875, - "learning_rate": 3.5821696722561735e-06, - "long_answer_loss": 0.0708, - "loss": 0.0756, - "short_answer_loss": NaN, - "step": 1989, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0831, - "grad_norm": 1.3515625, - "learning_rate": 3.571329997442792e-06, - "long_answer_loss": 0.0831, - "loss": 0.0695, - "short_answer_loss": NaN, - "step": 1990, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0713, - "grad_norm": 1.359375, - "learning_rate": 3.560504013955916e-06, - "long_answer_loss": 0.0713, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 1991, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0799, - "grad_norm": 1.3671875, - "learning_rate": 3.549691738396235e-06, - "long_answer_loss": 0.0799, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 1992, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0857, - "grad_norm": 1.421875, - "learning_rate": 3.5388931873434186e-06, - "long_answer_loss": 0.0857, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 1993, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0732, - "grad_norm": 1.3671875, - "learning_rate": 3.528108377356093e-06, - "long_answer_loss": 0.0732, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 1994, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0873, - "grad_norm": 1.375, - "learning_rate": 3.5173373249718035e-06, - "long_answer_loss": 0.0873, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 1995, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.08, - "grad_norm": 1.3359375, - "learning_rate": 3.5065800467070182e-06, - "long_answer_loss": 0.08, - "loss": 0.0742, - "short_answer_loss": NaN, - "step": 1996, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0824, - "grad_norm": 1.40625, - "learning_rate": 3.4958365590570597e-06, - "long_answer_loss": 0.0824, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 1997, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0983, - "grad_norm": 1.4140625, - "learning_rate": 3.485106878496133e-06, - "long_answer_loss": 0.0983, - "loss": 0.0764, - "short_answer_loss": NaN, - "step": 1998, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0669, - "grad_norm": 1.34375, - "learning_rate": 3.4743910214772413e-06, - "long_answer_loss": 0.0669, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 1999, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0593, - "grad_norm": 1.484375, - "learning_rate": 3.4636890044322107e-06, - "long_answer_loss": 0.0593, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 2000, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0589, - "grad_norm": 1.3828125, - "learning_rate": 3.453000843771642e-06, - "long_answer_loss": 0.0589, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 2001, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0815, - "grad_norm": 1.453125, - "learning_rate": 3.442326555884873e-06, - "long_answer_loss": 0.0815, - "loss": 0.081, - "short_answer_loss": NaN, - "step": 2002, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0766, - "grad_norm": 1.3671875, - "learning_rate": 3.4316661571399955e-06, - "long_answer_loss": 0.0766, - "loss": 0.0714, - "short_answer_loss": NaN, - "step": 2003, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0675, - "grad_norm": 1.34375, - "learning_rate": 3.4210196638837745e-06, - "long_answer_loss": 0.0675, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 2004, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0649, - "grad_norm": 1.5625, - "learning_rate": 3.410387092441683e-06, - "long_answer_loss": 0.0649, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 2005, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0683, - "grad_norm": 1.4375, - "learning_rate": 3.3997684591178177e-06, - "long_answer_loss": 0.0683, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 2006, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0721, - "grad_norm": 1.390625, - "learning_rate": 3.389163780194918e-06, - "long_answer_loss": 0.0721, - "loss": 0.0669, - "short_answer_loss": NaN, - "step": 2007, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.059, - "grad_norm": 1.3203125, - "learning_rate": 3.3785730719343226e-06, - "long_answer_loss": 0.059, - "loss": 0.0674, - "short_answer_loss": NaN, - "step": 2008, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0737, - "grad_norm": 1.421875, - "learning_rate": 3.367996350575946e-06, - "long_answer_loss": 0.0737, - "loss": 0.0765, - "short_answer_loss": NaN, - "step": 2009, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0627, - "grad_norm": 1.421875, - "learning_rate": 3.3574336323382595e-06, - "long_answer_loss": 0.0627, - "loss": 0.0648, - "short_answer_loss": NaN, - "step": 2010, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0658, - "grad_norm": 1.4296875, - "learning_rate": 3.3468849334182483e-06, - "long_answer_loss": 0.0658, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 2011, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0719, - "grad_norm": 1.484375, - "learning_rate": 3.3363502699914244e-06, - "long_answer_loss": 0.0719, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 2012, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0886, - "grad_norm": 1.4609375, - "learning_rate": 3.3258296582117474e-06, - "long_answer_loss": 0.0886, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 2013, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0781, - "grad_norm": 1.359375, - "learning_rate": 3.3153231142116617e-06, - "long_answer_loss": 0.0781, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 2014, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0944, - "grad_norm": 1.3671875, - "learning_rate": 3.3048306541020117e-06, - "long_answer_loss": 0.0944, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 2015, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.067, - "grad_norm": 1.4375, - "learning_rate": 3.2943522939720637e-06, - "long_answer_loss": 0.067, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 2016, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0634, - "grad_norm": 1.421875, - "learning_rate": 3.2838880498894568e-06, - "long_answer_loss": 0.0634, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 2017, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0758, - "grad_norm": 1.46875, - "learning_rate": 3.273437937900184e-06, - "long_answer_loss": 0.0758, - "loss": 0.0742, - "short_answer_loss": NaN, - "step": 2018, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0712, - "grad_norm": 1.3828125, - "learning_rate": 3.263001974028568e-06, - "long_answer_loss": 0.0712, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 2019, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0595, - "grad_norm": 1.390625, - "learning_rate": 3.252580174277238e-06, - "long_answer_loss": 0.0595, - "loss": 0.068, - "short_answer_loss": NaN, - "step": 2020, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0628, - "grad_norm": 1.3046875, - "learning_rate": 3.242172554627107e-06, - "long_answer_loss": 0.0628, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 2021, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0682, - "grad_norm": 1.4921875, - "learning_rate": 3.231779131037331e-06, - "long_answer_loss": 0.0682, - "loss": 0.0766, - "short_answer_loss": NaN, - "step": 2022, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0698, - "grad_norm": 1.4609375, - "learning_rate": 3.2213999194453128e-06, - "long_answer_loss": 0.0698, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 2023, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0694, - "grad_norm": 1.4453125, - "learning_rate": 3.211034935766656e-06, - "long_answer_loss": 0.0694, - "loss": 0.076, - "short_answer_loss": NaN, - "step": 2024, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0882, - "grad_norm": 1.3671875, - "learning_rate": 3.2006841958951458e-06, - "long_answer_loss": 0.0882, - "loss": 0.0679, - "short_answer_loss": NaN, - "step": 2025, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0866, - "grad_norm": 1.4375, - "learning_rate": 3.1903477157027266e-06, - "long_answer_loss": 0.0866, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 2026, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0714, - "grad_norm": 1.3125, - "learning_rate": 3.1800255110394806e-06, - "long_answer_loss": 0.0714, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 2027, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0811, - "grad_norm": 1.3359375, - "learning_rate": 3.1697175977335946e-06, - "long_answer_loss": 0.0811, - "loss": 0.07, - "short_answer_loss": NaN, - "step": 2028, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0636, - "grad_norm": 1.34375, - "learning_rate": 3.1594239915913413e-06, - "long_answer_loss": 0.0636, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 2029, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0601, - "grad_norm": 1.375, - "learning_rate": 3.1491447083970586e-06, - "long_answer_loss": 0.0601, - "loss": 0.0666, - "short_answer_loss": NaN, - "step": 2030, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0741, - "grad_norm": 1.3828125, - "learning_rate": 3.138879763913122e-06, - "long_answer_loss": 0.0741, - "loss": 0.0681, - "short_answer_loss": NaN, - "step": 2031, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0805, - "grad_norm": 1.359375, - "learning_rate": 3.1286291738799087e-06, - "long_answer_loss": 0.0805, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 2032, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0692, - "grad_norm": 1.3359375, - "learning_rate": 3.1183929540157973e-06, - "long_answer_loss": 0.0692, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2033, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0677, - "grad_norm": 1.3984375, - "learning_rate": 3.1081711200171266e-06, - "long_answer_loss": 0.0677, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 2034, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0674, - "grad_norm": 1.484375, - "learning_rate": 3.097963687558175e-06, - "long_answer_loss": 0.0674, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 2035, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0793, - "grad_norm": 1.4453125, - "learning_rate": 3.087770672291139e-06, - "long_answer_loss": 0.0793, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 2036, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0796, - "grad_norm": 1.390625, - "learning_rate": 3.077592089846107e-06, - "long_answer_loss": 0.0796, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 2037, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0851, - "grad_norm": 1.4296875, - "learning_rate": 3.0674279558310384e-06, - "long_answer_loss": 0.0851, - "loss": 0.0764, - "short_answer_loss": NaN, - "step": 2038, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0831, - "grad_norm": 1.453125, - "learning_rate": 3.0572782858317244e-06, - "long_answer_loss": 0.0831, - "loss": 0.0777, - "short_answer_loss": NaN, - "step": 2039, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.061, - "grad_norm": 1.3515625, - "learning_rate": 3.0471430954118018e-06, - "long_answer_loss": 0.061, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2040, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0708, - "grad_norm": 1.4296875, - "learning_rate": 3.037022400112678e-06, - "long_answer_loss": 0.0708, - "loss": 0.0714, - "short_answer_loss": NaN, - "step": 2041, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0673, - "grad_norm": 1.4765625, - "learning_rate": 3.02691621545355e-06, - "long_answer_loss": 0.0673, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 2042, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0743, - "grad_norm": 1.3515625, - "learning_rate": 3.0168245569313566e-06, - "long_answer_loss": 0.0743, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 2043, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0795, - "grad_norm": 1.4453125, - "learning_rate": 3.0067474400207672e-06, - "long_answer_loss": 0.0795, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 2044, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0606, - "grad_norm": 1.375, - "learning_rate": 2.996684880174151e-06, - "long_answer_loss": 0.0606, - "loss": 0.0684, - "short_answer_loss": NaN, - "step": 2045, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0719, - "grad_norm": 1.4453125, - "learning_rate": 2.9866368928215456e-06, - "long_answer_loss": 0.0719, - "loss": 0.0724, - "short_answer_loss": NaN, - "step": 2046, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0654, - "grad_norm": 1.3671875, - "learning_rate": 2.9766034933706653e-06, - "long_answer_loss": 0.0654, - "loss": 0.0714, - "short_answer_loss": NaN, - "step": 2047, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0689, - "grad_norm": 1.359375, - "learning_rate": 2.9665846972068285e-06, - "long_answer_loss": 0.0689, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 2048, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0772, - "grad_norm": 1.3203125, - "learning_rate": 2.9565805196929864e-06, - "long_answer_loss": 0.0772, - "loss": 0.067, - "short_answer_loss": NaN, - "step": 2049, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0725, - "grad_norm": 1.4921875, - "learning_rate": 2.946590976169651e-06, - "long_answer_loss": 0.0725, - "loss": 0.0739, - "short_answer_loss": NaN, - "step": 2050, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0687, - "grad_norm": 1.3515625, - "learning_rate": 2.9366160819549087e-06, - "long_answer_loss": 0.0687, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 2051, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0858, - "grad_norm": 1.390625, - "learning_rate": 2.9266558523443776e-06, - "long_answer_loss": 0.0858, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 2052, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0876, - "grad_norm": 1.3671875, - "learning_rate": 2.9167103026111904e-06, - "long_answer_loss": 0.0876, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2053, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0657, - "grad_norm": 1.40625, - "learning_rate": 2.9067794480059735e-06, - "long_answer_loss": 0.0657, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 2054, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0597, - "grad_norm": 1.4375, - "learning_rate": 2.896863303756801e-06, - "long_answer_loss": 0.0597, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 2055, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0737, - "grad_norm": 1.4140625, - "learning_rate": 2.8869618850692227e-06, - "long_answer_loss": 0.0737, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 2056, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.085, - "grad_norm": 1.4140625, - "learning_rate": 2.87707520712617e-06, - "long_answer_loss": 0.085, - "loss": 0.0766, - "short_answer_loss": NaN, - "step": 2057, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0932, - "grad_norm": 1.5390625, - "learning_rate": 2.8672032850880078e-06, - "long_answer_loss": 0.0932, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 2058, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0772, - "grad_norm": 1.4296875, - "learning_rate": 2.857346134092445e-06, - "long_answer_loss": 0.0772, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 2059, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0631, - "grad_norm": 1.3671875, - "learning_rate": 2.847503769254553e-06, - "long_answer_loss": 0.0631, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 2060, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0792, - "grad_norm": 1.40625, - "learning_rate": 2.837676205666731e-06, - "long_answer_loss": 0.0792, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 2061, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.075, - "grad_norm": 1.421875, - "learning_rate": 2.82786345839868e-06, - "long_answer_loss": 0.075, - "loss": 0.0756, - "short_answer_loss": NaN, - "step": 2062, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0733, - "grad_norm": 1.5, - "learning_rate": 2.8180655424973806e-06, - "long_answer_loss": 0.0733, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2063, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0627, - "grad_norm": 1.53125, - "learning_rate": 2.8082824729870642e-06, - "long_answer_loss": 0.0627, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 2064, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0622, - "grad_norm": 1.3046875, - "learning_rate": 2.7985142648692176e-06, - "long_answer_loss": 0.0622, - "loss": 0.0674, - "short_answer_loss": NaN, - "step": 2065, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0718, - "grad_norm": 1.453125, - "learning_rate": 2.7887609331225114e-06, - "long_answer_loss": 0.0718, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2066, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0733, - "grad_norm": 1.546875, - "learning_rate": 2.7790224927028237e-06, - "long_answer_loss": 0.0733, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2067, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0648, - "grad_norm": 1.3671875, - "learning_rate": 2.76929895854319e-06, - "long_answer_loss": 0.0648, - "loss": 0.0714, - "short_answer_loss": NaN, - "step": 2068, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0946, - "grad_norm": 1.46875, - "learning_rate": 2.7595903455537946e-06, - "long_answer_loss": 0.0946, - "loss": 0.0785, - "short_answer_loss": NaN, - "step": 2069, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0813, - "grad_norm": 1.4609375, - "learning_rate": 2.7498966686219347e-06, - "long_answer_loss": 0.0813, - "loss": 0.0687, - "short_answer_loss": NaN, - "step": 2070, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0739, - "grad_norm": 1.53125, - "learning_rate": 2.7402179426120085e-06, - "long_answer_loss": 0.0739, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 2071, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0763, - "grad_norm": 1.3828125, - "learning_rate": 2.730554182365491e-06, - "long_answer_loss": 0.0763, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2072, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0803, - "grad_norm": 1.5234375, - "learning_rate": 2.720905402700892e-06, - "long_answer_loss": 0.0803, - "loss": 0.0779, - "short_answer_loss": NaN, - "step": 2073, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.074, - "grad_norm": 1.46875, - "learning_rate": 2.7112716184137798e-06, - "long_answer_loss": 0.074, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 2074, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0824, - "grad_norm": 1.4296875, - "learning_rate": 2.7016528442766977e-06, - "long_answer_loss": 0.0824, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 2075, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0619, - "grad_norm": 1.421875, - "learning_rate": 2.692049095039191e-06, - "long_answer_loss": 0.0619, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 2076, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0748, - "grad_norm": 1.4296875, - "learning_rate": 2.682460385427761e-06, - "long_answer_loss": 0.0748, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 2077, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.08, - "grad_norm": 1.3828125, - "learning_rate": 2.672886730145846e-06, - "long_answer_loss": 0.08, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 2078, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0673, - "grad_norm": 1.296875, - "learning_rate": 2.6633281438738e-06, - "long_answer_loss": 0.0673, - "loss": 0.0667, - "short_answer_loss": NaN, - "step": 2079, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0622, - "grad_norm": 1.421875, - "learning_rate": 2.6537846412688707e-06, - "long_answer_loss": 0.0622, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 2080, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0714, - "grad_norm": 1.3515625, - "learning_rate": 2.644256236965177e-06, - "long_answer_loss": 0.0714, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 2081, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0814, - "grad_norm": 1.4765625, - "learning_rate": 2.634742945573687e-06, - "long_answer_loss": 0.0814, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 2082, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.095, - "grad_norm": 1.5390625, - "learning_rate": 2.625244781682187e-06, - "long_answer_loss": 0.095, - "loss": 0.0764, - "short_answer_loss": NaN, - "step": 2083, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0839, - "grad_norm": 1.453125, - "learning_rate": 2.6157617598552745e-06, - "long_answer_loss": 0.0839, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 2084, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.089, - "grad_norm": 1.3984375, - "learning_rate": 2.6062938946343248e-06, - "long_answer_loss": 0.089, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 2085, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0586, - "grad_norm": 1.4609375, - "learning_rate": 2.596841200537474e-06, - "long_answer_loss": 0.0586, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 2086, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0648, - "grad_norm": 1.4453125, - "learning_rate": 2.5874036920595937e-06, - "long_answer_loss": 0.0648, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 2087, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0535, - "grad_norm": 1.4375, - "learning_rate": 2.5779813836722677e-06, - "long_answer_loss": 0.0535, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 2088, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0799, - "grad_norm": 1.3984375, - "learning_rate": 2.5685742898237748e-06, - "long_answer_loss": 0.0799, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 2089, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.071, - "grad_norm": 1.46875, - "learning_rate": 2.5591824249390607e-06, - "long_answer_loss": 0.071, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 2090, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0737, - "grad_norm": 1.4140625, - "learning_rate": 2.549805803419725e-06, - "long_answer_loss": 0.0737, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2091, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0818, - "grad_norm": 1.3515625, - "learning_rate": 2.540444439643977e-06, - "long_answer_loss": 0.0818, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2092, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0694, - "grad_norm": 1.3359375, - "learning_rate": 2.5310983479666554e-06, - "long_answer_loss": 0.0694, - "loss": 0.0676, - "short_answer_loss": NaN, - "step": 2093, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0728, - "grad_norm": 1.40625, - "learning_rate": 2.5217675427191555e-06, - "long_answer_loss": 0.0728, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 2094, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.07, - "grad_norm": 1.375, - "learning_rate": 2.5124520382094466e-06, - "long_answer_loss": 0.07, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 2095, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0737, - "grad_norm": 1.4609375, - "learning_rate": 2.5031518487220294e-06, - "long_answer_loss": 0.0737, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 2096, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0711, - "grad_norm": 1.3828125, - "learning_rate": 2.493866988517926e-06, - "long_answer_loss": 0.0711, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 2097, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.074, - "grad_norm": 1.3203125, - "learning_rate": 2.4845974718346503e-06, - "long_answer_loss": 0.074, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 2098, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0747, - "grad_norm": 1.46875, - "learning_rate": 2.4753433128861775e-06, - "long_answer_loss": 0.0747, - "loss": 0.0731, - "short_answer_loss": NaN, - "step": 2099, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0688, - "grad_norm": 1.4609375, - "learning_rate": 2.466104525862957e-06, - "long_answer_loss": 0.0688, - "loss": 0.0684, - "short_answer_loss": NaN, - "step": 2100, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0772, - "grad_norm": 1.3828125, - "learning_rate": 2.456881124931837e-06, - "long_answer_loss": 0.0772, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 2101, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0615, - "grad_norm": 1.390625, - "learning_rate": 2.447673124236102e-06, - "long_answer_loss": 0.0615, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 2102, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0754, - "grad_norm": 1.4140625, - "learning_rate": 2.438480537895399e-06, - "long_answer_loss": 0.0754, - "loss": 0.0751, - "short_answer_loss": NaN, - "step": 2103, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0663, - "grad_norm": 1.3828125, - "learning_rate": 2.4293033800057486e-06, - "long_answer_loss": 0.0663, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 2104, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0951, - "grad_norm": 1.4609375, - "learning_rate": 2.4201416646395123e-06, - "long_answer_loss": 0.0951, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 2105, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0688, - "grad_norm": 1.421875, - "learning_rate": 2.410995405845369e-06, - "long_answer_loss": 0.0688, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 2106, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0581, - "grad_norm": 1.4609375, - "learning_rate": 2.4018646176483056e-06, - "long_answer_loss": 0.0581, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 2107, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0743, - "grad_norm": 1.4765625, - "learning_rate": 2.3927493140495653e-06, - "long_answer_loss": 0.0743, - "loss": 0.0739, - "short_answer_loss": NaN, - "step": 2108, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.067, - "grad_norm": 1.390625, - "learning_rate": 2.3836495090266767e-06, - "long_answer_loss": 0.067, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2109, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0935, - "grad_norm": 1.4375, - "learning_rate": 2.3745652165333713e-06, - "long_answer_loss": 0.0935, - "loss": 0.0782, - "short_answer_loss": NaN, - "step": 2110, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0931, - "grad_norm": 1.453125, - "learning_rate": 2.365496450499623e-06, - "long_answer_loss": 0.0931, - "loss": 0.0752, - "short_answer_loss": NaN, - "step": 2111, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0826, - "grad_norm": 1.328125, - "learning_rate": 2.356443224831574e-06, - "long_answer_loss": 0.0826, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 2112, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0823, - "grad_norm": 1.390625, - "learning_rate": 2.3474055534115495e-06, - "long_answer_loss": 0.0823, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 2113, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.072, - "grad_norm": 1.484375, - "learning_rate": 2.338383450098021e-06, - "long_answer_loss": 0.072, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 2114, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0791, - "grad_norm": 1.4609375, - "learning_rate": 2.3293769287255797e-06, - "long_answer_loss": 0.0791, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 2115, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0703, - "grad_norm": 1.421875, - "learning_rate": 2.3203860031049423e-06, - "long_answer_loss": 0.0703, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 2116, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0668, - "grad_norm": 1.3828125, - "learning_rate": 2.311410687022884e-06, - "long_answer_loss": 0.0668, - "loss": 0.0682, - "short_answer_loss": NaN, - "step": 2117, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0626, - "grad_norm": 1.453125, - "learning_rate": 2.302450994242275e-06, - "long_answer_loss": 0.0626, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 2118, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0718, - "grad_norm": 1.4375, - "learning_rate": 2.2935069385020005e-06, - "long_answer_loss": 0.0718, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 2119, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0672, - "grad_norm": 1.4296875, - "learning_rate": 2.2845785335169832e-06, - "long_answer_loss": 0.0672, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 2120, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0851, - "grad_norm": 1.4921875, - "learning_rate": 2.275665792978145e-06, - "long_answer_loss": 0.0851, - "loss": 0.077, - "short_answer_loss": NaN, - "step": 2121, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0813, - "grad_norm": 1.4765625, - "learning_rate": 2.2667687305523836e-06, - "long_answer_loss": 0.0813, - "loss": 0.0806, - "short_answer_loss": NaN, - "step": 2122, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0648, - "grad_norm": 1.421875, - "learning_rate": 2.257887359882563e-06, - "long_answer_loss": 0.0648, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 2123, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.067, - "grad_norm": 1.3828125, - "learning_rate": 2.249021694587471e-06, - "long_answer_loss": 0.067, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 2124, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0626, - "grad_norm": 1.453125, - "learning_rate": 2.2401717482618325e-06, - "long_answer_loss": 0.0626, - "loss": 0.0767, - "short_answer_loss": NaN, - "step": 2125, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0802, - "grad_norm": 1.46875, - "learning_rate": 2.2313375344762465e-06, - "long_answer_loss": 0.0802, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 2126, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.073, - "grad_norm": 1.3203125, - "learning_rate": 2.2225190667772135e-06, - "long_answer_loss": 0.073, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 2127, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0644, - "grad_norm": 1.34375, - "learning_rate": 2.213716358687064e-06, - "long_answer_loss": 0.0644, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 2128, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0668, - "grad_norm": 1.3828125, - "learning_rate": 2.2049294237039745e-06, - "long_answer_loss": 0.0668, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2129, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0681, - "grad_norm": 1.328125, - "learning_rate": 2.1961582753019365e-06, - "long_answer_loss": 0.0681, - "loss": 0.0667, - "short_answer_loss": NaN, - "step": 2130, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0734, - "grad_norm": 1.4296875, - "learning_rate": 2.1874029269307277e-06, - "long_answer_loss": 0.0734, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 2131, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0933, - "grad_norm": 1.4609375, - "learning_rate": 2.1786633920159045e-06, - "long_answer_loss": 0.0933, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 2132, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0676, - "grad_norm": 1.4375, - "learning_rate": 2.1699396839587687e-06, - "long_answer_loss": 0.0676, - "loss": 0.0763, - "short_answer_loss": NaN, - "step": 2133, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0697, - "grad_norm": 1.3671875, - "learning_rate": 2.161231816136361e-06, - "long_answer_loss": 0.0697, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 2134, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0687, - "grad_norm": 1.46875, - "learning_rate": 2.1525398019014197e-06, - "long_answer_loss": 0.0687, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 2135, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0676, - "grad_norm": 1.421875, - "learning_rate": 2.1438636545823843e-06, - "long_answer_loss": 0.0676, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 2136, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0784, - "grad_norm": 1.34375, - "learning_rate": 2.13520338748336e-06, - "long_answer_loss": 0.0784, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 2137, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0865, - "grad_norm": 1.6171875, - "learning_rate": 2.126559013884101e-06, - "long_answer_loss": 0.0865, - "loss": 0.0739, - "short_answer_loss": NaN, - "step": 2138, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0733, - "grad_norm": 1.296875, - "learning_rate": 2.1179305470399897e-06, - "long_answer_loss": 0.0733, - "loss": 0.0669, - "short_answer_loss": NaN, - "step": 2139, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0648, - "grad_norm": 1.4140625, - "learning_rate": 2.109318000182019e-06, - "long_answer_loss": 0.0648, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 2140, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0807, - "grad_norm": 1.4375, - "learning_rate": 2.1007213865167684e-06, - "long_answer_loss": 0.0807, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 2141, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.068, - "grad_norm": 1.4375, - "learning_rate": 2.0921407192263876e-06, - "long_answer_loss": 0.068, - "loss": 0.0743, - "short_answer_loss": NaN, - "step": 2142, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0539, - "grad_norm": 1.5234375, - "learning_rate": 2.083576011468562e-06, - "long_answer_loss": 0.0539, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2143, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0557, - "grad_norm": 1.3984375, - "learning_rate": 2.0750272763765276e-06, - "long_answer_loss": 0.0557, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 2144, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0758, - "grad_norm": 1.3984375, - "learning_rate": 2.066494527059004e-06, - "long_answer_loss": 0.0758, - "loss": 0.0739, - "short_answer_loss": NaN, - "step": 2145, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0755, - "grad_norm": 1.5078125, - "learning_rate": 2.057977776600213e-06, - "long_answer_loss": 0.0755, - "loss": 0.0864, - "short_answer_loss": NaN, - "step": 2146, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0617, - "grad_norm": 1.4765625, - "learning_rate": 2.049477038059838e-06, - "long_answer_loss": 0.0617, - "loss": 0.0688, - "short_answer_loss": NaN, - "step": 2147, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.077, - "grad_norm": 1.3828125, - "learning_rate": 2.040992324473011e-06, - "long_answer_loss": 0.077, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2148, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0776, - "grad_norm": 1.4375, - "learning_rate": 2.0325236488502888e-06, - "long_answer_loss": 0.0776, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2149, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0669, - "grad_norm": 1.390625, - "learning_rate": 2.0240710241776386e-06, - "long_answer_loss": 0.0669, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 2150, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0843, - "grad_norm": 1.3828125, - "learning_rate": 2.0156344634164175e-06, - "long_answer_loss": 0.0843, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 2151, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0759, - "grad_norm": 1.4765625, - "learning_rate": 2.0072139795033333e-06, - "long_answer_loss": 0.0759, - "loss": 0.0731, - "short_answer_loss": NaN, - "step": 2152, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0722, - "grad_norm": 1.3125, - "learning_rate": 1.9988095853504694e-06, - "long_answer_loss": 0.0722, - "loss": 0.0656, - "short_answer_loss": NaN, - "step": 2153, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0819, - "grad_norm": 1.4140625, - "learning_rate": 1.9904212938452128e-06, - "long_answer_loss": 0.0819, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 2154, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0718, - "grad_norm": 1.4375, - "learning_rate": 1.982049117850268e-06, - "long_answer_loss": 0.0718, - "loss": 0.0695, - "short_answer_loss": NaN, - "step": 2155, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0736, - "grad_norm": 1.328125, - "learning_rate": 1.973693070203628e-06, - "long_answer_loss": 0.0736, - "loss": 0.0668, - "short_answer_loss": NaN, - "step": 2156, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.065, - "grad_norm": 1.3125, - "learning_rate": 1.9653531637185545e-06, - "long_answer_loss": 0.065, - "loss": 0.0684, - "short_answer_loss": NaN, - "step": 2157, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0529, - "grad_norm": 1.3984375, - "learning_rate": 1.9570294111835585e-06, - "long_answer_loss": 0.0529, - "loss": 0.0656, - "short_answer_loss": NaN, - "step": 2158, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0708, - "grad_norm": 1.515625, - "learning_rate": 1.948721825362372e-06, - "long_answer_loss": 0.0708, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 2159, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0815, - "grad_norm": 1.3984375, - "learning_rate": 1.9404304189939547e-06, - "long_answer_loss": 0.0815, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 2160, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0794, - "grad_norm": 1.421875, - "learning_rate": 1.9321552047924324e-06, - "long_answer_loss": 0.0794, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 2161, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0801, - "grad_norm": 1.4140625, - "learning_rate": 1.9238961954471294e-06, - "long_answer_loss": 0.0801, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 2162, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0746, - "grad_norm": 1.3828125, - "learning_rate": 1.915653403622497e-06, - "long_answer_loss": 0.0746, - "loss": 0.0687, - "short_answer_loss": NaN, - "step": 2163, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0718, - "grad_norm": 1.34375, - "learning_rate": 1.9074268419581294e-06, - "long_answer_loss": 0.0718, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 2164, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0748, - "grad_norm": 1.5, - "learning_rate": 1.8992165230687336e-06, - "long_answer_loss": 0.0748, - "loss": 0.0742, - "short_answer_loss": NaN, - "step": 2165, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0721, - "grad_norm": 1.2890625, - "learning_rate": 1.891022459544109e-06, - "long_answer_loss": 0.0721, - "loss": 0.07, - "short_answer_loss": NaN, - "step": 2166, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0715, - "grad_norm": 1.4140625, - "learning_rate": 1.8828446639491279e-06, - "long_answer_loss": 0.0715, - "loss": 0.0682, - "short_answer_loss": NaN, - "step": 2167, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0796, - "grad_norm": 1.3359375, - "learning_rate": 1.874683148823711e-06, - "long_answer_loss": 0.0796, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 2168, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0613, - "grad_norm": 1.3984375, - "learning_rate": 1.866537926682832e-06, - "long_answer_loss": 0.0613, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 2169, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0766, - "grad_norm": 1.4609375, - "learning_rate": 1.858409010016457e-06, - "long_answer_loss": 0.0766, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 2170, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0596, - "grad_norm": 1.3515625, - "learning_rate": 1.8502964112895731e-06, - "long_answer_loss": 0.0596, - "loss": 0.0669, - "short_answer_loss": NaN, - "step": 2171, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0599, - "grad_norm": 1.375, - "learning_rate": 1.8422001429421257e-06, - "long_answer_loss": 0.0599, - "loss": 0.0684, - "short_answer_loss": NaN, - "step": 2172, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0727, - "grad_norm": 1.4765625, - "learning_rate": 1.8341202173890292e-06, - "long_answer_loss": 0.0727, - "loss": 0.0684, - "short_answer_loss": NaN, - "step": 2173, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0634, - "grad_norm": 1.4140625, - "learning_rate": 1.8260566470201343e-06, - "long_answer_loss": 0.0634, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 2174, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0845, - "grad_norm": 1.3515625, - "learning_rate": 1.8180094442002165e-06, - "long_answer_loss": 0.0845, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 2175, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0824, - "grad_norm": 1.3984375, - "learning_rate": 1.8099786212689498e-06, - "long_answer_loss": 0.0824, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 2176, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.068, - "grad_norm": 1.34375, - "learning_rate": 1.8019641905408862e-06, - "long_answer_loss": 0.068, - "loss": 0.0681, - "short_answer_loss": NaN, - "step": 2177, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0848, - "grad_norm": 1.5, - "learning_rate": 1.7939661643054564e-06, - "long_answer_loss": 0.0848, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 2178, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0738, - "grad_norm": 1.4140625, - "learning_rate": 1.7859845548269193e-06, - "long_answer_loss": 0.0738, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 2179, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0664, - "grad_norm": 1.4296875, - "learning_rate": 1.7780193743443697e-06, - "long_answer_loss": 0.0664, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 2180, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0576, - "grad_norm": 1.40625, - "learning_rate": 1.7700706350717093e-06, - "long_answer_loss": 0.0576, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 2181, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0693, - "grad_norm": 1.3984375, - "learning_rate": 1.7621383491976256e-06, - "long_answer_loss": 0.0693, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 2182, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0641, - "grad_norm": 1.4296875, - "learning_rate": 1.7542225288855796e-06, - "long_answer_loss": 0.0641, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 2183, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0667, - "grad_norm": 1.3671875, - "learning_rate": 1.7463231862737822e-06, - "long_answer_loss": 0.0667, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 2184, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.068, - "grad_norm": 1.5, - "learning_rate": 1.7384403334751802e-06, - "long_answer_loss": 0.068, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 2185, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0851, - "grad_norm": 1.3828125, - "learning_rate": 1.7305739825774228e-06, - "long_answer_loss": 0.0851, - "loss": 0.0663, - "short_answer_loss": NaN, - "step": 2186, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0742, - "grad_norm": 1.453125, - "learning_rate": 1.722724145642876e-06, - "long_answer_loss": 0.0742, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 2187, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0611, - "grad_norm": 1.4296875, - "learning_rate": 1.7148908347085616e-06, - "long_answer_loss": 0.0611, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 2188, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0835, - "grad_norm": 1.4140625, - "learning_rate": 1.7070740617861736e-06, - "long_answer_loss": 0.0835, - "loss": 0.066, - "short_answer_loss": NaN, - "step": 2189, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0747, - "grad_norm": 1.375, - "learning_rate": 1.6992738388620408e-06, - "long_answer_loss": 0.0747, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 2190, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0906, - "grad_norm": 1.3671875, - "learning_rate": 1.691490177897119e-06, - "long_answer_loss": 0.0906, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 2191, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0859, - "grad_norm": 1.4375, - "learning_rate": 1.6837230908269623e-06, - "long_answer_loss": 0.0859, - "loss": 0.0782, - "short_answer_loss": NaN, - "step": 2192, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.1052, - "grad_norm": 1.5703125, - "learning_rate": 1.6759725895617113e-06, - "long_answer_loss": 0.1052, - "loss": 0.0775, - "short_answer_loss": NaN, - "step": 2193, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0771, - "grad_norm": 1.4140625, - "learning_rate": 1.6682386859860774e-06, - "long_answer_loss": 0.0771, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 2194, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.081, - "grad_norm": 1.40625, - "learning_rate": 1.66052139195932e-06, - "long_answer_loss": 0.081, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 2195, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0525, - "grad_norm": 1.3515625, - "learning_rate": 1.6528207193152235e-06, - "long_answer_loss": 0.0525, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 2196, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0649, - "grad_norm": 1.4296875, - "learning_rate": 1.6451366798620888e-06, - "long_answer_loss": 0.0649, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 2197, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0549, - "grad_norm": 1.4140625, - "learning_rate": 1.637469285382713e-06, - "long_answer_loss": 0.0549, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 2198, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0653, - "grad_norm": 1.375, - "learning_rate": 1.6298185476343693e-06, - "long_answer_loss": 0.0653, - "loss": 0.077, - "short_answer_loss": NaN, - "step": 2199, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0654, - "grad_norm": 1.375, - "learning_rate": 1.6221844783487859e-06, - "long_answer_loss": 0.0654, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2200, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0929, - "grad_norm": 1.4609375, - "learning_rate": 1.6145670892321344e-06, - "long_answer_loss": 0.0929, - "loss": 0.0756, - "short_answer_loss": NaN, - "step": 2201, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0737, - "grad_norm": 1.4140625, - "learning_rate": 1.6069663919650077e-06, - "long_answer_loss": 0.0737, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 2202, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0896, - "grad_norm": 1.3828125, - "learning_rate": 1.5993823982024036e-06, - "long_answer_loss": 0.0896, - "loss": 0.0789, - "short_answer_loss": NaN, - "step": 2203, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0711, - "grad_norm": 1.453125, - "learning_rate": 1.5918151195737099e-06, - "long_answer_loss": 0.0711, - "loss": 0.0706, - "short_answer_loss": NaN, - "step": 2204, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0827, - "grad_norm": 1.34375, - "learning_rate": 1.584264567682671e-06, - "long_answer_loss": 0.0827, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 2205, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0703, - "grad_norm": 1.4140625, - "learning_rate": 1.5767307541074015e-06, - "long_answer_loss": 0.0703, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 2206, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0742, - "grad_norm": 1.40625, - "learning_rate": 1.5692136904003298e-06, - "long_answer_loss": 0.0742, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 2207, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0759, - "grad_norm": 1.4609375, - "learning_rate": 1.5617133880882137e-06, - "long_answer_loss": 0.0759, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 2208, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0649, - "grad_norm": 1.2734375, - "learning_rate": 1.554229858672103e-06, - "long_answer_loss": 0.0649, - "loss": 0.0667, - "short_answer_loss": NaN, - "step": 2209, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0697, - "grad_norm": 1.3515625, - "learning_rate": 1.5467631136273294e-06, - "long_answer_loss": 0.0697, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 2210, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0657, - "grad_norm": 1.359375, - "learning_rate": 1.5393131644034885e-06, - "long_answer_loss": 0.0657, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 2211, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0683, - "grad_norm": 1.4453125, - "learning_rate": 1.5318800224244118e-06, - "long_answer_loss": 0.0683, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2212, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0544, - "grad_norm": 1.3125, - "learning_rate": 1.5244636990881758e-06, - "long_answer_loss": 0.0544, - "loss": 0.0644, - "short_answer_loss": NaN, - "step": 2213, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0736, - "grad_norm": 1.3828125, - "learning_rate": 1.5170642057670465e-06, - "long_answer_loss": 0.0736, - "loss": 0.0727, - "short_answer_loss": NaN, - "step": 2214, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.09, - "grad_norm": 1.390625, - "learning_rate": 1.5096815538075043e-06, - "long_answer_loss": 0.09, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 2215, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0683, - "grad_norm": 1.3671875, - "learning_rate": 1.5023157545301854e-06, - "long_answer_loss": 0.0683, - "loss": 0.0706, - "short_answer_loss": NaN, - "step": 2216, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0774, - "grad_norm": 1.390625, - "learning_rate": 1.4949668192298942e-06, - "long_answer_loss": 0.0774, - "loss": 0.0765, - "short_answer_loss": NaN, - "step": 2217, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0556, - "grad_norm": 1.328125, - "learning_rate": 1.487634759175574e-06, - "long_answer_loss": 0.0556, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2218, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0826, - "grad_norm": 1.3671875, - "learning_rate": 1.4803195856102917e-06, - "long_answer_loss": 0.0826, - "loss": 0.0779, - "short_answer_loss": NaN, - "step": 2219, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0693, - "grad_norm": 1.3984375, - "learning_rate": 1.4730213097512213e-06, - "long_answer_loss": 0.0693, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 2220, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0746, - "grad_norm": 1.3828125, - "learning_rate": 1.4657399427896152e-06, - "long_answer_loss": 0.0746, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 2221, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0711, - "grad_norm": 1.484375, - "learning_rate": 1.4584754958908195e-06, - "long_answer_loss": 0.0711, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 2222, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0624, - "grad_norm": 1.375, - "learning_rate": 1.4512279801942099e-06, - "long_answer_loss": 0.0624, - "loss": 0.0689, - "short_answer_loss": NaN, - "step": 2223, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0728, - "grad_norm": 1.4375, - "learning_rate": 1.4439974068132204e-06, - "long_answer_loss": 0.0728, - "loss": 0.0739, - "short_answer_loss": NaN, - "step": 2224, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0729, - "grad_norm": 1.375, - "learning_rate": 1.43678378683529e-06, - "long_answer_loss": 0.0729, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2225, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0759, - "grad_norm": 1.421875, - "learning_rate": 1.4295871313218702e-06, - "long_answer_loss": 0.0759, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 2226, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0684, - "grad_norm": 1.3984375, - "learning_rate": 1.4224074513083983e-06, - "long_answer_loss": 0.0684, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 2227, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0797, - "grad_norm": 1.375, - "learning_rate": 1.415244757804271e-06, - "long_answer_loss": 0.0797, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 2228, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0756, - "grad_norm": 1.359375, - "learning_rate": 1.4080990617928571e-06, - "long_answer_loss": 0.0756, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 2229, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0881, - "grad_norm": 1.4609375, - "learning_rate": 1.4009703742314404e-06, - "long_answer_loss": 0.0881, - "loss": 0.0743, - "short_answer_loss": NaN, - "step": 2230, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0692, - "grad_norm": 1.421875, - "learning_rate": 1.3938587060512417e-06, - "long_answer_loss": 0.0692, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 2231, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0805, - "grad_norm": 1.390625, - "learning_rate": 1.3867640681573687e-06, - "long_answer_loss": 0.0805, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 2232, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0745, - "grad_norm": 1.3515625, - "learning_rate": 1.379686471428826e-06, - "long_answer_loss": 0.0745, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 2233, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0676, - "grad_norm": 1.359375, - "learning_rate": 1.3726259267184807e-06, - "long_answer_loss": 0.0676, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 2234, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0785, - "grad_norm": 1.4296875, - "learning_rate": 1.3655824448530557e-06, - "long_answer_loss": 0.0785, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 2235, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0802, - "grad_norm": 1.546875, - "learning_rate": 1.35855603663311e-06, - "long_answer_loss": 0.0802, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2236, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.068, - "grad_norm": 1.4609375, - "learning_rate": 1.3515467128330115e-06, - "long_answer_loss": 0.068, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 2237, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.061, - "grad_norm": 1.3125, - "learning_rate": 1.3445544842009493e-06, - "long_answer_loss": 0.061, - "loss": 0.0661, - "short_answer_loss": NaN, - "step": 2238, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0749, - "grad_norm": 1.4765625, - "learning_rate": 1.3375793614588794e-06, - "long_answer_loss": 0.0749, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 2239, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0687, - "grad_norm": 1.375, - "learning_rate": 1.3306213553025444e-06, - "long_answer_loss": 0.0687, - "loss": 0.0681, - "short_answer_loss": NaN, - "step": 2240, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0632, - "grad_norm": 1.375, - "learning_rate": 1.323680476401426e-06, - "long_answer_loss": 0.0632, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 2241, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0636, - "grad_norm": 1.4453125, - "learning_rate": 1.3167567353987498e-06, - "long_answer_loss": 0.0636, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 2242, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0698, - "grad_norm": 1.3046875, - "learning_rate": 1.3098501429114618e-06, - "long_answer_loss": 0.0698, - "loss": 0.068, - "short_answer_loss": NaN, - "step": 2243, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0659, - "grad_norm": 1.34375, - "learning_rate": 1.3029607095302112e-06, - "long_answer_loss": 0.0659, - "loss": 0.0665, - "short_answer_loss": NaN, - "step": 2244, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0689, - "grad_norm": 1.3828125, - "learning_rate": 1.296088445819335e-06, - "long_answer_loss": 0.0689, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 2245, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0667, - "grad_norm": 1.3359375, - "learning_rate": 1.2892333623168426e-06, - "long_answer_loss": 0.0667, - "loss": 0.0668, - "short_answer_loss": NaN, - "step": 2246, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0643, - "grad_norm": 1.4375, - "learning_rate": 1.2823954695344005e-06, - "long_answer_loss": 0.0643, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 2247, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0776, - "grad_norm": 1.453125, - "learning_rate": 1.2755747779573099e-06, - "long_answer_loss": 0.0776, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 2248, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0697, - "grad_norm": 1.421875, - "learning_rate": 1.2687712980444994e-06, - "long_answer_loss": 0.0697, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 2249, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0537, - "grad_norm": 1.296875, - "learning_rate": 1.2619850402285054e-06, - "long_answer_loss": 0.0537, - "loss": 0.0644, - "short_answer_loss": NaN, - "step": 2250, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0674, - "grad_norm": 1.3984375, - "learning_rate": 1.255216014915453e-06, - "long_answer_loss": 0.0674, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 2251, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0817, - "grad_norm": 1.3828125, - "learning_rate": 1.2484642324850471e-06, - "long_answer_loss": 0.0817, - "loss": 0.0752, - "short_answer_loss": NaN, - "step": 2252, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0771, - "grad_norm": 1.4765625, - "learning_rate": 1.2417297032905465e-06, - "long_answer_loss": 0.0771, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 2253, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0703, - "grad_norm": 1.3359375, - "learning_rate": 1.23501243765876e-06, - "long_answer_loss": 0.0703, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 2254, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0677, - "grad_norm": 1.5, - "learning_rate": 1.2283124458900202e-06, - "long_answer_loss": 0.0677, - "loss": 0.0731, - "short_answer_loss": NaN, - "step": 2255, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.066, - "grad_norm": 1.4921875, - "learning_rate": 1.2216297382581663e-06, - "long_answer_loss": 0.066, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 2256, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0787, - "grad_norm": 1.453125, - "learning_rate": 1.2149643250105495e-06, - "long_answer_loss": 0.0787, - "loss": 0.076, - "short_answer_loss": NaN, - "step": 2257, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0715, - "grad_norm": 1.390625, - "learning_rate": 1.2083162163679857e-06, - "long_answer_loss": 0.0715, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 2258, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0668, - "grad_norm": 1.375, - "learning_rate": 1.2016854225247633e-06, - "long_answer_loss": 0.0668, - "loss": 0.068, - "short_answer_loss": NaN, - "step": 2259, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0817, - "grad_norm": 1.3359375, - "learning_rate": 1.1950719536486201e-06, - "long_answer_loss": 0.0817, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 2260, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0673, - "grad_norm": 1.4140625, - "learning_rate": 1.1884758198807258e-06, - "long_answer_loss": 0.0673, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2261, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.081, - "grad_norm": 1.4375, - "learning_rate": 1.1818970313356673e-06, - "long_answer_loss": 0.081, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 2262, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0797, - "grad_norm": 1.3984375, - "learning_rate": 1.1753355981014374e-06, - "long_answer_loss": 0.0797, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 2263, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.075, - "grad_norm": 1.390625, - "learning_rate": 1.1687915302394144e-06, - "long_answer_loss": 0.075, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 2264, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0692, - "grad_norm": 1.390625, - "learning_rate": 1.1622648377843437e-06, - "long_answer_loss": 0.0692, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 2265, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0967, - "grad_norm": 1.4140625, - "learning_rate": 1.1557555307443387e-06, - "long_answer_loss": 0.0967, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 2266, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0723, - "grad_norm": 1.4453125, - "learning_rate": 1.1492636191008418e-06, - "long_answer_loss": 0.0723, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 2267, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.066, - "grad_norm": 1.328125, - "learning_rate": 1.142789112808626e-06, - "long_answer_loss": 0.066, - "loss": 0.0751, - "short_answer_loss": NaN, - "step": 2268, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0841, - "grad_norm": 1.34375, - "learning_rate": 1.1363320217957746e-06, - "long_answer_loss": 0.0841, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 2269, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0717, - "grad_norm": 1.3671875, - "learning_rate": 1.1298923559636686e-06, - "long_answer_loss": 0.0717, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2270, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0853, - "grad_norm": 1.4921875, - "learning_rate": 1.1234701251869665e-06, - "long_answer_loss": 0.0853, - "loss": 0.0727, - "short_answer_loss": NaN, - "step": 2271, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0811, - "grad_norm": 1.421875, - "learning_rate": 1.1170653393135847e-06, - "long_answer_loss": 0.0811, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 2272, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0791, - "grad_norm": 1.390625, - "learning_rate": 1.1106780081647075e-06, - "long_answer_loss": 0.0791, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 2273, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0606, - "grad_norm": 1.3359375, - "learning_rate": 1.1043081415347323e-06, - "long_answer_loss": 0.0606, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 2274, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0778, - "grad_norm": 1.453125, - "learning_rate": 1.0979557491912956e-06, - "long_answer_loss": 0.0778, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 2275, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0708, - "grad_norm": 1.390625, - "learning_rate": 1.0916208408752237e-06, - "long_answer_loss": 0.0708, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 2276, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0571, - "grad_norm": 1.3515625, - "learning_rate": 1.085303426300542e-06, - "long_answer_loss": 0.0571, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 2277, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0834, - "grad_norm": 1.375, - "learning_rate": 1.0790035151544447e-06, - "long_answer_loss": 0.0834, - "loss": 0.0685, - "short_answer_loss": NaN, - "step": 2278, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0931, - "grad_norm": 1.453125, - "learning_rate": 1.0727211170972916e-06, - "long_answer_loss": 0.0931, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 2279, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0787, - "grad_norm": 1.5, - "learning_rate": 1.0664562417625853e-06, - "long_answer_loss": 0.0787, - "loss": 0.0743, - "short_answer_loss": NaN, - "step": 2280, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0684, - "grad_norm": 1.390625, - "learning_rate": 1.0602088987569537e-06, - "long_answer_loss": 0.0684, - "loss": 0.0675, - "short_answer_loss": NaN, - "step": 2281, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0831, - "grad_norm": 1.3515625, - "learning_rate": 1.053979097660153e-06, - "long_answer_loss": 0.0831, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 2282, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0878, - "grad_norm": 1.40625, - "learning_rate": 1.0477668480250239e-06, - "long_answer_loss": 0.0878, - "loss": 0.078, - "short_answer_loss": NaN, - "step": 2283, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0787, - "grad_norm": 1.4296875, - "learning_rate": 1.0415721593775101e-06, - "long_answer_loss": 0.0787, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 2284, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0845, - "grad_norm": 1.4609375, - "learning_rate": 1.0353950412166149e-06, - "long_answer_loss": 0.0845, - "loss": 0.0782, - "short_answer_loss": NaN, - "step": 2285, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0677, - "grad_norm": 1.34375, - "learning_rate": 1.0292355030144044e-06, - "long_answer_loss": 0.0677, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 2286, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0702, - "grad_norm": 1.453125, - "learning_rate": 1.0230935542159855e-06, - "long_answer_loss": 0.0702, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 2287, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.068, - "grad_norm": 1.4296875, - "learning_rate": 1.0169692042394957e-06, - "long_answer_loss": 0.068, - "loss": 0.0717, - "short_answer_loss": NaN, - "step": 2288, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0702, - "grad_norm": 1.5078125, - "learning_rate": 1.0108624624760852e-06, - "long_answer_loss": 0.0702, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 2289, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0754, - "grad_norm": 1.421875, - "learning_rate": 1.0047733382898966e-06, - "long_answer_loss": 0.0754, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 2290, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.071, - "grad_norm": 1.3828125, - "learning_rate": 9.987018410180724e-07, - "long_answer_loss": 0.071, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 2291, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0671, - "grad_norm": 1.328125, - "learning_rate": 9.926479799707109e-07, - "long_answer_loss": 0.0671, - "loss": 0.0695, - "short_answer_loss": NaN, - "step": 2292, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0811, - "grad_norm": 1.4296875, - "learning_rate": 9.866117644308754e-07, - "long_answer_loss": 0.0811, - "loss": 0.0699, - "short_answer_loss": NaN, - "step": 2293, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0713, - "grad_norm": 1.3671875, - "learning_rate": 9.805932036545686e-07, - "long_answer_loss": 0.0713, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 2294, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0806, - "grad_norm": 1.3671875, - "learning_rate": 9.745923068707225e-07, - "long_answer_loss": 0.0806, - "loss": 0.0746, - "short_answer_loss": NaN, - "step": 2295, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0974, - "grad_norm": 1.3984375, - "learning_rate": 9.68609083281183e-07, - "long_answer_loss": 0.0974, - "loss": 0.0739, - "short_answer_loss": NaN, - "step": 2296, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0903, - "grad_norm": 1.4140625, - "learning_rate": 9.626435420606913e-07, - "long_answer_loss": 0.0903, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 2297, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0622, - "grad_norm": 1.40625, - "learning_rate": 9.56695692356882e-07, - "long_answer_loss": 0.0622, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2298, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0697, - "grad_norm": 1.390625, - "learning_rate": 9.50765543290251e-07, - "long_answer_loss": 0.0697, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 2299, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0724, - "grad_norm": 1.3515625, - "learning_rate": 9.448531039541672e-07, - "long_answer_loss": 0.0724, - "loss": 0.0739, - "short_answer_loss": NaN, - "step": 2300, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0762, - "grad_norm": 1.328125, - "learning_rate": 9.389583834148244e-07, - "long_answer_loss": 0.0762, - "loss": 0.0678, - "short_answer_loss": NaN, - "step": 2301, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0749, - "grad_norm": 1.375, - "learning_rate": 9.330813907112615e-07, - "long_answer_loss": 0.0749, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 2302, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0785, - "grad_norm": 1.453125, - "learning_rate": 9.272221348553253e-07, - "long_answer_loss": 0.0785, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 2303, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.083, - "grad_norm": 1.40625, - "learning_rate": 9.213806248316664e-07, - "long_answer_loss": 0.083, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 2304, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0761, - "grad_norm": 1.4375, - "learning_rate": 9.155568695977265e-07, - "long_answer_loss": 0.0761, - "loss": 0.067, - "short_answer_loss": NaN, - "step": 2305, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0793, - "grad_norm": 1.40625, - "learning_rate": 9.097508780837177e-07, - "long_answer_loss": 0.0793, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 2306, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.077, - "grad_norm": 1.40625, - "learning_rate": 9.039626591926156e-07, - "long_answer_loss": 0.077, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 2307, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0782, - "grad_norm": 1.3203125, - "learning_rate": 8.981922218001454e-07, - "long_answer_loss": 0.0782, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 2308, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0694, - "grad_norm": 1.4609375, - "learning_rate": 8.924395747547568e-07, - "long_answer_loss": 0.0694, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 2309, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0663, - "grad_norm": 1.3203125, - "learning_rate": 8.867047268776296e-07, - "long_answer_loss": 0.0663, - "loss": 0.0632, - "short_answer_loss": NaN, - "step": 2310, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0783, - "grad_norm": 1.359375, - "learning_rate": 8.809876869626463e-07, - "long_answer_loss": 0.0783, - "loss": 0.0731, - "short_answer_loss": NaN, - "step": 2311, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0762, - "grad_norm": 1.4140625, - "learning_rate": 8.752884637763817e-07, - "long_answer_loss": 0.0762, - "loss": 0.0665, - "short_answer_loss": NaN, - "step": 2312, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0575, - "grad_norm": 1.390625, - "learning_rate": 8.69607066058091e-07, - "long_answer_loss": 0.0575, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 2313, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0759, - "grad_norm": 1.4765625, - "learning_rate": 8.639435025196957e-07, - "long_answer_loss": 0.0759, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2314, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0818, - "grad_norm": 1.453125, - "learning_rate": 8.582977818457696e-07, - "long_answer_loss": 0.0818, - "loss": 0.0755, - "short_answer_loss": NaN, - "step": 2315, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0825, - "grad_norm": 1.4140625, - "learning_rate": 8.526699126935267e-07, - "long_answer_loss": 0.0825, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 2316, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0792, - "grad_norm": 1.4453125, - "learning_rate": 8.470599036928096e-07, - "long_answer_loss": 0.0792, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 2317, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.074, - "grad_norm": 1.390625, - "learning_rate": 8.41467763446066e-07, - "long_answer_loss": 0.074, - "loss": 0.0727, - "short_answer_loss": NaN, - "step": 2318, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0812, - "grad_norm": 1.4296875, - "learning_rate": 8.35893500528355e-07, - "long_answer_loss": 0.0812, - "loss": 0.0783, - "short_answer_loss": NaN, - "step": 2319, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0702, - "grad_norm": 1.5390625, - "learning_rate": 8.303371234873111e-07, - "long_answer_loss": 0.0702, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 2320, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0743, - "grad_norm": 1.390625, - "learning_rate": 8.24798640843151e-07, - "long_answer_loss": 0.0743, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 2321, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.061, - "grad_norm": 1.34375, - "learning_rate": 8.192780610886449e-07, - "long_answer_loss": 0.061, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 2322, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0864, - "grad_norm": 1.5078125, - "learning_rate": 8.137753926891187e-07, - "long_answer_loss": 0.0864, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 2323, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0767, - "grad_norm": 1.390625, - "learning_rate": 8.082906440824253e-07, - "long_answer_loss": 0.0767, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 2324, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0553, - "grad_norm": 1.296875, - "learning_rate": 8.028238236789401e-07, - "long_answer_loss": 0.0553, - "loss": 0.0675, - "short_answer_loss": NaN, - "step": 2325, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0551, - "grad_norm": 1.3515625, - "learning_rate": 7.973749398615546e-07, - "long_answer_loss": 0.0551, - "loss": 0.0699, - "short_answer_loss": NaN, - "step": 2326, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0656, - "grad_norm": 1.3359375, - "learning_rate": 7.919440009856436e-07, - "long_answer_loss": 0.0656, - "loss": 0.0691, - "short_answer_loss": NaN, - "step": 2327, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.069, - "grad_norm": 1.390625, - "learning_rate": 7.8653101537908e-07, - "long_answer_loss": 0.069, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 2328, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0673, - "grad_norm": 1.4140625, - "learning_rate": 7.811359913421939e-07, - "long_answer_loss": 0.0673, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 2329, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0649, - "grad_norm": 1.5, - "learning_rate": 7.757589371477775e-07, - "long_answer_loss": 0.0649, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 2330, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0715, - "grad_norm": 1.421875, - "learning_rate": 7.703998610410712e-07, - "long_answer_loss": 0.0715, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 2331, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0708, - "grad_norm": 1.4140625, - "learning_rate": 7.650587712397419e-07, - "long_answer_loss": 0.0708, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 2332, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0691, - "grad_norm": 1.390625, - "learning_rate": 7.597356759338828e-07, - "long_answer_loss": 0.0691, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 2333, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0673, - "grad_norm": 1.4609375, - "learning_rate": 7.544305832859825e-07, - "long_answer_loss": 0.0673, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 2334, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0868, - "grad_norm": 1.375, - "learning_rate": 7.49143501430942e-07, - "long_answer_loss": 0.0868, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 2335, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0652, - "grad_norm": 1.390625, - "learning_rate": 7.438744384760249e-07, - "long_answer_loss": 0.0652, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 2336, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0782, - "grad_norm": 1.4375, - "learning_rate": 7.38623402500882e-07, - "long_answer_loss": 0.0782, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 2337, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0669, - "grad_norm": 1.3828125, - "learning_rate": 7.333904015575058e-07, - "long_answer_loss": 0.0669, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 2338, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0818, - "grad_norm": 1.3671875, - "learning_rate": 7.28175443670244e-07, - "long_answer_loss": 0.0818, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 2339, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0596, - "grad_norm": 1.375, - "learning_rate": 7.229785368357764e-07, - "long_answer_loss": 0.0596, - "loss": 0.0675, - "short_answer_loss": NaN, - "step": 2340, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0786, - "grad_norm": 1.4453125, - "learning_rate": 7.177996890230937e-07, - "long_answer_loss": 0.0786, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 2341, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0687, - "grad_norm": 1.375, - "learning_rate": 7.126389081735075e-07, - "long_answer_loss": 0.0687, - "loss": 0.0688, - "short_answer_loss": NaN, - "step": 2342, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0766, - "grad_norm": 1.453125, - "learning_rate": 7.074962022006151e-07, - "long_answer_loss": 0.0766, - "loss": 0.0788, - "short_answer_loss": NaN, - "step": 2343, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0589, - "grad_norm": 1.3828125, - "learning_rate": 7.02371578990306e-07, - "long_answer_loss": 0.0589, - "loss": 0.0676, - "short_answer_loss": NaN, - "step": 2344, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.083, - "grad_norm": 1.421875, - "learning_rate": 6.972650464007344e-07, - "long_answer_loss": 0.083, - "loss": 0.0724, - "short_answer_loss": NaN, - "step": 2345, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0804, - "grad_norm": 1.3984375, - "learning_rate": 6.921766122623158e-07, - "long_answer_loss": 0.0804, - "loss": 0.0706, - "short_answer_loss": NaN, - "step": 2346, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.066, - "grad_norm": 1.4296875, - "learning_rate": 6.871062843777157e-07, - "long_answer_loss": 0.066, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 2347, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.085, - "grad_norm": 1.390625, - "learning_rate": 6.820540705218343e-07, - "long_answer_loss": 0.085, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 2348, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0753, - "grad_norm": 1.3671875, - "learning_rate": 6.770199784417966e-07, - "long_answer_loss": 0.0753, - "loss": 0.0766, - "short_answer_loss": NaN, - "step": 2349, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0756, - "grad_norm": 1.40625, - "learning_rate": 6.720040158569322e-07, - "long_answer_loss": 0.0756, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 2350, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0676, - "grad_norm": 1.3828125, - "learning_rate": 6.670061904587826e-07, - "long_answer_loss": 0.0676, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 2351, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0473, - "grad_norm": 1.390625, - "learning_rate": 6.620265099110679e-07, - "long_answer_loss": 0.0473, - "loss": 0.0638, - "short_answer_loss": NaN, - "step": 2352, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0752, - "grad_norm": 1.53125, - "learning_rate": 6.570649818496922e-07, - "long_answer_loss": 0.0752, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 2353, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.067, - "grad_norm": 1.390625, - "learning_rate": 6.521216138827155e-07, - "long_answer_loss": 0.067, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 2354, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0564, - "grad_norm": 1.3515625, - "learning_rate": 6.471964135903578e-07, - "long_answer_loss": 0.0564, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 2355, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0809, - "grad_norm": 1.421875, - "learning_rate": 6.42289388524979e-07, - "long_answer_loss": 0.0809, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 2356, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0749, - "grad_norm": 1.4453125, - "learning_rate": 6.374005462110685e-07, - "long_answer_loss": 0.0749, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 2357, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.061, - "grad_norm": 1.3203125, - "learning_rate": 6.32529894145234e-07, - "long_answer_loss": 0.061, - "loss": 0.0678, - "short_answer_loss": NaN, - "step": 2358, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0737, - "grad_norm": 1.453125, - "learning_rate": 6.276774397961885e-07, - "long_answer_loss": 0.0737, - "loss": 0.0724, - "short_answer_loss": NaN, - "step": 2359, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0871, - "grad_norm": 1.46875, - "learning_rate": 6.228431906047467e-07, - "long_answer_loss": 0.0871, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 2360, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0753, - "grad_norm": 1.8515625, - "learning_rate": 6.180271539837954e-07, - "long_answer_loss": 0.0753, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 2361, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0731, - "grad_norm": 1.3046875, - "learning_rate": 6.132293373183065e-07, - "long_answer_loss": 0.0731, - "loss": 0.0651, - "short_answer_loss": NaN, - "step": 2362, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0698, - "grad_norm": 1.359375, - "learning_rate": 6.084497479653062e-07, - "long_answer_loss": 0.0698, - "loss": 0.0682, - "short_answer_loss": NaN, - "step": 2363, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0755, - "grad_norm": 1.453125, - "learning_rate": 6.03688393253872e-07, - "long_answer_loss": 0.0755, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 2364, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0718, - "grad_norm": 1.3984375, - "learning_rate": 5.989452804851206e-07, - "long_answer_loss": 0.0718, - "loss": 0.0743, - "short_answer_loss": NaN, - "step": 2365, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0754, - "grad_norm": 1.375, - "learning_rate": 5.94220416932198e-07, - "long_answer_loss": 0.0754, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 2366, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0916, - "grad_norm": 1.4140625, - "learning_rate": 5.895138098402628e-07, - "long_answer_loss": 0.0916, - "loss": 0.0777, - "short_answer_loss": NaN, - "step": 2367, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0745, - "grad_norm": 1.3671875, - "learning_rate": 5.848254664264848e-07, - "long_answer_loss": 0.0745, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 2368, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0879, - "grad_norm": 1.3515625, - "learning_rate": 5.801553938800192e-07, - "long_answer_loss": 0.0879, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 2369, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0695, - "grad_norm": 1.40625, - "learning_rate": 5.755035993620137e-07, - "long_answer_loss": 0.0695, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 2370, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0781, - "grad_norm": 1.359375, - "learning_rate": 5.708700900055819e-07, - "long_answer_loss": 0.0781, - "loss": 0.07, - "short_answer_loss": NaN, - "step": 2371, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0877, - "grad_norm": 1.5078125, - "learning_rate": 5.662548729158015e-07, - "long_answer_loss": 0.0877, - "loss": 0.0742, - "short_answer_loss": NaN, - "step": 2372, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0715, - "grad_norm": 1.359375, - "learning_rate": 5.616579551697004e-07, - "long_answer_loss": 0.0715, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 2373, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0649, - "grad_norm": 1.421875, - "learning_rate": 5.570793438162456e-07, - "long_answer_loss": 0.0649, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 2374, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0812, - "grad_norm": 1.46875, - "learning_rate": 5.525190458763332e-07, - "long_answer_loss": 0.0812, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 2375, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0691, - "grad_norm": 1.4609375, - "learning_rate": 5.479770683427768e-07, - "long_answer_loss": 0.0691, - "loss": 0.0775, - "short_answer_loss": NaN, - "step": 2376, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0883, - "grad_norm": 1.40625, - "learning_rate": 5.434534181803008e-07, - "long_answer_loss": 0.0883, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 2377, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0847, - "grad_norm": 1.7421875, - "learning_rate": 5.389481023255149e-07, - "long_answer_loss": 0.0847, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 2378, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0707, - "grad_norm": 1.328125, - "learning_rate": 5.344611276869318e-07, - "long_answer_loss": 0.0707, - "loss": 0.0684, - "short_answer_loss": NaN, - "step": 2379, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0777, - "grad_norm": 1.3984375, - "learning_rate": 5.299925011449269e-07, - "long_answer_loss": 0.0777, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 2380, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0743, - "grad_norm": 1.34375, - "learning_rate": 5.255422295517426e-07, - "long_answer_loss": 0.0743, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 2381, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0812, - "grad_norm": 1.3515625, - "learning_rate": 5.211103197314784e-07, - "long_answer_loss": 0.0812, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2382, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0833, - "grad_norm": 1.40625, - "learning_rate": 5.166967784800774e-07, - "long_answer_loss": 0.0833, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 2383, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0688, - "grad_norm": 1.3671875, - "learning_rate": 5.123016125653163e-07, - "long_answer_loss": 0.0688, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 2384, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0814, - "grad_norm": 1.421875, - "learning_rate": 5.079248287267885e-07, - "long_answer_loss": 0.0814, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 2385, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0697, - "grad_norm": 1.40625, - "learning_rate": 5.035664336759116e-07, - "long_answer_loss": 0.0697, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2386, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0608, - "grad_norm": 1.4296875, - "learning_rate": 4.992264340958924e-07, - "long_answer_loss": 0.0608, - "loss": 0.0678, - "short_answer_loss": NaN, - "step": 2387, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0744, - "grad_norm": 1.421875, - "learning_rate": 4.94904836641745e-07, - "long_answer_loss": 0.0744, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 2388, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0608, - "grad_norm": 1.359375, - "learning_rate": 4.906016479402504e-07, - "long_answer_loss": 0.0608, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 2389, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0828, - "grad_norm": 1.3828125, - "learning_rate": 4.863168745899704e-07, - "long_answer_loss": 0.0828, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2390, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0727, - "grad_norm": 1.3984375, - "learning_rate": 4.820505231612274e-07, - "long_answer_loss": 0.0727, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 2391, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0733, - "grad_norm": 1.3984375, - "learning_rate": 4.778026001960936e-07, - "long_answer_loss": 0.0733, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 2392, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0689, - "grad_norm": 1.4609375, - "learning_rate": 4.73573112208385e-07, - "long_answer_loss": 0.0689, - "loss": 0.0759, - "short_answer_loss": NaN, - "step": 2393, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0918, - "grad_norm": 1.578125, - "learning_rate": 4.693620656836442e-07, - "long_answer_loss": 0.0918, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 2394, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0708, - "grad_norm": 1.3671875, - "learning_rate": 4.6516946707914205e-07, - "long_answer_loss": 0.0708, - "loss": 0.0717, - "short_answer_loss": NaN, - "step": 2395, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.087, - "grad_norm": 1.4765625, - "learning_rate": 4.609953228238553e-07, - "long_answer_loss": 0.087, - "loss": 0.0767, - "short_answer_loss": NaN, - "step": 2396, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0608, - "grad_norm": 1.3828125, - "learning_rate": 4.568396393184696e-07, - "long_answer_loss": 0.0608, - "loss": 0.0669, - "short_answer_loss": NaN, - "step": 2397, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.07, - "grad_norm": 1.3671875, - "learning_rate": 4.527024229353541e-07, - "long_answer_loss": 0.07, - "loss": 0.0699, - "short_answer_loss": NaN, - "step": 2398, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0613, - "grad_norm": 1.328125, - "learning_rate": 4.485836800185661e-07, - "long_answer_loss": 0.0613, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 2399, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0643, - "grad_norm": 1.3359375, - "learning_rate": 4.444834168838355e-07, - "long_answer_loss": 0.0643, - "loss": 0.0688, - "short_answer_loss": NaN, - "step": 2400, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0703, - "grad_norm": 1.3828125, - "learning_rate": 4.4040163981855095e-07, - "long_answer_loss": 0.0703, - "loss": 0.0666, - "short_answer_loss": NaN, - "step": 2401, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0727, - "grad_norm": 1.5078125, - "learning_rate": 4.3633835508175987e-07, - "long_answer_loss": 0.0727, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 2402, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0771, - "grad_norm": 1.4140625, - "learning_rate": 4.322935689041449e-07, - "long_answer_loss": 0.0771, - "loss": 0.0774, - "short_answer_loss": NaN, - "step": 2403, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0807, - "grad_norm": 1.4296875, - "learning_rate": 4.2826728748803504e-07, - "long_answer_loss": 0.0807, - "loss": 0.0731, - "short_answer_loss": NaN, - "step": 2404, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0996, - "grad_norm": 1.296875, - "learning_rate": 4.242595170073735e-07, - "long_answer_loss": 0.0996, - "loss": 0.0681, - "short_answer_loss": NaN, - "step": 2405, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0733, - "grad_norm": 1.453125, - "learning_rate": 4.2027026360772215e-07, - "long_answer_loss": 0.0733, - "loss": 0.0674, - "short_answer_loss": NaN, - "step": 2406, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0721, - "grad_norm": 1.3203125, - "learning_rate": 4.162995334062489e-07, - "long_answer_loss": 0.0721, - "loss": 0.0672, - "short_answer_loss": NaN, - "step": 2407, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0816, - "grad_norm": 1.40625, - "learning_rate": 4.1234733249171794e-07, - "long_answer_loss": 0.0816, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 2408, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0664, - "grad_norm": 1.390625, - "learning_rate": 4.084136669244801e-07, - "long_answer_loss": 0.0664, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 2409, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0903, - "grad_norm": 1.4375, - "learning_rate": 4.044985427364645e-07, - "long_answer_loss": 0.0903, - "loss": 0.0765, - "short_answer_loss": NaN, - "step": 2410, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0713, - "grad_norm": 1.375, - "learning_rate": 4.0060196593116747e-07, - "long_answer_loss": 0.0713, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 2411, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0668, - "grad_norm": 1.4375, - "learning_rate": 3.9672394248364414e-07, - "long_answer_loss": 0.0668, - "loss": 0.0694, - "short_answer_loss": NaN, - "step": 2412, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.065, - "grad_norm": 1.34375, - "learning_rate": 3.9286447834050304e-07, - "long_answer_loss": 0.065, - "loss": 0.0666, - "short_answer_loss": NaN, - "step": 2413, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0767, - "grad_norm": 1.421875, - "learning_rate": 3.890235794198907e-07, - "long_answer_loss": 0.0767, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 2414, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.069, - "grad_norm": 1.4765625, - "learning_rate": 3.8520125161148475e-07, - "long_answer_loss": 0.069, - "loss": 0.0803, - "short_answer_loss": NaN, - "step": 2415, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0714, - "grad_norm": 1.4375, - "learning_rate": 3.8139750077648834e-07, - "long_answer_loss": 0.0714, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 2416, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.083, - "grad_norm": 1.4609375, - "learning_rate": 3.7761233274761774e-07, - "long_answer_loss": 0.083, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 2417, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0671, - "grad_norm": 1.390625, - "learning_rate": 3.738457533290926e-07, - "long_answer_loss": 0.0671, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 2418, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.067, - "grad_norm": 1.46875, - "learning_rate": 3.7009776829663027e-07, - "long_answer_loss": 0.067, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 2419, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0779, - "grad_norm": 1.3671875, - "learning_rate": 3.663683833974349e-07, - "long_answer_loss": 0.0779, - "loss": 0.0687, - "short_answer_loss": NaN, - "step": 2420, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0607, - "grad_norm": 1.34375, - "learning_rate": 3.626576043501889e-07, - "long_answer_loss": 0.0607, - "loss": 0.0706, - "short_answer_loss": NaN, - "step": 2421, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0726, - "grad_norm": 1.4453125, - "learning_rate": 3.5896543684504205e-07, - "long_answer_loss": 0.0726, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 2422, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0784, - "grad_norm": 1.3671875, - "learning_rate": 3.5529188654361e-07, - "long_answer_loss": 0.0784, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 2423, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0727, - "grad_norm": 1.34375, - "learning_rate": 3.5163695907895477e-07, - "long_answer_loss": 0.0727, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 2424, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0809, - "grad_norm": 1.359375, - "learning_rate": 3.480006600555849e-07, - "long_answer_loss": 0.0809, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 2425, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0754, - "grad_norm": 1.453125, - "learning_rate": 3.4438299504944563e-07, - "long_answer_loss": 0.0754, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2426, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0632, - "grad_norm": 1.5234375, - "learning_rate": 3.4078396960790656e-07, - "long_answer_loss": 0.0632, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 2427, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0809, - "grad_norm": 1.3984375, - "learning_rate": 3.37203589249753e-07, - "long_answer_loss": 0.0809, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 2428, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0755, - "grad_norm": 1.375, - "learning_rate": 3.3364185946518217e-07, - "long_answer_loss": 0.0755, - "loss": 0.0742, - "short_answer_loss": NaN, - "step": 2429, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.068, - "grad_norm": 1.359375, - "learning_rate": 3.3009878571579473e-07, - "long_answer_loss": 0.068, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 2430, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0681, - "grad_norm": 1.3203125, - "learning_rate": 3.26574373434578e-07, - "long_answer_loss": 0.0681, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 2431, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0864, - "grad_norm": 1.484375, - "learning_rate": 3.2306862802590904e-07, - "long_answer_loss": 0.0864, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 2432, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0857, - "grad_norm": 1.3984375, - "learning_rate": 3.195815548655376e-07, - "long_answer_loss": 0.0857, - "loss": 0.0748, - "short_answer_loss": NaN, - "step": 2433, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0886, - "grad_norm": 1.328125, - "learning_rate": 3.1611315930058225e-07, - "long_answer_loss": 0.0886, - "loss": 0.0727, - "short_answer_loss": NaN, - "step": 2434, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0747, - "grad_norm": 1.390625, - "learning_rate": 3.126634466495207e-07, - "long_answer_loss": 0.0747, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 2435, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0843, - "grad_norm": 1.3828125, - "learning_rate": 3.092324222021825e-07, - "long_answer_loss": 0.0843, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 2436, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0685, - "grad_norm": 1.46875, - "learning_rate": 3.05820091219744e-07, - "long_answer_loss": 0.0685, - "loss": 0.0766, - "short_answer_loss": NaN, - "step": 2437, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0817, - "grad_norm": 1.484375, - "learning_rate": 3.0242645893470563e-07, - "long_answer_loss": 0.0817, - "loss": 0.0749, - "short_answer_loss": NaN, - "step": 2438, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0703, - "grad_norm": 1.3671875, - "learning_rate": 2.990515305509117e-07, - "long_answer_loss": 0.0703, - "loss": 0.0689, - "short_answer_loss": NaN, - "step": 2439, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0671, - "grad_norm": 1.3984375, - "learning_rate": 2.9569531124350876e-07, - "long_answer_loss": 0.0671, - "loss": 0.0695, - "short_answer_loss": NaN, - "step": 2440, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0762, - "grad_norm": 1.4375, - "learning_rate": 2.923578061589688e-07, - "long_answer_loss": 0.0762, - "loss": 0.0695, - "short_answer_loss": NaN, - "step": 2441, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0845, - "grad_norm": 1.46875, - "learning_rate": 2.890390204150564e-07, - "long_answer_loss": 0.0845, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 2442, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0726, - "grad_norm": 1.3671875, - "learning_rate": 2.857389591008383e-07, - "long_answer_loss": 0.0726, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 2443, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0583, - "grad_norm": 1.515625, - "learning_rate": 2.824576272766666e-07, - "long_answer_loss": 0.0583, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 2444, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0842, - "grad_norm": 1.40625, - "learning_rate": 2.791950299741747e-07, - "long_answer_loss": 0.0842, - "loss": 0.0704, - "short_answer_loss": NaN, - "step": 2445, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0642, - "grad_norm": 1.4140625, - "learning_rate": 2.7595117219626626e-07, - "long_answer_loss": 0.0642, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2446, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0612, - "grad_norm": 1.375, - "learning_rate": 2.727260589171096e-07, - "long_answer_loss": 0.0612, - "loss": 0.0748, - "short_answer_loss": NaN, - "step": 2447, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0754, - "grad_norm": 1.4453125, - "learning_rate": 2.6951969508213355e-07, - "long_answer_loss": 0.0754, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 2448, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0708, - "grad_norm": 1.421875, - "learning_rate": 2.6633208560800927e-07, - "long_answer_loss": 0.0708, - "loss": 0.0717, - "short_answer_loss": NaN, - "step": 2449, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0651, - "grad_norm": 1.421875, - "learning_rate": 2.631632353826602e-07, - "long_answer_loss": 0.0651, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2450, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0677, - "grad_norm": 1.453125, - "learning_rate": 2.600131492652341e-07, - "long_answer_loss": 0.0677, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 2451, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0875, - "grad_norm": 1.328125, - "learning_rate": 2.568818320861102e-07, - "long_answer_loss": 0.0875, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 2452, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0776, - "grad_norm": 1.3984375, - "learning_rate": 2.5376928864688927e-07, - "long_answer_loss": 0.0776, - "loss": 0.076, - "short_answer_loss": NaN, - "step": 2453, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.082, - "grad_norm": 1.484375, - "learning_rate": 2.50675523720377e-07, - "long_answer_loss": 0.082, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2454, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0796, - "grad_norm": 1.359375, - "learning_rate": 2.476005420505925e-07, - "long_answer_loss": 0.0796, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 2455, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0673, - "grad_norm": 1.4140625, - "learning_rate": 2.4454434835274596e-07, - "long_answer_loss": 0.0673, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2456, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0753, - "grad_norm": 1.5, - "learning_rate": 2.4150694731324283e-07, - "long_answer_loss": 0.0753, - "loss": 0.0739, - "short_answer_loss": NaN, - "step": 2457, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0778, - "grad_norm": 1.390625, - "learning_rate": 2.3848834358966705e-07, - "long_answer_loss": 0.0778, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 2458, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0652, - "grad_norm": 1.4453125, - "learning_rate": 2.3548854181078272e-07, - "long_answer_loss": 0.0652, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 2459, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0699, - "grad_norm": 1.4140625, - "learning_rate": 2.3250754657651863e-07, - "long_answer_loss": 0.0699, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2460, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.071, - "grad_norm": 1.4765625, - "learning_rate": 2.2954536245796827e-07, - "long_answer_loss": 0.071, - "loss": 0.0756, - "short_answer_loss": NaN, - "step": 2461, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0692, - "grad_norm": 1.421875, - "learning_rate": 2.2660199399738014e-07, - "long_answer_loss": 0.0692, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 2462, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0698, - "grad_norm": 1.390625, - "learning_rate": 2.2367744570814808e-07, - "long_answer_loss": 0.0698, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 2463, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0823, - "grad_norm": 1.34375, - "learning_rate": 2.2077172207481123e-07, - "long_answer_loss": 0.0823, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 2464, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0684, - "grad_norm": 1.4296875, - "learning_rate": 2.1788482755303734e-07, - "long_answer_loss": 0.0684, - "loss": 0.0763, - "short_answer_loss": NaN, - "step": 2465, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0684, - "grad_norm": 1.4140625, - "learning_rate": 2.1501676656962428e-07, - "long_answer_loss": 0.0684, - "loss": 0.0717, - "short_answer_loss": NaN, - "step": 2466, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0703, - "grad_norm": 1.390625, - "learning_rate": 2.1216754352249151e-07, - "long_answer_loss": 0.0703, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2467, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0848, - "grad_norm": 1.359375, - "learning_rate": 2.093371627806706e-07, - "long_answer_loss": 0.0848, - "loss": 0.0742, - "short_answer_loss": NaN, - "step": 2468, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0688, - "grad_norm": 1.3984375, - "learning_rate": 2.0652562868429953e-07, - "long_answer_loss": 0.0688, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 2469, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0767, - "grad_norm": 1.4140625, - "learning_rate": 2.0373294554461715e-07, - "long_answer_loss": 0.0767, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2470, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.074, - "grad_norm": 1.515625, - "learning_rate": 2.0095911764395764e-07, - "long_answer_loss": 0.074, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 2471, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0531, - "grad_norm": 1.375, - "learning_rate": 1.9820414923574087e-07, - "long_answer_loss": 0.0531, - "loss": 0.0712, - "short_answer_loss": NaN, - "step": 2472, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0651, - "grad_norm": 1.4296875, - "learning_rate": 1.9546804454446676e-07, - "long_answer_loss": 0.0651, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 2473, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0795, - "grad_norm": 1.3984375, - "learning_rate": 1.9275080776570976e-07, - "long_answer_loss": 0.0795, - "loss": 0.0705, - "short_answer_loss": NaN, - "step": 2474, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0687, - "grad_norm": 1.3984375, - "learning_rate": 1.9005244306611185e-07, - "long_answer_loss": 0.0687, - "loss": 0.0745, - "short_answer_loss": NaN, - "step": 2475, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0729, - "grad_norm": 1.4140625, - "learning_rate": 1.8737295458337855e-07, - "long_answer_loss": 0.0729, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 2476, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.064, - "grad_norm": 1.3828125, - "learning_rate": 1.84712346426269e-07, - "long_answer_loss": 0.064, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 2477, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0605, - "grad_norm": 1.328125, - "learning_rate": 1.8207062267458775e-07, - "long_answer_loss": 0.0605, - "loss": 0.0688, - "short_answer_loss": NaN, - "step": 2478, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0823, - "grad_norm": 1.359375, - "learning_rate": 1.7944778737918748e-07, - "long_answer_loss": 0.0823, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 2479, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0835, - "grad_norm": 1.484375, - "learning_rate": 1.7684384456195385e-07, - "long_answer_loss": 0.0835, - "loss": 0.0779, - "short_answer_loss": NaN, - "step": 2480, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.068, - "grad_norm": 1.4296875, - "learning_rate": 1.7425879821580394e-07, - "long_answer_loss": 0.068, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 2481, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0701, - "grad_norm": 1.40625, - "learning_rate": 1.7169265230467525e-07, - "long_answer_loss": 0.0701, - "loss": 0.0699, - "short_answer_loss": NaN, - "step": 2482, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0749, - "grad_norm": 1.4375, - "learning_rate": 1.6914541076352847e-07, - "long_answer_loss": 0.0749, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 2483, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0687, - "grad_norm": 1.3828125, - "learning_rate": 1.6661707749833082e-07, - "long_answer_loss": 0.0687, - "loss": 0.0673, - "short_answer_loss": NaN, - "step": 2484, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0806, - "grad_norm": 1.3828125, - "learning_rate": 1.6410765638606023e-07, - "long_answer_loss": 0.0806, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2485, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0754, - "grad_norm": 1.375, - "learning_rate": 1.616171512746914e-07, - "long_answer_loss": 0.0754, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 2486, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0975, - "grad_norm": 1.4375, - "learning_rate": 1.5914556598319307e-07, - "long_answer_loss": 0.0975, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 2487, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0758, - "grad_norm": 1.4375, - "learning_rate": 1.5669290430152388e-07, - "long_answer_loss": 0.0758, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 2488, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0888, - "grad_norm": 1.453125, - "learning_rate": 1.5425916999062402e-07, - "long_answer_loss": 0.0888, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 2489, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.073, - "grad_norm": 1.4296875, - "learning_rate": 1.5184436678241097e-07, - "long_answer_loss": 0.073, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 2490, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0751, - "grad_norm": 1.4765625, - "learning_rate": 1.4944849837976726e-07, - "long_answer_loss": 0.0751, - "loss": 0.0778, - "short_answer_loss": NaN, - "step": 2491, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0552, - "grad_norm": 1.328125, - "learning_rate": 1.4707156845655267e-07, - "long_answer_loss": 0.0552, - "loss": 0.0731, - "short_answer_loss": NaN, - "step": 2492, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0773, - "grad_norm": 1.375, - "learning_rate": 1.447135806575725e-07, - "long_answer_loss": 0.0773, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 2493, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0629, - "grad_norm": 1.375, - "learning_rate": 1.4237453859859696e-07, - "long_answer_loss": 0.0629, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 2494, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0669, - "grad_norm": 1.5859375, - "learning_rate": 1.4005444586633886e-07, - "long_answer_loss": 0.0669, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 2495, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0683, - "grad_norm": 1.3515625, - "learning_rate": 1.377533060184552e-07, - "long_answer_loss": 0.0683, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 2496, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0654, - "grad_norm": 1.5078125, - "learning_rate": 1.3547112258354143e-07, - "long_answer_loss": 0.0654, - "loss": 0.0751, - "short_answer_loss": NaN, - "step": 2497, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0653, - "grad_norm": 1.3515625, - "learning_rate": 1.3320789906112186e-07, - "long_answer_loss": 0.0653, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 2498, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0696, - "grad_norm": 1.3203125, - "learning_rate": 1.309636389216537e-07, - "long_answer_loss": 0.0696, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 2499, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0706, - "grad_norm": 1.4921875, - "learning_rate": 1.2873834560650778e-07, - "long_answer_loss": 0.0706, - "loss": 0.0761, - "short_answer_loss": NaN, - "step": 2500, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.068, - "grad_norm": 1.40625, - "learning_rate": 1.2653202252797815e-07, - "long_answer_loss": 0.068, - "loss": 0.0756, - "short_answer_loss": NaN, - "step": 2501, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.087, - "grad_norm": 1.5390625, - "learning_rate": 1.2434467306926405e-07, - "long_answer_loss": 0.087, - "loss": 0.0823, - "short_answer_loss": NaN, - "step": 2502, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0638, - "grad_norm": 1.453125, - "learning_rate": 1.2217630058447282e-07, - "long_answer_loss": 0.0638, - "loss": 0.0696, - "short_answer_loss": NaN, - "step": 2503, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0692, - "grad_norm": 1.3515625, - "learning_rate": 1.2002690839861276e-07, - "long_answer_loss": 0.0692, - "loss": 0.0695, - "short_answer_loss": NaN, - "step": 2504, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0725, - "grad_norm": 1.359375, - "learning_rate": 1.1789649980758627e-07, - "long_answer_loss": 0.0725, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 2505, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0665, - "grad_norm": 1.359375, - "learning_rate": 1.1578507807818717e-07, - "long_answer_loss": 0.0665, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 2506, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0819, - "grad_norm": 1.390625, - "learning_rate": 1.1369264644809363e-07, - "long_answer_loss": 0.0819, - "loss": 0.0767, - "short_answer_loss": NaN, - "step": 2507, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.071, - "grad_norm": 1.40625, - "learning_rate": 1.1161920812586546e-07, - "long_answer_loss": 0.071, - "loss": 0.0729, - "short_answer_loss": NaN, - "step": 2508, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0788, - "grad_norm": 1.4375, - "learning_rate": 1.0956476629093438e-07, - "long_answer_loss": 0.0788, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 2509, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0744, - "grad_norm": 1.390625, - "learning_rate": 1.0752932409360955e-07, - "long_answer_loss": 0.0744, - "loss": 0.0698, - "short_answer_loss": NaN, - "step": 2510, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0751, - "grad_norm": 1.390625, - "learning_rate": 1.055128846550596e-07, - "long_answer_loss": 0.0751, - "loss": 0.0727, - "short_answer_loss": NaN, - "step": 2511, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0758, - "grad_norm": 1.390625, - "learning_rate": 1.0351545106731669e-07, - "long_answer_loss": 0.0758, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 2512, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0892, - "grad_norm": 1.40625, - "learning_rate": 1.015370263932669e-07, - "long_answer_loss": 0.0892, - "loss": 0.0792, - "short_answer_loss": NaN, - "step": 2513, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0743, - "grad_norm": 1.484375, - "learning_rate": 9.957761366665292e-08, - "long_answer_loss": 0.0743, - "loss": 0.0717, - "short_answer_loss": NaN, - "step": 2514, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0696, - "grad_norm": 1.40625, - "learning_rate": 9.763721589205882e-08, - "long_answer_loss": 0.0696, - "loss": 0.066, - "short_answer_loss": NaN, - "step": 2515, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.069, - "grad_norm": 1.4296875, - "learning_rate": 9.571583604491286e-08, - "long_answer_loss": 0.069, - "loss": 0.0772, - "short_answer_loss": NaN, - "step": 2516, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0715, - "grad_norm": 1.3203125, - "learning_rate": 9.381347707148325e-08, - "long_answer_loss": 0.0715, - "loss": 0.0709, - "short_answer_loss": NaN, - "step": 2517, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0694, - "grad_norm": 1.421875, - "learning_rate": 9.193014188886712e-08, - "long_answer_loss": 0.0694, - "loss": 0.0692, - "short_answer_loss": NaN, - "step": 2518, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0599, - "grad_norm": 1.4296875, - "learning_rate": 9.006583338499463e-08, - "long_answer_loss": 0.0599, - "loss": 0.0707, - "short_answer_loss": NaN, - "step": 2519, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0666, - "grad_norm": 1.375, - "learning_rate": 8.822055441861515e-08, - "long_answer_loss": 0.0666, - "loss": 0.0699, - "short_answer_loss": NaN, - "step": 2520, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0722, - "grad_norm": 1.421875, - "learning_rate": 8.639430781930413e-08, - "long_answer_loss": 0.0722, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 2521, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0703, - "grad_norm": 1.3515625, - "learning_rate": 8.458709638744788e-08, - "long_answer_loss": 0.0703, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 2522, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0777, - "grad_norm": 1.421875, - "learning_rate": 8.279892289424635e-08, - "long_answer_loss": 0.0777, - "loss": 0.0792, - "short_answer_loss": NaN, - "step": 2523, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.072, - "grad_norm": 1.375, - "learning_rate": 8.102979008170474e-08, - "long_answer_loss": 0.072, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 2524, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0821, - "grad_norm": 1.6015625, - "learning_rate": 7.927970066263085e-08, - "long_answer_loss": 0.0821, - "loss": 0.0775, - "short_answer_loss": NaN, - "step": 2525, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0658, - "grad_norm": 1.3671875, - "learning_rate": 7.754865732063493e-08, - "long_answer_loss": 0.0658, - "loss": 0.0671, - "short_answer_loss": NaN, - "step": 2526, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0581, - "grad_norm": 1.390625, - "learning_rate": 7.58366627101173e-08, - "long_answer_loss": 0.0581, - "loss": 0.0721, - "short_answer_loss": NaN, - "step": 2527, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0652, - "grad_norm": 1.421875, - "learning_rate": 7.41437194562697e-08, - "long_answer_loss": 0.0652, - "loss": 0.0677, - "short_answer_loss": NaN, - "step": 2528, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0712, - "grad_norm": 1.375, - "learning_rate": 7.246983015507247e-08, - "long_answer_loss": 0.0712, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 2529, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0765, - "grad_norm": 1.3984375, - "learning_rate": 7.081499737328634e-08, - "long_answer_loss": 0.0765, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 2530, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0742, - "grad_norm": 1.421875, - "learning_rate": 6.917922364845092e-08, - "long_answer_loss": 0.0742, - "loss": 0.0702, - "short_answer_loss": NaN, - "step": 2531, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0863, - "grad_norm": 1.390625, - "learning_rate": 6.75625114888806e-08, - "long_answer_loss": 0.0863, - "loss": 0.0719, - "short_answer_loss": NaN, - "step": 2532, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0711, - "grad_norm": 1.4140625, - "learning_rate": 6.596486337366176e-08, - "long_answer_loss": 0.0711, - "loss": 0.0778, - "short_answer_loss": NaN, - "step": 2533, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0877, - "grad_norm": 1.5078125, - "learning_rate": 6.438628175264582e-08, - "long_answer_loss": 0.0877, - "loss": 0.0787, - "short_answer_loss": NaN, - "step": 2534, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0691, - "grad_norm": 1.3359375, - "learning_rate": 6.282676904644652e-08, - "long_answer_loss": 0.0691, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 2535, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.071, - "grad_norm": 1.421875, - "learning_rate": 6.12863276464426e-08, - "long_answer_loss": 0.071, - "loss": 0.0741, - "short_answer_loss": NaN, - "step": 2536, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0685, - "grad_norm": 1.453125, - "learning_rate": 5.976495991476121e-08, - "long_answer_loss": 0.0685, - "loss": 0.0718, - "short_answer_loss": NaN, - "step": 2537, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.07, - "grad_norm": 1.4296875, - "learning_rate": 5.826266818428766e-08, - "long_answer_loss": 0.07, - "loss": 0.0784, - "short_answer_loss": NaN, - "step": 2538, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0773, - "grad_norm": 1.359375, - "learning_rate": 5.6779454758652816e-08, - "long_answer_loss": 0.0773, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 2539, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0702, - "grad_norm": 1.375, - "learning_rate": 5.531532191223321e-08, - "long_answer_loss": 0.0702, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 2540, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0625, - "grad_norm": 1.3203125, - "learning_rate": 5.3870271890146814e-08, - "long_answer_loss": 0.0625, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2541, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0771, - "grad_norm": 1.3515625, - "learning_rate": 5.244430690825031e-08, - "long_answer_loss": 0.0771, - "loss": 0.0676, - "short_answer_loss": NaN, - "step": 2542, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0726, - "grad_norm": 1.3515625, - "learning_rate": 5.103742915313764e-08, - "long_answer_loss": 0.0726, - "loss": 0.07, - "short_answer_loss": NaN, - "step": 2543, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0756, - "grad_norm": 1.4453125, - "learning_rate": 4.964964078212619e-08, - "long_answer_loss": 0.0756, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 2544, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0728, - "grad_norm": 1.3671875, - "learning_rate": 4.828094392327204e-08, - "long_answer_loss": 0.0728, - "loss": 0.071, - "short_answer_loss": NaN, - "step": 2545, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0792, - "grad_norm": 1.4140625, - "learning_rate": 4.6931340675347714e-08, - "long_answer_loss": 0.0792, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2546, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.05, - "grad_norm": 1.3671875, - "learning_rate": 4.560083310785196e-08, - "long_answer_loss": 0.05, - "loss": 0.0674, - "short_answer_loss": NaN, - "step": 2547, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.077, - "grad_norm": 1.3359375, - "learning_rate": 4.4289423260999994e-08, - "long_answer_loss": 0.077, - "loss": 0.0675, - "short_answer_loss": NaN, - "step": 2548, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0671, - "grad_norm": 1.46875, - "learning_rate": 4.299711314572352e-08, - "long_answer_loss": 0.0671, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 2549, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0806, - "grad_norm": 1.4375, - "learning_rate": 4.172390474366517e-08, - "long_answer_loss": 0.0806, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 2550, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0705, - "grad_norm": 1.34375, - "learning_rate": 4.0469800007177096e-08, - "long_answer_loss": 0.0705, - "loss": 0.0666, - "short_answer_loss": NaN, - "step": 2551, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0857, - "grad_norm": 1.3984375, - "learning_rate": 3.923480085931963e-08, - "long_answer_loss": 0.0857, - "loss": 0.0774, - "short_answer_loss": NaN, - "step": 2552, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0755, - "grad_norm": 1.359375, - "learning_rate": 3.8018909193854315e-08, - "long_answer_loss": 0.0755, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 2553, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0735, - "grad_norm": 1.28125, - "learning_rate": 3.6822126875242504e-08, - "long_answer_loss": 0.0735, - "loss": 0.0676, - "short_answer_loss": NaN, - "step": 2554, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0694, - "grad_norm": 1.375, - "learning_rate": 3.564445573864539e-08, - "long_answer_loss": 0.0694, - "loss": 0.0717, - "short_answer_loss": NaN, - "step": 2555, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0679, - "grad_norm": 1.46875, - "learning_rate": 3.448589758991705e-08, - "long_answer_loss": 0.0679, - "loss": 0.074, - "short_answer_loss": NaN, - "step": 2556, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0666, - "grad_norm": 1.3984375, - "learning_rate": 3.334645420560445e-08, - "long_answer_loss": 0.0666, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 2557, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0819, - "grad_norm": 1.3203125, - "learning_rate": 3.222612733294189e-08, - "long_answer_loss": 0.0819, - "loss": 0.0725, - "short_answer_loss": NaN, - "step": 2558, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0688, - "grad_norm": 1.3671875, - "learning_rate": 3.112491868985379e-08, - "long_answer_loss": 0.0688, - "loss": 0.0689, - "short_answer_loss": NaN, - "step": 2559, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0737, - "grad_norm": 1.3671875, - "learning_rate": 3.004282996494495e-08, - "long_answer_loss": 0.0737, - "loss": 0.0754, - "short_answer_loss": NaN, - "step": 2560, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0735, - "grad_norm": 1.53125, - "learning_rate": 2.8979862817503368e-08, - "long_answer_loss": 0.0735, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 2561, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0896, - "grad_norm": 1.453125, - "learning_rate": 2.793601887749464e-08, - "long_answer_loss": 0.0896, - "loss": 0.0751, - "short_answer_loss": NaN, - "step": 2562, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0798, - "grad_norm": 1.4765625, - "learning_rate": 2.6911299745562e-08, - "long_answer_loss": 0.0798, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 2563, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0881, - "grad_norm": 1.359375, - "learning_rate": 2.590570699302214e-08, - "long_answer_loss": 0.0881, - "loss": 0.0666, - "short_answer_loss": NaN, - "step": 2564, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0644, - "grad_norm": 1.421875, - "learning_rate": 2.4919242161859646e-08, - "long_answer_loss": 0.0644, - "loss": 0.073, - "short_answer_loss": NaN, - "step": 2565, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0725, - "grad_norm": 1.375, - "learning_rate": 2.3951906764735353e-08, - "long_answer_loss": 0.0725, - "loss": 0.0693, - "short_answer_loss": NaN, - "step": 2566, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0635, - "grad_norm": 1.3828125, - "learning_rate": 2.3003702284969676e-08, - "long_answer_loss": 0.0635, - "loss": 0.0701, - "short_answer_loss": NaN, - "step": 2567, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0625, - "grad_norm": 1.3984375, - "learning_rate": 2.2074630176550927e-08, - "long_answer_loss": 0.0625, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 2568, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0781, - "grad_norm": 1.4765625, - "learning_rate": 2.1164691864129783e-08, - "long_answer_loss": 0.0781, - "loss": 0.0713, - "short_answer_loss": NaN, - "step": 2569, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.07, - "grad_norm": 1.34375, - "learning_rate": 2.02738887430165e-08, - "long_answer_loss": 0.07, - "loss": 0.0728, - "short_answer_loss": NaN, - "step": 2570, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0656, - "grad_norm": 1.3359375, - "learning_rate": 1.9402222179178142e-08, - "long_answer_loss": 0.0656, - "loss": 0.0675, - "short_answer_loss": NaN, - "step": 2571, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0539, - "grad_norm": 1.5078125, - "learning_rate": 1.8549693509238576e-08, - "long_answer_loss": 0.0539, - "loss": 0.0737, - "short_answer_loss": NaN, - "step": 2572, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0661, - "grad_norm": 1.4140625, - "learning_rate": 1.7716304040475697e-08, - "long_answer_loss": 0.0661, - "loss": 0.0738, - "short_answer_loss": NaN, - "step": 2573, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.074, - "grad_norm": 1.3828125, - "learning_rate": 1.6902055050817268e-08, - "long_answer_loss": 0.074, - "loss": 0.0703, - "short_answer_loss": NaN, - "step": 2574, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0855, - "grad_norm": 3.296875, - "learning_rate": 1.6106947788845082e-08, - "long_answer_loss": 0.0855, - "loss": 0.0715, - "short_answer_loss": NaN, - "step": 2575, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.059, - "grad_norm": 1.3828125, - "learning_rate": 1.533098347378109e-08, - "long_answer_loss": 0.059, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 2576, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0748, - "grad_norm": 1.3984375, - "learning_rate": 1.4574163295502652e-08, - "long_answer_loss": 0.0748, - "loss": 0.0681, - "short_answer_loss": NaN, - "step": 2577, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0721, - "grad_norm": 1.40625, - "learning_rate": 1.3836488414524507e-08, - "long_answer_loss": 0.0721, - "loss": 0.069, - "short_answer_loss": NaN, - "step": 2578, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0746, - "grad_norm": 1.390625, - "learning_rate": 1.3117959962005711e-08, - "long_answer_loss": 0.0746, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 2579, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0715, - "grad_norm": 1.421875, - "learning_rate": 1.2418579039746859e-08, - "long_answer_loss": 0.0715, - "loss": 0.0726, - "short_answer_loss": NaN, - "step": 2580, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0867, - "grad_norm": 1.3671875, - "learning_rate": 1.1738346720185922e-08, - "long_answer_loss": 0.0867, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2581, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0794, - "grad_norm": 1.484375, - "learning_rate": 1.1077264046399638e-08, - "long_answer_loss": 0.0794, - "loss": 0.0717, - "short_answer_loss": NaN, - "step": 2582, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0711, - "grad_norm": 1.4453125, - "learning_rate": 1.0435332032100731e-08, - "long_answer_loss": 0.0711, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2583, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0633, - "grad_norm": 1.3828125, - "learning_rate": 9.812551661633751e-09, - "long_answer_loss": 0.0633, - "loss": 0.0708, - "short_answer_loss": NaN, - "step": 2584, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0695, - "grad_norm": 1.375, - "learning_rate": 9.208923889979237e-09, - "long_answer_loss": 0.0695, - "loss": 0.0687, - "short_answer_loss": NaN, - "step": 2585, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0898, - "grad_norm": 1.453125, - "learning_rate": 8.624449642745391e-09, - "long_answer_loss": 0.0898, - "loss": 0.0722, - "short_answer_loss": NaN, - "step": 2586, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0657, - "grad_norm": 1.3984375, - "learning_rate": 8.059129816170851e-09, - "long_answer_loss": 0.0657, - "loss": 0.0659, - "short_answer_loss": NaN, - "step": 2587, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0633, - "grad_norm": 1.3671875, - "learning_rate": 7.512965277126083e-09, - "long_answer_loss": 0.0633, - "loss": 0.07, - "short_answer_loss": NaN, - "step": 2588, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0744, - "grad_norm": 1.3828125, - "learning_rate": 6.985956863105047e-09, - "long_answer_loss": 0.0744, - "loss": 0.0735, - "short_answer_loss": NaN, - "step": 2589, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0649, - "grad_norm": 1.359375, - "learning_rate": 6.478105382229371e-09, - "long_answer_loss": 0.0649, - "loss": 0.0711, - "short_answer_loss": NaN, - "step": 2590, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0631, - "grad_norm": 1.421875, - "learning_rate": 5.989411613242791e-09, - "long_answer_loss": 0.0631, - "loss": 0.067, - "short_answer_loss": NaN, - "step": 2591, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0722, - "grad_norm": 1.3984375, - "learning_rate": 5.519876305515315e-09, - "long_answer_loss": 0.0722, - "loss": 0.0706, - "short_answer_loss": NaN, - "step": 2592, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0779, - "grad_norm": 1.4140625, - "learning_rate": 5.069500179036291e-09, - "long_answer_loss": 0.0779, - "loss": 0.0717, - "short_answer_loss": NaN, - "step": 2593, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0775, - "grad_norm": 1.3828125, - "learning_rate": 4.63828392441995e-09, - "long_answer_loss": 0.0775, - "loss": 0.0732, - "short_answer_loss": NaN, - "step": 2594, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.099, - "grad_norm": 1.328125, - "learning_rate": 4.226228202897087e-09, - "long_answer_loss": 0.099, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 2595, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.055, - "grad_norm": 1.4296875, - "learning_rate": 3.833333646319215e-09, - "long_answer_loss": 0.055, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2596, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0773, - "grad_norm": 1.40625, - "learning_rate": 3.4596008571544102e-09, - "long_answer_loss": 0.0773, - "loss": 0.0758, - "short_answer_loss": NaN, - "step": 2597, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0877, - "grad_norm": 1.3203125, - "learning_rate": 3.105030408490084e-09, - "long_answer_loss": 0.0877, - "loss": 0.0697, - "short_answer_loss": NaN, - "step": 2598, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0814, - "grad_norm": 1.4296875, - "learning_rate": 2.7696228440274308e-09, - "long_answer_loss": 0.0814, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 2599, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0646, - "grad_norm": 1.3515625, - "learning_rate": 2.453378678085594e-09, - "long_answer_loss": 0.0646, - "loss": 0.0716, - "short_answer_loss": NaN, - "step": 2600, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0733, - "grad_norm": 1.3984375, - "learning_rate": 2.1562983955975003e-09, - "long_answer_loss": 0.0733, - "loss": 0.0691, - "short_answer_loss": NaN, - "step": 2601, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0927, - "grad_norm": 1.4375, - "learning_rate": 1.8783824521070857e-09, - "long_answer_loss": 0.0927, - "loss": 0.0753, - "short_answer_loss": NaN, - "step": 2602, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0754, - "grad_norm": 1.421875, - "learning_rate": 1.6196312737762342e-09, - "long_answer_loss": 0.0754, - "loss": 0.0723, - "short_answer_loss": NaN, - "step": 2603, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0783, - "grad_norm": 1.421875, - "learning_rate": 1.3800452573750623e-09, - "long_answer_loss": 0.0783, - "loss": 0.072, - "short_answer_loss": NaN, - "step": 2604, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0784, - "grad_norm": 1.3671875, - "learning_rate": 1.1596247702888584e-09, - "long_answer_loss": 0.0784, - "loss": 0.0687, - "short_answer_loss": NaN, - "step": 2605, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0873, - "grad_norm": 1.421875, - "learning_rate": 9.583701505139208e-10, - "long_answer_loss": 0.0873, - "loss": 0.0739, - "short_answer_loss": NaN, - "step": 2606, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0656, - "grad_norm": 1.59375, - "learning_rate": 7.762817066533923e-10, - "long_answer_loss": 0.0656, - "loss": 0.0734, - "short_answer_loss": NaN, - "step": 2607, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0654, - "grad_norm": 1.3515625, - "learning_rate": 6.133597179269757e-10, - "long_answer_loss": 0.0654, - "loss": 0.0684, - "short_answer_loss": NaN, - "step": 2608, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0674, - "grad_norm": 1.4296875, - "learning_rate": 4.696044341598315e-10, - "long_answer_loss": 0.0674, - "loss": 0.0672, - "short_answer_loss": NaN, - "step": 2609, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "full_loss": 0.0789, - "grad_norm": 1.3984375, - "learning_rate": 3.450160757881293e-10, - "long_answer_loss": 0.0789, - "loss": 0.0689, - "short_answer_loss": NaN, - "step": 2610, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "full_loss": 0.0605, - "grad_norm": 1.359375, - "learning_rate": 2.3959483385627146e-10, - "long_answer_loss": 0.0605, - "loss": 0.0675, - "short_answer_loss": NaN, - "step": 2611, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "full_loss": 0.0674, - "grad_norm": 1.390625, - "learning_rate": 1.5334087001828145e-10, - "long_answer_loss": 0.0674, - "loss": 0.07, - "short_answer_loss": NaN, - "step": 2612, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "full_loss": 0.0619, - "grad_norm": 1.421875, - "learning_rate": 8.625431653919158e-11, - "long_answer_loss": 0.0619, - "loss": 0.0748, - "short_answer_loss": NaN, - "step": 2613, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "full_loss": 0.0675, - "grad_norm": 1.5078125, - "learning_rate": 3.833527628810396e-11, - "long_answer_loss": 0.0675, - "loss": 0.0763, - "short_answer_loss": NaN, - "step": 2614, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "full_loss": 0.0748, - "grad_norm": 1.6015625, - "learning_rate": 9.583822746517257e-12, - "long_answer_loss": 0.0748, - "loss": 0.0747, - "short_answer_loss": NaN, - "step": 2615, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "full_loss": 0.0656, - "grad_norm": 1.34375, - "learning_rate": 0.0, - "long_answer_loss": 0.0656, - "loss": 0.0676, - "short_answer_loss": NaN, - "step": 2616, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "step": 2616, - "total_flos": 3.5412472154040566e+18, - "train_loss": 0.1190942916392551, - "train_runtime": 17303.3718, - "train_samples_per_second": 19.355, - "train_steps_per_second": 0.151 - } - ], - "logging_steps": 1.0, - "max_steps": 2616, - "num_input_tokens_seen": 0, - "num_train_epochs": 2, - "save_steps": 1.0, - "total_flos": 3.5412472154040566e+18, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}