{ "best_metric": 1.4569982290267944, "best_model_checkpoint": "./checkpoint-1424", "epoch": 0.9992982456140351, "eval_steps": 16, "global_step": 1424, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007017543859649122, "grad_norm": 0.9928091168403625, "learning_rate": 5.000000000000001e-07, "loss": 3.8149, "step": 1 }, { "epoch": 0.0014035087719298245, "grad_norm": 1.0101275444030762, "learning_rate": 1.0000000000000002e-06, "loss": 4.0694, "step": 2 }, { "epoch": 0.002105263157894737, "grad_norm": 1.0024998188018799, "learning_rate": 1.5e-06, "loss": 3.954, "step": 3 }, { "epoch": 0.002807017543859649, "grad_norm": 0.9291408658027649, "learning_rate": 2.0000000000000003e-06, "loss": 3.8228, "step": 4 }, { "epoch": 0.0035087719298245615, "grad_norm": 1.0567421913146973, "learning_rate": 2.5e-06, "loss": 4.0643, "step": 5 }, { "epoch": 0.004210526315789474, "grad_norm": 0.9363265037536621, "learning_rate": 3e-06, "loss": 4.1041, "step": 6 }, { "epoch": 0.004912280701754386, "grad_norm": 0.971520721912384, "learning_rate": 3.5000000000000004e-06, "loss": 3.8879, "step": 7 }, { "epoch": 0.005614035087719298, "grad_norm": 0.9608014822006226, "learning_rate": 4.000000000000001e-06, "loss": 4.0432, "step": 8 }, { "epoch": 0.00631578947368421, "grad_norm": 0.9707185626029968, "learning_rate": 4.5e-06, "loss": 3.9807, "step": 9 }, { "epoch": 0.007017543859649123, "grad_norm": 1.0041345357894897, "learning_rate": 5e-06, "loss": 4.0688, "step": 10 }, { "epoch": 0.0077192982456140355, "grad_norm": 1.0085757970809937, "learning_rate": 5.500000000000001e-06, "loss": 3.9952, "step": 11 }, { "epoch": 0.008421052631578947, "grad_norm": 0.9630079865455627, "learning_rate": 6e-06, "loss": 3.766, "step": 12 }, { "epoch": 0.009122807017543859, "grad_norm": 1.0072615146636963, "learning_rate": 6.5000000000000004e-06, "loss": 3.8847, "step": 13 }, { "epoch": 0.009824561403508772, "grad_norm": 1.0733579397201538, "learning_rate": 7.000000000000001e-06, "loss": 3.9948, "step": 14 }, { "epoch": 0.010526315789473684, "grad_norm": 0.9916390776634216, "learning_rate": 7.5e-06, "loss": 3.9149, "step": 15 }, { "epoch": 0.011228070175438596, "grad_norm": 1.066650390625, "learning_rate": 8.000000000000001e-06, "loss": 3.8099, "step": 16 }, { "epoch": 0.011228070175438596, "eval_loss": 3.5970983505249023, "eval_runtime": 65.8107, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 16 }, { "epoch": 0.011929824561403509, "grad_norm": 0.9609705209732056, "learning_rate": 8.500000000000002e-06, "loss": 4.2014, "step": 17 }, { "epoch": 0.01263157894736842, "grad_norm": 1.0079936981201172, "learning_rate": 9e-06, "loss": 3.7662, "step": 18 }, { "epoch": 0.013333333333333334, "grad_norm": 0.9501464366912842, "learning_rate": 9.5e-06, "loss": 3.6349, "step": 19 }, { "epoch": 0.014035087719298246, "grad_norm": 1.0884504318237305, "learning_rate": 1e-05, "loss": 4.032, "step": 20 }, { "epoch": 0.014736842105263158, "grad_norm": 1.1201224327087402, "learning_rate": 1.05e-05, "loss": 4.0953, "step": 21 }, { "epoch": 0.015438596491228071, "grad_norm": 1.0220407247543335, "learning_rate": 1.1000000000000001e-05, "loss": 3.7558, "step": 22 }, { "epoch": 0.016140350877192983, "grad_norm": 1.004400610923767, "learning_rate": 1.1500000000000002e-05, "loss": 3.5918, "step": 23 }, { "epoch": 0.016842105263157894, "grad_norm": 1.0860965251922607, "learning_rate": 1.2e-05, "loss": 3.8151, "step": 24 }, { "epoch": 0.017543859649122806, "grad_norm": 0.9569761157035828, "learning_rate": 1.25e-05, "loss": 3.8188, "step": 25 }, { "epoch": 0.018245614035087718, "grad_norm": 1.0531936883926392, "learning_rate": 1.3000000000000001e-05, "loss": 3.6522, "step": 26 }, { "epoch": 0.018947368421052633, "grad_norm": 1.119773507118225, "learning_rate": 1.3500000000000001e-05, "loss": 3.8432, "step": 27 }, { "epoch": 0.019649122807017545, "grad_norm": 1.1256169080734253, "learning_rate": 1.4000000000000001e-05, "loss": 3.7482, "step": 28 }, { "epoch": 0.020350877192982456, "grad_norm": 0.9968459010124207, "learning_rate": 1.45e-05, "loss": 3.713, "step": 29 }, { "epoch": 0.021052631578947368, "grad_norm": 1.0673344135284424, "learning_rate": 1.5e-05, "loss": 3.9127, "step": 30 }, { "epoch": 0.02175438596491228, "grad_norm": 1.027275800704956, "learning_rate": 1.55e-05, "loss": 3.7287, "step": 31 }, { "epoch": 0.02245614035087719, "grad_norm": 1.0490241050720215, "learning_rate": 1.6000000000000003e-05, "loss": 3.4391, "step": 32 }, { "epoch": 0.02245614035087719, "eval_loss": 3.302555561065674, "eval_runtime": 65.8191, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 32 }, { "epoch": 0.023157894736842106, "grad_norm": 1.0123652219772339, "learning_rate": 1.65e-05, "loss": 3.6645, "step": 33 }, { "epoch": 0.023859649122807018, "grad_norm": 0.9930626749992371, "learning_rate": 1.7000000000000003e-05, "loss": 3.5309, "step": 34 }, { "epoch": 0.02456140350877193, "grad_norm": 1.0697325468063354, "learning_rate": 1.75e-05, "loss": 3.4824, "step": 35 }, { "epoch": 0.02526315789473684, "grad_norm": 1.071256399154663, "learning_rate": 1.8e-05, "loss": 3.6134, "step": 36 }, { "epoch": 0.025964912280701753, "grad_norm": 0.9064105153083801, "learning_rate": 1.85e-05, "loss": 3.582, "step": 37 }, { "epoch": 0.02666666666666667, "grad_norm": 1.0789775848388672, "learning_rate": 1.9e-05, "loss": 3.4349, "step": 38 }, { "epoch": 0.02736842105263158, "grad_norm": 1.0820211172103882, "learning_rate": 1.9500000000000003e-05, "loss": 3.552, "step": 39 }, { "epoch": 0.028070175438596492, "grad_norm": 0.8894747495651245, "learning_rate": 2e-05, "loss": 3.3743, "step": 40 }, { "epoch": 0.028771929824561403, "grad_norm": 0.8957179188728333, "learning_rate": 2.05e-05, "loss": 3.5168, "step": 41 }, { "epoch": 0.029473684210526315, "grad_norm": 0.9250975251197815, "learning_rate": 2.1e-05, "loss": 3.2832, "step": 42 }, { "epoch": 0.030175438596491227, "grad_norm": 0.9642888903617859, "learning_rate": 2.15e-05, "loss": 3.2628, "step": 43 }, { "epoch": 0.030877192982456142, "grad_norm": 0.8928602337837219, "learning_rate": 2.2000000000000003e-05, "loss": 3.3208, "step": 44 }, { "epoch": 0.031578947368421054, "grad_norm": 0.900462806224823, "learning_rate": 2.25e-05, "loss": 3.2336, "step": 45 }, { "epoch": 0.032280701754385965, "grad_norm": 0.9215595722198486, "learning_rate": 2.3000000000000003e-05, "loss": 3.1819, "step": 46 }, { "epoch": 0.03298245614035088, "grad_norm": 0.9151031374931335, "learning_rate": 2.35e-05, "loss": 3.0848, "step": 47 }, { "epoch": 0.03368421052631579, "grad_norm": 0.8561698198318481, "learning_rate": 2.4e-05, "loss": 2.9239, "step": 48 }, { "epoch": 0.03368421052631579, "eval_loss": 2.8186538219451904, "eval_runtime": 65.8337, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 48 }, { "epoch": 0.0343859649122807, "grad_norm": 0.9831928014755249, "learning_rate": 2.45e-05, "loss": 3.1884, "step": 49 }, { "epoch": 0.03508771929824561, "grad_norm": 0.8232364058494568, "learning_rate": 2.5e-05, "loss": 2.967, "step": 50 }, { "epoch": 0.035789473684210524, "grad_norm": 0.8265285491943359, "learning_rate": 2.5500000000000003e-05, "loss": 2.9923, "step": 51 }, { "epoch": 0.036491228070175435, "grad_norm": 0.7578099966049194, "learning_rate": 2.6000000000000002e-05, "loss": 3.0271, "step": 52 }, { "epoch": 0.037192982456140354, "grad_norm": 0.9042999744415283, "learning_rate": 2.6500000000000004e-05, "loss": 3.0086, "step": 53 }, { "epoch": 0.037894736842105266, "grad_norm": 0.7089908719062805, "learning_rate": 2.7000000000000002e-05, "loss": 2.7019, "step": 54 }, { "epoch": 0.03859649122807018, "grad_norm": 0.7533642649650574, "learning_rate": 2.7500000000000004e-05, "loss": 2.9794, "step": 55 }, { "epoch": 0.03929824561403509, "grad_norm": 0.7242478728294373, "learning_rate": 2.8000000000000003e-05, "loss": 2.8195, "step": 56 }, { "epoch": 0.04, "grad_norm": 0.7478873133659363, "learning_rate": 2.8499999999999998e-05, "loss": 2.8574, "step": 57 }, { "epoch": 0.04070175438596491, "grad_norm": 0.7415804862976074, "learning_rate": 2.9e-05, "loss": 2.9205, "step": 58 }, { "epoch": 0.041403508771929824, "grad_norm": 0.6366196274757385, "learning_rate": 2.95e-05, "loss": 2.7043, "step": 59 }, { "epoch": 0.042105263157894736, "grad_norm": 0.7335364818572998, "learning_rate": 3e-05, "loss": 2.7626, "step": 60 }, { "epoch": 0.04280701754385965, "grad_norm": 0.7551740407943726, "learning_rate": 3.05e-05, "loss": 2.651, "step": 61 }, { "epoch": 0.04350877192982456, "grad_norm": 0.8546727299690247, "learning_rate": 3.1e-05, "loss": 2.8325, "step": 62 }, { "epoch": 0.04421052631578947, "grad_norm": 0.9969200491905212, "learning_rate": 3.15e-05, "loss": 2.4287, "step": 63 }, { "epoch": 0.04491228070175438, "grad_norm": 1.0204991102218628, "learning_rate": 3.2000000000000005e-05, "loss": 2.9881, "step": 64 }, { "epoch": 0.04491228070175438, "eval_loss": 2.510679006576538, "eval_runtime": 65.8192, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 64 }, { "epoch": 0.0456140350877193, "grad_norm": 0.7464157342910767, "learning_rate": 3.2500000000000004e-05, "loss": 2.8384, "step": 65 }, { "epoch": 0.04631578947368421, "grad_norm": 0.7616225481033325, "learning_rate": 3.3e-05, "loss": 2.3166, "step": 66 }, { "epoch": 0.047017543859649125, "grad_norm": 0.929684042930603, "learning_rate": 3.35e-05, "loss": 2.8315, "step": 67 }, { "epoch": 0.047719298245614036, "grad_norm": 0.719383180141449, "learning_rate": 3.4000000000000007e-05, "loss": 2.6151, "step": 68 }, { "epoch": 0.04842105263157895, "grad_norm": 0.8510761260986328, "learning_rate": 3.45e-05, "loss": 2.9035, "step": 69 }, { "epoch": 0.04912280701754386, "grad_norm": 0.69796222448349, "learning_rate": 3.5e-05, "loss": 2.6384, "step": 70 }, { "epoch": 0.04982456140350877, "grad_norm": 0.6288107633590698, "learning_rate": 3.55e-05, "loss": 2.3917, "step": 71 }, { "epoch": 0.05052631578947368, "grad_norm": 0.6646111011505127, "learning_rate": 3.6e-05, "loss": 2.5939, "step": 72 }, { "epoch": 0.051228070175438595, "grad_norm": 0.7744008898735046, "learning_rate": 3.65e-05, "loss": 2.5397, "step": 73 }, { "epoch": 0.051929824561403506, "grad_norm": 0.6099230647087097, "learning_rate": 3.7e-05, "loss": 2.4414, "step": 74 }, { "epoch": 0.05263157894736842, "grad_norm": 0.6956833600997925, "learning_rate": 3.7500000000000003e-05, "loss": 2.407, "step": 75 }, { "epoch": 0.05333333333333334, "grad_norm": 0.6209631562232971, "learning_rate": 3.8e-05, "loss": 2.443, "step": 76 }, { "epoch": 0.05403508771929825, "grad_norm": 0.6467370390892029, "learning_rate": 3.85e-05, "loss": 2.6548, "step": 77 }, { "epoch": 0.05473684210526316, "grad_norm": 0.5977020263671875, "learning_rate": 3.9000000000000006e-05, "loss": 2.489, "step": 78 }, { "epoch": 0.05543859649122807, "grad_norm": 0.7086820602416992, "learning_rate": 3.9500000000000005e-05, "loss": 2.5178, "step": 79 }, { "epoch": 0.056140350877192984, "grad_norm": 1.2138707637786865, "learning_rate": 4e-05, "loss": 2.6977, "step": 80 }, { "epoch": 0.056140350877192984, "eval_loss": 2.3084614276885986, "eval_runtime": 65.8269, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 80 }, { "epoch": 0.056842105263157895, "grad_norm": 0.691352367401123, "learning_rate": 4.05e-05, "loss": 2.5726, "step": 81 }, { "epoch": 0.05754385964912281, "grad_norm": 0.556818425655365, "learning_rate": 4.1e-05, "loss": 2.3146, "step": 82 }, { "epoch": 0.05824561403508772, "grad_norm": 0.6409417986869812, "learning_rate": 4.15e-05, "loss": 2.4344, "step": 83 }, { "epoch": 0.05894736842105263, "grad_norm": 0.7173857092857361, "learning_rate": 4.2e-05, "loss": 2.6225, "step": 84 }, { "epoch": 0.05964912280701754, "grad_norm": 0.6835973262786865, "learning_rate": 4.25e-05, "loss": 2.5033, "step": 85 }, { "epoch": 0.060350877192982454, "grad_norm": 0.604385256767273, "learning_rate": 4.3e-05, "loss": 2.4072, "step": 86 }, { "epoch": 0.061052631578947365, "grad_norm": 0.7555168271064758, "learning_rate": 4.35e-05, "loss": 2.4785, "step": 87 }, { "epoch": 0.061754385964912284, "grad_norm": 0.5255555510520935, "learning_rate": 4.4000000000000006e-05, "loss": 2.2154, "step": 88 }, { "epoch": 0.062456140350877196, "grad_norm": 0.743011474609375, "learning_rate": 4.4500000000000004e-05, "loss": 2.4447, "step": 89 }, { "epoch": 0.06315789473684211, "grad_norm": 0.5670934319496155, "learning_rate": 4.5e-05, "loss": 2.4279, "step": 90 }, { "epoch": 0.06385964912280702, "grad_norm": 0.6251064538955688, "learning_rate": 4.55e-05, "loss": 2.434, "step": 91 }, { "epoch": 0.06456140350877193, "grad_norm": 0.6435151100158691, "learning_rate": 4.600000000000001e-05, "loss": 2.4084, "step": 92 }, { "epoch": 0.06526315789473684, "grad_norm": 0.553666889667511, "learning_rate": 4.6500000000000005e-05, "loss": 2.2531, "step": 93 }, { "epoch": 0.06596491228070175, "grad_norm": 0.623855471611023, "learning_rate": 4.7e-05, "loss": 2.5186, "step": 94 }, { "epoch": 0.06666666666666667, "grad_norm": 0.57314133644104, "learning_rate": 4.75e-05, "loss": 2.168, "step": 95 }, { "epoch": 0.06736842105263158, "grad_norm": 0.7393507957458496, "learning_rate": 4.8e-05, "loss": 2.5395, "step": 96 }, { "epoch": 0.06736842105263158, "eval_loss": 2.1815531253814697, "eval_runtime": 65.8331, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 96 }, { "epoch": 0.06807017543859649, "grad_norm": 0.6377303600311279, "learning_rate": 4.85e-05, "loss": 2.437, "step": 97 }, { "epoch": 0.0687719298245614, "grad_norm": 0.5928265452384949, "learning_rate": 4.9e-05, "loss": 2.2184, "step": 98 }, { "epoch": 0.06947368421052631, "grad_norm": 0.6546623706817627, "learning_rate": 4.9500000000000004e-05, "loss": 2.3626, "step": 99 }, { "epoch": 0.07017543859649122, "grad_norm": 0.5733924508094788, "learning_rate": 5e-05, "loss": 2.3069, "step": 100 }, { "epoch": 0.07087719298245614, "grad_norm": 0.5549228191375732, "learning_rate": 5.05e-05, "loss": 2.1463, "step": 101 }, { "epoch": 0.07157894736842105, "grad_norm": 0.5057318806648254, "learning_rate": 5.1000000000000006e-05, "loss": 2.1056, "step": 102 }, { "epoch": 0.07228070175438596, "grad_norm": 0.55784010887146, "learning_rate": 5.1500000000000005e-05, "loss": 2.2485, "step": 103 }, { "epoch": 0.07298245614035087, "grad_norm": 0.7705625891685486, "learning_rate": 5.2000000000000004e-05, "loss": 2.4975, "step": 104 }, { "epoch": 0.07368421052631578, "grad_norm": 0.7641386985778809, "learning_rate": 5.25e-05, "loss": 2.3585, "step": 105 }, { "epoch": 0.07438596491228071, "grad_norm": 0.5911398530006409, "learning_rate": 5.300000000000001e-05, "loss": 2.1133, "step": 106 }, { "epoch": 0.07508771929824562, "grad_norm": 0.6771928668022156, "learning_rate": 5.3500000000000006e-05, "loss": 2.3865, "step": 107 }, { "epoch": 0.07578947368421053, "grad_norm": 0.5390835404396057, "learning_rate": 5.4000000000000005e-05, "loss": 2.1475, "step": 108 }, { "epoch": 0.07649122807017544, "grad_norm": 0.6490298509597778, "learning_rate": 5.45e-05, "loss": 2.2857, "step": 109 }, { "epoch": 0.07719298245614035, "grad_norm": 0.5900008678436279, "learning_rate": 5.500000000000001e-05, "loss": 2.1128, "step": 110 }, { "epoch": 0.07789473684210527, "grad_norm": 0.6465023756027222, "learning_rate": 5.550000000000001e-05, "loss": 2.4125, "step": 111 }, { "epoch": 0.07859649122807018, "grad_norm": 0.530427098274231, "learning_rate": 5.6000000000000006e-05, "loss": 2.1766, "step": 112 }, { "epoch": 0.07859649122807018, "eval_loss": 2.0767898559570312, "eval_runtime": 65.8203, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 112 }, { "epoch": 0.07929824561403509, "grad_norm": 0.580603301525116, "learning_rate": 5.65e-05, "loss": 2.1998, "step": 113 }, { "epoch": 0.08, "grad_norm": 0.5760000348091125, "learning_rate": 5.6999999999999996e-05, "loss": 2.3152, "step": 114 }, { "epoch": 0.08070175438596491, "grad_norm": 0.8854110836982727, "learning_rate": 5.7499999999999995e-05, "loss": 2.4859, "step": 115 }, { "epoch": 0.08140350877192983, "grad_norm": 0.6056390404701233, "learning_rate": 5.8e-05, "loss": 2.2121, "step": 116 }, { "epoch": 0.08210526315789474, "grad_norm": 0.6043855547904968, "learning_rate": 5.85e-05, "loss": 2.1688, "step": 117 }, { "epoch": 0.08280701754385965, "grad_norm": 0.5782562494277954, "learning_rate": 5.9e-05, "loss": 1.9482, "step": 118 }, { "epoch": 0.08350877192982456, "grad_norm": 0.5910701155662537, "learning_rate": 5.95e-05, "loss": 2.1588, "step": 119 }, { "epoch": 0.08421052631578947, "grad_norm": 0.5671840310096741, "learning_rate": 6e-05, "loss": 2.2517, "step": 120 }, { "epoch": 0.08491228070175438, "grad_norm": 0.5868790745735168, "learning_rate": 6.05e-05, "loss": 2.0578, "step": 121 }, { "epoch": 0.0856140350877193, "grad_norm": 0.533165454864502, "learning_rate": 6.1e-05, "loss": 2.1406, "step": 122 }, { "epoch": 0.0863157894736842, "grad_norm": 0.6702587008476257, "learning_rate": 6.15e-05, "loss": 2.1612, "step": 123 }, { "epoch": 0.08701754385964912, "grad_norm": 0.8873376250267029, "learning_rate": 6.2e-05, "loss": 2.5703, "step": 124 }, { "epoch": 0.08771929824561403, "grad_norm": 0.591484010219574, "learning_rate": 6.25e-05, "loss": 2.0485, "step": 125 }, { "epoch": 0.08842105263157894, "grad_norm": 0.5561603903770447, "learning_rate": 6.3e-05, "loss": 1.9075, "step": 126 }, { "epoch": 0.08912280701754385, "grad_norm": 0.6469268202781677, "learning_rate": 6.35e-05, "loss": 2.0296, "step": 127 }, { "epoch": 0.08982456140350877, "grad_norm": 0.6495757102966309, "learning_rate": 6.400000000000001e-05, "loss": 2.1601, "step": 128 }, { "epoch": 0.08982456140350877, "eval_loss": 1.990267276763916, "eval_runtime": 65.8317, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 128 }, { "epoch": 0.09052631578947369, "grad_norm": 0.5613084435462952, "learning_rate": 6.450000000000001e-05, "loss": 1.9591, "step": 129 }, { "epoch": 0.0912280701754386, "grad_norm": 0.572991132736206, "learning_rate": 6.500000000000001e-05, "loss": 2.0454, "step": 130 }, { "epoch": 0.09192982456140351, "grad_norm": 0.5316200256347656, "learning_rate": 6.55e-05, "loss": 1.7485, "step": 131 }, { "epoch": 0.09263157894736843, "grad_norm": 0.6565808653831482, "learning_rate": 6.6e-05, "loss": 2.0338, "step": 132 }, { "epoch": 0.09333333333333334, "grad_norm": 0.7127291560173035, "learning_rate": 6.65e-05, "loss": 1.9532, "step": 133 }, { "epoch": 0.09403508771929825, "grad_norm": 0.684954047203064, "learning_rate": 6.7e-05, "loss": 2.012, "step": 134 }, { "epoch": 0.09473684210526316, "grad_norm": 0.5826917290687561, "learning_rate": 6.750000000000001e-05, "loss": 2.046, "step": 135 }, { "epoch": 0.09543859649122807, "grad_norm": 0.6228445768356323, "learning_rate": 6.800000000000001e-05, "loss": 1.9479, "step": 136 }, { "epoch": 0.09614035087719298, "grad_norm": 0.5441390872001648, "learning_rate": 6.850000000000001e-05, "loss": 1.9236, "step": 137 }, { "epoch": 0.0968421052631579, "grad_norm": 0.5885330438613892, "learning_rate": 6.9e-05, "loss": 2.1276, "step": 138 }, { "epoch": 0.09754385964912281, "grad_norm": 0.5856627225875854, "learning_rate": 6.95e-05, "loss": 2.0872, "step": 139 }, { "epoch": 0.09824561403508772, "grad_norm": 0.5710261464118958, "learning_rate": 7e-05, "loss": 1.9711, "step": 140 }, { "epoch": 0.09894736842105263, "grad_norm": 0.5488821864128113, "learning_rate": 7.05e-05, "loss": 1.9962, "step": 141 }, { "epoch": 0.09964912280701754, "grad_norm": 0.6389551758766174, "learning_rate": 7.1e-05, "loss": 2.0884, "step": 142 }, { "epoch": 0.10035087719298245, "grad_norm": 0.6057158708572388, "learning_rate": 7.15e-05, "loss": 1.9124, "step": 143 }, { "epoch": 0.10105263157894737, "grad_norm": 0.6388987302780151, "learning_rate": 7.2e-05, "loss": 2.0115, "step": 144 }, { "epoch": 0.10105263157894737, "eval_loss": 1.920411229133606, "eval_runtime": 65.8407, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 144 }, { "epoch": 0.10175438596491228, "grad_norm": 0.634857714176178, "learning_rate": 7.25e-05, "loss": 1.895, "step": 145 }, { "epoch": 0.10245614035087719, "grad_norm": 0.6049835681915283, "learning_rate": 7.3e-05, "loss": 2.05, "step": 146 }, { "epoch": 0.1031578947368421, "grad_norm": 0.6329953074455261, "learning_rate": 7.35e-05, "loss": 2.0513, "step": 147 }, { "epoch": 0.10385964912280701, "grad_norm": 0.6400398015975952, "learning_rate": 7.4e-05, "loss": 2.047, "step": 148 }, { "epoch": 0.10456140350877192, "grad_norm": 0.5854972004890442, "learning_rate": 7.450000000000001e-05, "loss": 2.0214, "step": 149 }, { "epoch": 0.10526315789473684, "grad_norm": 0.6599582433700562, "learning_rate": 7.500000000000001e-05, "loss": 1.97, "step": 150 }, { "epoch": 0.10596491228070175, "grad_norm": 0.6030476093292236, "learning_rate": 7.55e-05, "loss": 1.947, "step": 151 }, { "epoch": 0.10666666666666667, "grad_norm": 0.6454229950904846, "learning_rate": 7.6e-05, "loss": 1.8443, "step": 152 }, { "epoch": 0.10736842105263159, "grad_norm": 0.5656477212905884, "learning_rate": 7.65e-05, "loss": 1.9065, "step": 153 }, { "epoch": 0.1080701754385965, "grad_norm": 0.7885503172874451, "learning_rate": 7.7e-05, "loss": 2.0137, "step": 154 }, { "epoch": 0.10877192982456141, "grad_norm": 0.5692377686500549, "learning_rate": 7.75e-05, "loss": 1.8321, "step": 155 }, { "epoch": 0.10947368421052632, "grad_norm": 0.6062223315238953, "learning_rate": 7.800000000000001e-05, "loss": 1.8324, "step": 156 }, { "epoch": 0.11017543859649123, "grad_norm": 0.6104239225387573, "learning_rate": 7.850000000000001e-05, "loss": 2.0809, "step": 157 }, { "epoch": 0.11087719298245614, "grad_norm": 0.5728961825370789, "learning_rate": 7.900000000000001e-05, "loss": 1.7715, "step": 158 }, { "epoch": 0.11157894736842106, "grad_norm": 0.5158399939537048, "learning_rate": 7.950000000000001e-05, "loss": 1.779, "step": 159 }, { "epoch": 0.11228070175438597, "grad_norm": 0.6013870239257812, "learning_rate": 8e-05, "loss": 1.8702, "step": 160 }, { "epoch": 0.11228070175438597, "eval_loss": 1.859572410583496, "eval_runtime": 65.8254, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 160 }, { "epoch": 0.11298245614035088, "grad_norm": 0.5647510290145874, "learning_rate": 8.05e-05, "loss": 1.9163, "step": 161 }, { "epoch": 0.11368421052631579, "grad_norm": 0.6037722229957581, "learning_rate": 8.1e-05, "loss": 1.8491, "step": 162 }, { "epoch": 0.1143859649122807, "grad_norm": 0.5200175642967224, "learning_rate": 8.15e-05, "loss": 1.8144, "step": 163 }, { "epoch": 0.11508771929824561, "grad_norm": 0.5617084503173828, "learning_rate": 8.2e-05, "loss": 1.7968, "step": 164 }, { "epoch": 0.11578947368421053, "grad_norm": 0.6475440263748169, "learning_rate": 8.25e-05, "loss": 1.9709, "step": 165 }, { "epoch": 0.11649122807017544, "grad_norm": 0.5815696716308594, "learning_rate": 8.3e-05, "loss": 1.8604, "step": 166 }, { "epoch": 0.11719298245614035, "grad_norm": 0.5634240508079529, "learning_rate": 8.35e-05, "loss": 1.8542, "step": 167 }, { "epoch": 0.11789473684210526, "grad_norm": 0.5449738502502441, "learning_rate": 8.4e-05, "loss": 1.7606, "step": 168 }, { "epoch": 0.11859649122807017, "grad_norm": 0.5260114669799805, "learning_rate": 8.450000000000001e-05, "loss": 1.6395, "step": 169 }, { "epoch": 0.11929824561403508, "grad_norm": 0.6559540033340454, "learning_rate": 8.5e-05, "loss": 1.7735, "step": 170 }, { "epoch": 0.12, "grad_norm": 0.6063936352729797, "learning_rate": 8.55e-05, "loss": 1.8046, "step": 171 }, { "epoch": 0.12070175438596491, "grad_norm": 0.631157636642456, "learning_rate": 8.6e-05, "loss": 1.8018, "step": 172 }, { "epoch": 0.12140350877192982, "grad_norm": 0.612994909286499, "learning_rate": 8.65e-05, "loss": 1.7551, "step": 173 }, { "epoch": 0.12210526315789473, "grad_norm": 0.5569384694099426, "learning_rate": 8.7e-05, "loss": 1.7306, "step": 174 }, { "epoch": 0.12280701754385964, "grad_norm": 0.7006474137306213, "learning_rate": 8.75e-05, "loss": 1.9716, "step": 175 }, { "epoch": 0.12350877192982457, "grad_norm": 0.6427428722381592, "learning_rate": 8.800000000000001e-05, "loss": 1.903, "step": 176 }, { "epoch": 0.12350877192982457, "eval_loss": 1.812495470046997, "eval_runtime": 65.8221, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 176 }, { "epoch": 0.12421052631578948, "grad_norm": 0.6829315423965454, "learning_rate": 8.850000000000001e-05, "loss": 1.7839, "step": 177 }, { "epoch": 0.12491228070175439, "grad_norm": 0.5791435241699219, "learning_rate": 8.900000000000001e-05, "loss": 1.8174, "step": 178 }, { "epoch": 0.1256140350877193, "grad_norm": 0.7477076053619385, "learning_rate": 8.950000000000001e-05, "loss": 2.0264, "step": 179 }, { "epoch": 0.12631578947368421, "grad_norm": 0.6763526201248169, "learning_rate": 9e-05, "loss": 1.7543, "step": 180 }, { "epoch": 0.1270175438596491, "grad_norm": 0.6811761260032654, "learning_rate": 9.05e-05, "loss": 2.0755, "step": 181 }, { "epoch": 0.12771929824561404, "grad_norm": 0.5490175485610962, "learning_rate": 9.1e-05, "loss": 1.7176, "step": 182 }, { "epoch": 0.12842105263157894, "grad_norm": 0.5766850113868713, "learning_rate": 9.15e-05, "loss": 1.6224, "step": 183 }, { "epoch": 0.12912280701754386, "grad_norm": 1.223882794380188, "learning_rate": 9.200000000000001e-05, "loss": 2.2317, "step": 184 }, { "epoch": 0.12982456140350876, "grad_norm": 0.5552406311035156, "learning_rate": 9.250000000000001e-05, "loss": 1.7407, "step": 185 }, { "epoch": 0.13052631578947368, "grad_norm": 0.6658439636230469, "learning_rate": 9.300000000000001e-05, "loss": 1.8968, "step": 186 }, { "epoch": 0.1312280701754386, "grad_norm": 0.5724976658821106, "learning_rate": 9.350000000000001e-05, "loss": 1.7009, "step": 187 }, { "epoch": 0.1319298245614035, "grad_norm": 1.0073543787002563, "learning_rate": 9.4e-05, "loss": 1.9768, "step": 188 }, { "epoch": 0.13263157894736843, "grad_norm": 0.6007497310638428, "learning_rate": 9.449999999999999e-05, "loss": 1.8139, "step": 189 }, { "epoch": 0.13333333333333333, "grad_norm": 0.6300903558731079, "learning_rate": 9.5e-05, "loss": 1.9173, "step": 190 }, { "epoch": 0.13403508771929826, "grad_norm": 1.058654546737671, "learning_rate": 9.55e-05, "loss": 2.2861, "step": 191 }, { "epoch": 0.13473684210526315, "grad_norm": 0.7176299095153809, "learning_rate": 9.6e-05, "loss": 1.7633, "step": 192 }, { "epoch": 0.13473684210526315, "eval_loss": 1.7750072479248047, "eval_runtime": 65.8237, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 192 }, { "epoch": 0.13543859649122808, "grad_norm": 0.6150356531143188, "learning_rate": 9.65e-05, "loss": 1.7951, "step": 193 }, { "epoch": 0.13614035087719298, "grad_norm": 0.5919904708862305, "learning_rate": 9.7e-05, "loss": 1.9202, "step": 194 }, { "epoch": 0.1368421052631579, "grad_norm": 0.6272373795509338, "learning_rate": 9.75e-05, "loss": 1.9147, "step": 195 }, { "epoch": 0.1375438596491228, "grad_norm": 0.5895818471908569, "learning_rate": 9.8e-05, "loss": 1.806, "step": 196 }, { "epoch": 0.13824561403508773, "grad_norm": 0.5607153177261353, "learning_rate": 9.850000000000001e-05, "loss": 1.7982, "step": 197 }, { "epoch": 0.13894736842105262, "grad_norm": 0.6958012580871582, "learning_rate": 9.900000000000001e-05, "loss": 1.6869, "step": 198 }, { "epoch": 0.13964912280701755, "grad_norm": 0.6276410818099976, "learning_rate": 9.95e-05, "loss": 1.7787, "step": 199 }, { "epoch": 0.14035087719298245, "grad_norm": 0.6062275767326355, "learning_rate": 0.0001, "loss": 1.8814, "step": 200 }, { "epoch": 0.14105263157894737, "grad_norm": 0.5878540277481079, "learning_rate": 0.00010049999999999999, "loss": 1.778, "step": 201 }, { "epoch": 0.14175438596491227, "grad_norm": 0.5870038866996765, "learning_rate": 0.000101, "loss": 1.7957, "step": 202 }, { "epoch": 0.1424561403508772, "grad_norm": 0.5907621383666992, "learning_rate": 0.0001015, "loss": 1.7461, "step": 203 }, { "epoch": 0.1431578947368421, "grad_norm": 0.5737916231155396, "learning_rate": 0.00010200000000000001, "loss": 1.6647, "step": 204 }, { "epoch": 0.14385964912280702, "grad_norm": 0.5673708319664001, "learning_rate": 0.0001025, "loss": 1.7307, "step": 205 }, { "epoch": 0.14456140350877192, "grad_norm": 0.5708598494529724, "learning_rate": 0.00010300000000000001, "loss": 1.8256, "step": 206 }, { "epoch": 0.14526315789473684, "grad_norm": 0.5941545963287354, "learning_rate": 0.0001035, "loss": 1.9667, "step": 207 }, { "epoch": 0.14596491228070174, "grad_norm": 0.7349475622177124, "learning_rate": 0.00010400000000000001, "loss": 2.102, "step": 208 }, { "epoch": 0.14596491228070174, "eval_loss": 1.7481374740600586, "eval_runtime": 65.8261, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 208 }, { "epoch": 0.14666666666666667, "grad_norm": 0.8287472724914551, "learning_rate": 0.00010449999999999999, "loss": 1.7899, "step": 209 }, { "epoch": 0.14736842105263157, "grad_norm": 0.5865479111671448, "learning_rate": 0.000105, "loss": 1.6977, "step": 210 }, { "epoch": 0.1480701754385965, "grad_norm": 0.6178992986679077, "learning_rate": 0.0001055, "loss": 1.7282, "step": 211 }, { "epoch": 0.14877192982456142, "grad_norm": 0.6013645529747009, "learning_rate": 0.00010600000000000002, "loss": 1.7816, "step": 212 }, { "epoch": 0.14947368421052631, "grad_norm": 0.61904376745224, "learning_rate": 0.0001065, "loss": 1.8316, "step": 213 }, { "epoch": 0.15017543859649124, "grad_norm": 0.724051296710968, "learning_rate": 0.00010700000000000001, "loss": 1.8162, "step": 214 }, { "epoch": 0.15087719298245614, "grad_norm": 0.6658437848091125, "learning_rate": 0.0001075, "loss": 1.8453, "step": 215 }, { "epoch": 0.15157894736842106, "grad_norm": 0.5698081851005554, "learning_rate": 0.00010800000000000001, "loss": 1.852, "step": 216 }, { "epoch": 0.15228070175438596, "grad_norm": 0.7208857536315918, "learning_rate": 0.00010850000000000001, "loss": 1.8127, "step": 217 }, { "epoch": 0.1529824561403509, "grad_norm": 0.6816664338111877, "learning_rate": 0.000109, "loss": 1.927, "step": 218 }, { "epoch": 0.15368421052631578, "grad_norm": 0.6503348350524902, "learning_rate": 0.0001095, "loss": 1.7889, "step": 219 }, { "epoch": 0.1543859649122807, "grad_norm": 0.6846699118614197, "learning_rate": 0.00011000000000000002, "loss": 1.7617, "step": 220 }, { "epoch": 0.1550877192982456, "grad_norm": 0.7368967533111572, "learning_rate": 0.0001105, "loss": 2.0188, "step": 221 }, { "epoch": 0.15578947368421053, "grad_norm": 0.731460690498352, "learning_rate": 0.00011100000000000001, "loss": 1.9244, "step": 222 }, { "epoch": 0.15649122807017543, "grad_norm": 0.6871458292007446, "learning_rate": 0.0001115, "loss": 1.8466, "step": 223 }, { "epoch": 0.15719298245614036, "grad_norm": 0.5553243160247803, "learning_rate": 0.00011200000000000001, "loss": 1.5977, "step": 224 }, { "epoch": 0.15719298245614036, "eval_loss": 1.724018931388855, "eval_runtime": 65.8204, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 224 }, { "epoch": 0.15789473684210525, "grad_norm": 0.7251119017601013, "learning_rate": 0.00011250000000000001, "loss": 1.7306, "step": 225 }, { "epoch": 0.15859649122807018, "grad_norm": 0.6465187668800354, "learning_rate": 0.000113, "loss": 1.849, "step": 226 }, { "epoch": 0.15929824561403508, "grad_norm": 0.6604562401771545, "learning_rate": 0.00011350000000000001, "loss": 1.8163, "step": 227 }, { "epoch": 0.16, "grad_norm": 0.6065703630447388, "learning_rate": 0.00011399999999999999, "loss": 1.561, "step": 228 }, { "epoch": 0.1607017543859649, "grad_norm": 1.0526500940322876, "learning_rate": 0.0001145, "loss": 1.8566, "step": 229 }, { "epoch": 0.16140350877192983, "grad_norm": 0.6930262446403503, "learning_rate": 0.00011499999999999999, "loss": 1.9216, "step": 230 }, { "epoch": 0.16210526315789472, "grad_norm": 0.6250812411308289, "learning_rate": 0.0001155, "loss": 1.8288, "step": 231 }, { "epoch": 0.16280701754385965, "grad_norm": 0.7135443687438965, "learning_rate": 0.000116, "loss": 1.977, "step": 232 }, { "epoch": 0.16350877192982455, "grad_norm": 0.6365846395492554, "learning_rate": 0.00011650000000000001, "loss": 1.6024, "step": 233 }, { "epoch": 0.16421052631578947, "grad_norm": 0.8923673629760742, "learning_rate": 0.000117, "loss": 1.7248, "step": 234 }, { "epoch": 0.1649122807017544, "grad_norm": 0.6961442828178406, "learning_rate": 0.00011750000000000001, "loss": 1.7547, "step": 235 }, { "epoch": 0.1656140350877193, "grad_norm": 0.6233668923377991, "learning_rate": 0.000118, "loss": 1.8322, "step": 236 }, { "epoch": 0.16631578947368422, "grad_norm": 0.7002992033958435, "learning_rate": 0.00011850000000000001, "loss": 1.7196, "step": 237 }, { "epoch": 0.16701754385964912, "grad_norm": 0.6702497005462646, "learning_rate": 0.000119, "loss": 1.7846, "step": 238 }, { "epoch": 0.16771929824561405, "grad_norm": 0.9008122086524963, "learning_rate": 0.00011950000000000002, "loss": 1.8783, "step": 239 }, { "epoch": 0.16842105263157894, "grad_norm": 0.8020610809326172, "learning_rate": 0.00012, "loss": 1.955, "step": 240 }, { "epoch": 0.16842105263157894, "eval_loss": 1.6964468955993652, "eval_runtime": 65.818, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 240 }, { "epoch": 0.16912280701754387, "grad_norm": 0.6467213034629822, "learning_rate": 0.00012050000000000002, "loss": 1.7371, "step": 241 }, { "epoch": 0.16982456140350877, "grad_norm": 0.7768784761428833, "learning_rate": 0.000121, "loss": 1.8607, "step": 242 }, { "epoch": 0.1705263157894737, "grad_norm": 0.729199230670929, "learning_rate": 0.00012150000000000001, "loss": 1.7571, "step": 243 }, { "epoch": 0.1712280701754386, "grad_norm": 0.5859960913658142, "learning_rate": 0.000122, "loss": 1.7332, "step": 244 }, { "epoch": 0.17192982456140352, "grad_norm": 1.2551172971725464, "learning_rate": 0.00012250000000000002, "loss": 1.886, "step": 245 }, { "epoch": 0.1726315789473684, "grad_norm": 0.8452144861221313, "learning_rate": 0.000123, "loss": 1.7953, "step": 246 }, { "epoch": 0.17333333333333334, "grad_norm": 0.5969884395599365, "learning_rate": 0.00012350000000000002, "loss": 1.6984, "step": 247 }, { "epoch": 0.17403508771929824, "grad_norm": 0.5964519381523132, "learning_rate": 0.000124, "loss": 1.7451, "step": 248 }, { "epoch": 0.17473684210526316, "grad_norm": 0.605486273765564, "learning_rate": 0.00012450000000000002, "loss": 1.5528, "step": 249 }, { "epoch": 0.17543859649122806, "grad_norm": 0.7599868178367615, "learning_rate": 0.000125, "loss": 1.9283, "step": 250 }, { "epoch": 0.17614035087719299, "grad_norm": 0.6096434593200684, "learning_rate": 0.0001255, "loss": 1.6433, "step": 251 }, { "epoch": 0.17684210526315788, "grad_norm": 0.6275851726531982, "learning_rate": 0.000126, "loss": 1.8409, "step": 252 }, { "epoch": 0.1775438596491228, "grad_norm": 0.653072714805603, "learning_rate": 0.00012649999999999998, "loss": 1.7497, "step": 253 }, { "epoch": 0.1782456140350877, "grad_norm": 0.671919047832489, "learning_rate": 0.000127, "loss": 1.6344, "step": 254 }, { "epoch": 0.17894736842105263, "grad_norm": 0.5729222297668457, "learning_rate": 0.0001275, "loss": 1.4947, "step": 255 }, { "epoch": 0.17964912280701753, "grad_norm": 0.6807007789611816, "learning_rate": 0.00012800000000000002, "loss": 1.7112, "step": 256 }, { "epoch": 0.17964912280701753, "eval_loss": 1.676100730895996, "eval_runtime": 65.8237, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 256 }, { "epoch": 0.18035087719298246, "grad_norm": 0.6606838703155518, "learning_rate": 0.0001285, "loss": 1.7558, "step": 257 }, { "epoch": 0.18105263157894738, "grad_norm": 0.5948024988174438, "learning_rate": 0.00012900000000000002, "loss": 1.6916, "step": 258 }, { "epoch": 0.18175438596491228, "grad_norm": 0.6808467507362366, "learning_rate": 0.0001295, "loss": 1.8348, "step": 259 }, { "epoch": 0.1824561403508772, "grad_norm": 0.7079038619995117, "learning_rate": 0.00013000000000000002, "loss": 2.0326, "step": 260 }, { "epoch": 0.1831578947368421, "grad_norm": 0.6637305617332458, "learning_rate": 0.0001305, "loss": 1.623, "step": 261 }, { "epoch": 0.18385964912280703, "grad_norm": 0.6765385866165161, "learning_rate": 0.000131, "loss": 1.7191, "step": 262 }, { "epoch": 0.18456140350877193, "grad_norm": 0.6493529677391052, "learning_rate": 0.0001315, "loss": 1.7191, "step": 263 }, { "epoch": 0.18526315789473685, "grad_norm": 0.6313503384590149, "learning_rate": 0.000132, "loss": 1.769, "step": 264 }, { "epoch": 0.18596491228070175, "grad_norm": 0.6911813020706177, "learning_rate": 0.0001325, "loss": 1.8371, "step": 265 }, { "epoch": 0.18666666666666668, "grad_norm": 0.787820041179657, "learning_rate": 0.000133, "loss": 1.9518, "step": 266 }, { "epoch": 0.18736842105263157, "grad_norm": 0.6896628737449646, "learning_rate": 0.0001335, "loss": 1.5449, "step": 267 }, { "epoch": 0.1880701754385965, "grad_norm": 0.654269278049469, "learning_rate": 0.000134, "loss": 1.7359, "step": 268 }, { "epoch": 0.1887719298245614, "grad_norm": 0.7054020762443542, "learning_rate": 0.00013450000000000002, "loss": 1.6972, "step": 269 }, { "epoch": 0.18947368421052632, "grad_norm": 0.6697853803634644, "learning_rate": 0.00013500000000000003, "loss": 1.732, "step": 270 }, { "epoch": 0.19017543859649122, "grad_norm": 0.8842753171920776, "learning_rate": 0.00013550000000000001, "loss": 1.5646, "step": 271 }, { "epoch": 0.19087719298245615, "grad_norm": 0.7370839715003967, "learning_rate": 0.00013600000000000003, "loss": 1.8501, "step": 272 }, { "epoch": 0.19087719298245615, "eval_loss": 1.6598398685455322, "eval_runtime": 65.8278, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 272 }, { "epoch": 0.19157894736842104, "grad_norm": 0.6529443860054016, "learning_rate": 0.0001365, "loss": 1.7235, "step": 273 }, { "epoch": 0.19228070175438597, "grad_norm": 0.701677143573761, "learning_rate": 0.00013700000000000002, "loss": 1.7986, "step": 274 }, { "epoch": 0.19298245614035087, "grad_norm": 0.6164575815200806, "learning_rate": 0.0001375, "loss": 1.6767, "step": 275 }, { "epoch": 0.1936842105263158, "grad_norm": 0.7337639331817627, "learning_rate": 0.000138, "loss": 1.7697, "step": 276 }, { "epoch": 0.1943859649122807, "grad_norm": 0.7953131794929504, "learning_rate": 0.0001385, "loss": 1.6435, "step": 277 }, { "epoch": 0.19508771929824562, "grad_norm": 0.6779031753540039, "learning_rate": 0.000139, "loss": 1.7941, "step": 278 }, { "epoch": 0.1957894736842105, "grad_norm": 0.6542791724205017, "learning_rate": 0.0001395, "loss": 1.5838, "step": 279 }, { "epoch": 0.19649122807017544, "grad_norm": 0.6996631622314453, "learning_rate": 0.00014, "loss": 1.7903, "step": 280 }, { "epoch": 0.19719298245614036, "grad_norm": 0.7134291529655457, "learning_rate": 0.0001405, "loss": 1.7672, "step": 281 }, { "epoch": 0.19789473684210526, "grad_norm": 0.7196358442306519, "learning_rate": 0.000141, "loss": 1.8483, "step": 282 }, { "epoch": 0.1985964912280702, "grad_norm": 0.6014859080314636, "learning_rate": 0.0001415, "loss": 1.5573, "step": 283 }, { "epoch": 0.19929824561403509, "grad_norm": 0.6568789482116699, "learning_rate": 0.000142, "loss": 1.5945, "step": 284 }, { "epoch": 0.2, "grad_norm": 0.6452473402023315, "learning_rate": 0.00014250000000000002, "loss": 1.7326, "step": 285 }, { "epoch": 0.2007017543859649, "grad_norm": 0.721074104309082, "learning_rate": 0.000143, "loss": 1.7072, "step": 286 }, { "epoch": 0.20140350877192983, "grad_norm": 0.6635439991950989, "learning_rate": 0.00014350000000000002, "loss": 1.6528, "step": 287 }, { "epoch": 0.20210526315789473, "grad_norm": 0.6471149921417236, "learning_rate": 0.000144, "loss": 1.7238, "step": 288 }, { "epoch": 0.20210526315789473, "eval_loss": 1.648368239402771, "eval_runtime": 65.8256, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 288 }, { "epoch": 0.20280701754385966, "grad_norm": 0.6502244472503662, "learning_rate": 0.00014450000000000002, "loss": 1.6535, "step": 289 }, { "epoch": 0.20350877192982456, "grad_norm": 0.7102059721946716, "learning_rate": 0.000145, "loss": 1.8086, "step": 290 }, { "epoch": 0.20421052631578948, "grad_norm": 0.6370074152946472, "learning_rate": 0.0001455, "loss": 1.6384, "step": 291 }, { "epoch": 0.20491228070175438, "grad_norm": 0.6752753853797913, "learning_rate": 0.000146, "loss": 1.7305, "step": 292 }, { "epoch": 0.2056140350877193, "grad_norm": 0.6043215394020081, "learning_rate": 0.0001465, "loss": 1.522, "step": 293 }, { "epoch": 0.2063157894736842, "grad_norm": 0.6124662756919861, "learning_rate": 0.000147, "loss": 1.7533, "step": 294 }, { "epoch": 0.20701754385964913, "grad_norm": 0.6076603531837463, "learning_rate": 0.0001475, "loss": 1.6816, "step": 295 }, { "epoch": 0.20771929824561403, "grad_norm": 0.6305497884750366, "learning_rate": 0.000148, "loss": 1.7329, "step": 296 }, { "epoch": 0.20842105263157895, "grad_norm": 0.650345504283905, "learning_rate": 0.0001485, "loss": 1.6534, "step": 297 }, { "epoch": 0.20912280701754385, "grad_norm": 0.6520441770553589, "learning_rate": 0.00014900000000000002, "loss": 1.626, "step": 298 }, { "epoch": 0.20982456140350877, "grad_norm": 0.6576977968215942, "learning_rate": 0.00014950000000000003, "loss": 1.7041, "step": 299 }, { "epoch": 0.21052631578947367, "grad_norm": 0.7217229604721069, "learning_rate": 0.00015000000000000001, "loss": 1.7346, "step": 300 }, { "epoch": 0.2112280701754386, "grad_norm": 0.6894650459289551, "learning_rate": 0.0001505, "loss": 1.7297, "step": 301 }, { "epoch": 0.2119298245614035, "grad_norm": 0.6754316687583923, "learning_rate": 0.000151, "loss": 1.679, "step": 302 }, { "epoch": 0.21263157894736842, "grad_norm": 0.8202869892120361, "learning_rate": 0.0001515, "loss": 1.8756, "step": 303 }, { "epoch": 0.21333333333333335, "grad_norm": 0.6516512632369995, "learning_rate": 0.000152, "loss": 1.6181, "step": 304 }, { "epoch": 0.21333333333333335, "eval_loss": 1.6320592164993286, "eval_runtime": 65.8244, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 304 }, { "epoch": 0.21403508771929824, "grad_norm": 0.713117241859436, "learning_rate": 0.0001525, "loss": 1.7376, "step": 305 }, { "epoch": 0.21473684210526317, "grad_norm": 0.7311456203460693, "learning_rate": 0.000153, "loss": 1.7403, "step": 306 }, { "epoch": 0.21543859649122807, "grad_norm": 0.7241332530975342, "learning_rate": 0.0001535, "loss": 1.6339, "step": 307 }, { "epoch": 0.216140350877193, "grad_norm": 0.6111841201782227, "learning_rate": 0.000154, "loss": 1.5547, "step": 308 }, { "epoch": 0.2168421052631579, "grad_norm": 0.6247242093086243, "learning_rate": 0.0001545, "loss": 1.7161, "step": 309 }, { "epoch": 0.21754385964912282, "grad_norm": 0.6699721813201904, "learning_rate": 0.000155, "loss": 1.7845, "step": 310 }, { "epoch": 0.21824561403508771, "grad_norm": 0.708781361579895, "learning_rate": 0.0001555, "loss": 1.6824, "step": 311 }, { "epoch": 0.21894736842105264, "grad_norm": 0.6338351964950562, "learning_rate": 0.00015600000000000002, "loss": 1.6175, "step": 312 }, { "epoch": 0.21964912280701754, "grad_norm": 0.9422939419746399, "learning_rate": 0.0001565, "loss": 1.5341, "step": 313 }, { "epoch": 0.22035087719298246, "grad_norm": 0.6700330376625061, "learning_rate": 0.00015700000000000002, "loss": 1.6725, "step": 314 }, { "epoch": 0.22105263157894736, "grad_norm": 0.9142582416534424, "learning_rate": 0.0001575, "loss": 1.7215, "step": 315 }, { "epoch": 0.2217543859649123, "grad_norm": 0.7290825843811035, "learning_rate": 0.00015800000000000002, "loss": 1.6134, "step": 316 }, { "epoch": 0.22245614035087719, "grad_norm": 0.6589093208312988, "learning_rate": 0.0001585, "loss": 1.6423, "step": 317 }, { "epoch": 0.2231578947368421, "grad_norm": 0.7130411267280579, "learning_rate": 0.00015900000000000002, "loss": 1.6324, "step": 318 }, { "epoch": 0.223859649122807, "grad_norm": 0.6651201844215393, "learning_rate": 0.0001595, "loss": 1.7025, "step": 319 }, { "epoch": 0.22456140350877193, "grad_norm": 0.7485007643699646, "learning_rate": 0.00016, "loss": 1.7803, "step": 320 }, { "epoch": 0.22456140350877193, "eval_loss": 1.629869818687439, "eval_runtime": 65.8236, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 320 }, { "epoch": 0.22526315789473683, "grad_norm": 0.9492323398590088, "learning_rate": 0.0001605, "loss": 1.6981, "step": 321 }, { "epoch": 0.22596491228070176, "grad_norm": 57.05122375488281, "learning_rate": 0.000161, "loss": 1.5416, "step": 322 }, { "epoch": 0.22666666666666666, "grad_norm": 0.7079808712005615, "learning_rate": 0.0001615, "loss": 1.7357, "step": 323 }, { "epoch": 0.22736842105263158, "grad_norm": 0.6561883091926575, "learning_rate": 0.000162, "loss": 1.6631, "step": 324 }, { "epoch": 0.22807017543859648, "grad_norm": 0.7194943428039551, "learning_rate": 0.00016250000000000002, "loss": 1.5343, "step": 325 }, { "epoch": 0.2287719298245614, "grad_norm": 0.7897792458534241, "learning_rate": 0.000163, "loss": 1.6443, "step": 326 }, { "epoch": 0.2294736842105263, "grad_norm": 0.6690762042999268, "learning_rate": 0.00016350000000000002, "loss": 1.5506, "step": 327 }, { "epoch": 0.23017543859649123, "grad_norm": 0.7214846611022949, "learning_rate": 0.000164, "loss": 1.6735, "step": 328 }, { "epoch": 0.23087719298245615, "grad_norm": 0.6350500583648682, "learning_rate": 0.00016450000000000001, "loss": 1.7185, "step": 329 }, { "epoch": 0.23157894736842105, "grad_norm": 0.6072900295257568, "learning_rate": 0.000165, "loss": 1.6076, "step": 330 }, { "epoch": 0.23228070175438598, "grad_norm": 0.678343653678894, "learning_rate": 0.0001655, "loss": 1.7155, "step": 331 }, { "epoch": 0.23298245614035087, "grad_norm": 0.7727928161621094, "learning_rate": 0.000166, "loss": 1.6449, "step": 332 }, { "epoch": 0.2336842105263158, "grad_norm": 0.638267993927002, "learning_rate": 0.0001665, "loss": 1.484, "step": 333 }, { "epoch": 0.2343859649122807, "grad_norm": 0.6628909111022949, "learning_rate": 0.000167, "loss": 1.6804, "step": 334 }, { "epoch": 0.23508771929824562, "grad_norm": 0.6747671961784363, "learning_rate": 0.0001675, "loss": 1.7358, "step": 335 }, { "epoch": 0.23578947368421052, "grad_norm": 0.7026097774505615, "learning_rate": 0.000168, "loss": 1.5598, "step": 336 }, { "epoch": 0.23578947368421052, "eval_loss": 1.6214672327041626, "eval_runtime": 65.8233, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 336 }, { "epoch": 0.23649122807017545, "grad_norm": 0.6368547677993774, "learning_rate": 0.0001685, "loss": 1.6086, "step": 337 }, { "epoch": 0.23719298245614034, "grad_norm": 0.669689953327179, "learning_rate": 0.00016900000000000002, "loss": 1.6706, "step": 338 }, { "epoch": 0.23789473684210527, "grad_norm": 0.6439468264579773, "learning_rate": 0.00016950000000000003, "loss": 1.6649, "step": 339 }, { "epoch": 0.23859649122807017, "grad_norm": 0.747278094291687, "learning_rate": 0.00017, "loss": 1.6027, "step": 340 }, { "epoch": 0.2392982456140351, "grad_norm": 0.7407952547073364, "learning_rate": 0.00017050000000000002, "loss": 1.7544, "step": 341 }, { "epoch": 0.24, "grad_norm": 0.6871046423912048, "learning_rate": 0.000171, "loss": 1.5568, "step": 342 }, { "epoch": 0.24070175438596492, "grad_norm": 0.6485664248466492, "learning_rate": 0.00017150000000000002, "loss": 1.5902, "step": 343 }, { "epoch": 0.24140350877192981, "grad_norm": 0.8657354712486267, "learning_rate": 0.000172, "loss": 1.6997, "step": 344 }, { "epoch": 0.24210526315789474, "grad_norm": 0.7630641460418701, "learning_rate": 0.00017250000000000002, "loss": 1.6361, "step": 345 }, { "epoch": 0.24280701754385964, "grad_norm": 0.7222327589988708, "learning_rate": 0.000173, "loss": 1.7189, "step": 346 }, { "epoch": 0.24350877192982456, "grad_norm": 0.658898651599884, "learning_rate": 0.00017350000000000002, "loss": 1.6103, "step": 347 }, { "epoch": 0.24421052631578946, "grad_norm": 0.6879005432128906, "learning_rate": 0.000174, "loss": 1.6053, "step": 348 }, { "epoch": 0.2449122807017544, "grad_norm": 0.5957930684089661, "learning_rate": 0.0001745, "loss": 1.5952, "step": 349 }, { "epoch": 0.24561403508771928, "grad_norm": 0.5976981520652771, "learning_rate": 0.000175, "loss": 1.5702, "step": 350 }, { "epoch": 0.2463157894736842, "grad_norm": 0.6382569670677185, "learning_rate": 0.0001755, "loss": 1.5307, "step": 351 }, { "epoch": 0.24701754385964914, "grad_norm": 0.7132206559181213, "learning_rate": 0.00017600000000000002, "loss": 1.8067, "step": 352 }, { "epoch": 0.24701754385964914, "eval_loss": 1.6171884536743164, "eval_runtime": 65.8327, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 352 }, { "epoch": 0.24771929824561403, "grad_norm": 0.7135735750198364, "learning_rate": 0.0001765, "loss": 1.7202, "step": 353 }, { "epoch": 0.24842105263157896, "grad_norm": 0.6818898320198059, "learning_rate": 0.00017700000000000002, "loss": 1.6492, "step": 354 }, { "epoch": 0.24912280701754386, "grad_norm": 0.6693221926689148, "learning_rate": 0.0001775, "loss": 1.7142, "step": 355 }, { "epoch": 0.24982456140350878, "grad_norm": 0.6292011141777039, "learning_rate": 0.00017800000000000002, "loss": 1.5683, "step": 356 }, { "epoch": 0.2505263157894737, "grad_norm": 0.6244410276412964, "learning_rate": 0.0001785, "loss": 1.6022, "step": 357 }, { "epoch": 0.2512280701754386, "grad_norm": 0.7546259164810181, "learning_rate": 0.00017900000000000001, "loss": 1.6474, "step": 358 }, { "epoch": 0.2519298245614035, "grad_norm": 0.701273500919342, "learning_rate": 0.0001795, "loss": 1.5953, "step": 359 }, { "epoch": 0.25263157894736843, "grad_norm": 0.7112113833427429, "learning_rate": 0.00018, "loss": 1.635, "step": 360 }, { "epoch": 0.25333333333333335, "grad_norm": 0.7347482442855835, "learning_rate": 0.0001805, "loss": 1.8446, "step": 361 }, { "epoch": 0.2540350877192982, "grad_norm": 0.7154216170310974, "learning_rate": 0.000181, "loss": 1.5664, "step": 362 }, { "epoch": 0.25473684210526315, "grad_norm": 0.6344476342201233, "learning_rate": 0.0001815, "loss": 1.593, "step": 363 }, { "epoch": 0.2554385964912281, "grad_norm": 0.6583028435707092, "learning_rate": 0.000182, "loss": 1.5444, "step": 364 }, { "epoch": 0.256140350877193, "grad_norm": 0.7321555614471436, "learning_rate": 0.0001825, "loss": 1.7272, "step": 365 }, { "epoch": 0.25684210526315787, "grad_norm": 0.6510097980499268, "learning_rate": 0.000183, "loss": 1.594, "step": 366 }, { "epoch": 0.2575438596491228, "grad_norm": 0.6468524932861328, "learning_rate": 0.00018350000000000002, "loss": 1.6117, "step": 367 }, { "epoch": 0.2582456140350877, "grad_norm": 0.6648326516151428, "learning_rate": 0.00018400000000000003, "loss": 1.8381, "step": 368 }, { "epoch": 0.2582456140350877, "eval_loss": 1.6029711961746216, "eval_runtime": 65.8272, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 368 }, { "epoch": 0.25894736842105265, "grad_norm": 0.6455146670341492, "learning_rate": 0.0001845, "loss": 1.5622, "step": 369 }, { "epoch": 0.2596491228070175, "grad_norm": 0.6519348621368408, "learning_rate": 0.00018500000000000002, "loss": 1.614, "step": 370 }, { "epoch": 0.26035087719298244, "grad_norm": 0.6873278021812439, "learning_rate": 0.0001855, "loss": 1.8078, "step": 371 }, { "epoch": 0.26105263157894737, "grad_norm": 0.6784276962280273, "learning_rate": 0.00018600000000000002, "loss": 1.6373, "step": 372 }, { "epoch": 0.2617543859649123, "grad_norm": 0.7679441571235657, "learning_rate": 0.0001865, "loss": 1.6906, "step": 373 }, { "epoch": 0.2624561403508772, "grad_norm": 0.65203458070755, "learning_rate": 0.00018700000000000002, "loss": 1.5833, "step": 374 }, { "epoch": 0.2631578947368421, "grad_norm": 0.6820208430290222, "learning_rate": 0.0001875, "loss": 1.6331, "step": 375 }, { "epoch": 0.263859649122807, "grad_norm": 0.75938481092453, "learning_rate": 0.000188, "loss": 1.7521, "step": 376 }, { "epoch": 0.26456140350877194, "grad_norm": 0.6937898993492126, "learning_rate": 0.0001885, "loss": 1.7052, "step": 377 }, { "epoch": 0.26526315789473687, "grad_norm": 0.6280395984649658, "learning_rate": 0.00018899999999999999, "loss": 1.5356, "step": 378 }, { "epoch": 0.26596491228070174, "grad_norm": 0.731451153755188, "learning_rate": 0.0001895, "loss": 1.8005, "step": 379 }, { "epoch": 0.26666666666666666, "grad_norm": 0.6889495253562927, "learning_rate": 0.00019, "loss": 1.6858, "step": 380 }, { "epoch": 0.2673684210526316, "grad_norm": 0.677925169467926, "learning_rate": 0.00019050000000000002, "loss": 1.7834, "step": 381 }, { "epoch": 0.2680701754385965, "grad_norm": 0.7197091579437256, "learning_rate": 0.000191, "loss": 1.6578, "step": 382 }, { "epoch": 0.2687719298245614, "grad_norm": 0.6192888617515564, "learning_rate": 0.00019150000000000002, "loss": 1.6418, "step": 383 }, { "epoch": 0.2694736842105263, "grad_norm": 0.611748456954956, "learning_rate": 0.000192, "loss": 1.4265, "step": 384 }, { "epoch": 0.2694736842105263, "eval_loss": 1.599482774734497, "eval_runtime": 65.8269, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 384 }, { "epoch": 0.27017543859649124, "grad_norm": 0.6387831568717957, "learning_rate": 0.00019250000000000002, "loss": 1.6111, "step": 385 }, { "epoch": 0.27087719298245616, "grad_norm": 0.6571049094200134, "learning_rate": 0.000193, "loss": 1.6471, "step": 386 }, { "epoch": 0.27157894736842103, "grad_norm": 0.6474180817604065, "learning_rate": 0.00019350000000000001, "loss": 1.5387, "step": 387 }, { "epoch": 0.27228070175438596, "grad_norm": 0.7216473817825317, "learning_rate": 0.000194, "loss": 1.648, "step": 388 }, { "epoch": 0.2729824561403509, "grad_norm": 0.7126711010932922, "learning_rate": 0.0001945, "loss": 1.6354, "step": 389 }, { "epoch": 0.2736842105263158, "grad_norm": 0.651657223701477, "learning_rate": 0.000195, "loss": 1.5532, "step": 390 }, { "epoch": 0.2743859649122807, "grad_norm": 0.7893962860107422, "learning_rate": 0.0001955, "loss": 1.8075, "step": 391 }, { "epoch": 0.2750877192982456, "grad_norm": 0.7379441857337952, "learning_rate": 0.000196, "loss": 1.5418, "step": 392 }, { "epoch": 0.27578947368421053, "grad_norm": 0.7075092792510986, "learning_rate": 0.0001965, "loss": 1.4814, "step": 393 }, { "epoch": 0.27649122807017545, "grad_norm": 0.6572582721710205, "learning_rate": 0.00019700000000000002, "loss": 1.6321, "step": 394 }, { "epoch": 0.2771929824561403, "grad_norm": 0.728898286819458, "learning_rate": 0.00019750000000000003, "loss": 1.6153, "step": 395 }, { "epoch": 0.27789473684210525, "grad_norm": 0.6726265549659729, "learning_rate": 0.00019800000000000002, "loss": 1.6417, "step": 396 }, { "epoch": 0.2785964912280702, "grad_norm": 0.6529223918914795, "learning_rate": 0.00019850000000000003, "loss": 1.6572, "step": 397 }, { "epoch": 0.2792982456140351, "grad_norm": 1.0003806352615356, "learning_rate": 0.000199, "loss": 1.6589, "step": 398 }, { "epoch": 0.28, "grad_norm": 0.7141063213348389, "learning_rate": 0.00019950000000000002, "loss": 1.6506, "step": 399 }, { "epoch": 0.2807017543859649, "grad_norm": 0.6953572630882263, "learning_rate": 0.0002, "loss": 1.7487, "step": 400 }, { "epoch": 0.2807017543859649, "eval_loss": 1.5923221111297607, "eval_runtime": 65.8229, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 400 }, { "epoch": 0.2814035087719298, "grad_norm": 0.666172206401825, "learning_rate": 0.0001999347896967721, "loss": 1.7235, "step": 401 }, { "epoch": 0.28210526315789475, "grad_norm": 0.6359288096427917, "learning_rate": 0.0001998695793935442, "loss": 1.6144, "step": 402 }, { "epoch": 0.2828070175438597, "grad_norm": 0.8393360376358032, "learning_rate": 0.00019980436909031627, "loss": 1.7485, "step": 403 }, { "epoch": 0.28350877192982454, "grad_norm": 0.6780723929405212, "learning_rate": 0.00019973915878708837, "loss": 1.6818, "step": 404 }, { "epoch": 0.28421052631578947, "grad_norm": 0.6323883533477783, "learning_rate": 0.00019967394848386045, "loss": 1.5516, "step": 405 }, { "epoch": 0.2849122807017544, "grad_norm": 0.671244740486145, "learning_rate": 0.00019960873818063255, "loss": 1.646, "step": 406 }, { "epoch": 0.2856140350877193, "grad_norm": 0.6661714315414429, "learning_rate": 0.00019954352787740465, "loss": 1.5835, "step": 407 }, { "epoch": 0.2863157894736842, "grad_norm": 0.6124412417411804, "learning_rate": 0.00019947831757417673, "loss": 1.5328, "step": 408 }, { "epoch": 0.2870175438596491, "grad_norm": 0.6900444626808167, "learning_rate": 0.0001994131072709488, "loss": 1.7196, "step": 409 }, { "epoch": 0.28771929824561404, "grad_norm": 0.6249657273292542, "learning_rate": 0.0001993478969677209, "loss": 1.5233, "step": 410 }, { "epoch": 0.28842105263157897, "grad_norm": 0.9119845032691956, "learning_rate": 0.000199282686664493, "loss": 1.7429, "step": 411 }, { "epoch": 0.28912280701754384, "grad_norm": 0.7250961661338806, "learning_rate": 0.0001992174763612651, "loss": 1.6377, "step": 412 }, { "epoch": 0.28982456140350876, "grad_norm": 0.6883248090744019, "learning_rate": 0.0001991522660580372, "loss": 1.8828, "step": 413 }, { "epoch": 0.2905263157894737, "grad_norm": 0.6609102487564087, "learning_rate": 0.00019908705575480927, "loss": 1.6413, "step": 414 }, { "epoch": 0.2912280701754386, "grad_norm": 0.6576290130615234, "learning_rate": 0.00019902184545158135, "loss": 1.4833, "step": 415 }, { "epoch": 0.2919298245614035, "grad_norm": 0.7599754333496094, "learning_rate": 0.00019895663514835345, "loss": 1.5952, "step": 416 }, { "epoch": 0.2919298245614035, "eval_loss": 1.5908491611480713, "eval_runtime": 65.8171, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 416 }, { "epoch": 0.2926315789473684, "grad_norm": 0.6669303774833679, "learning_rate": 0.00019889142484512553, "loss": 1.5786, "step": 417 }, { "epoch": 0.29333333333333333, "grad_norm": 0.862571120262146, "learning_rate": 0.00019882621454189763, "loss": 1.766, "step": 418 }, { "epoch": 0.29403508771929826, "grad_norm": 0.673698902130127, "learning_rate": 0.00019876100423866973, "loss": 1.5205, "step": 419 }, { "epoch": 0.29473684210526313, "grad_norm": 0.6783264875411987, "learning_rate": 0.0001986957939354418, "loss": 1.6954, "step": 420 }, { "epoch": 0.29543859649122806, "grad_norm": 0.6730865836143494, "learning_rate": 0.0001986305836322139, "loss": 1.4579, "step": 421 }, { "epoch": 0.296140350877193, "grad_norm": 0.7017145156860352, "learning_rate": 0.000198565373328986, "loss": 1.7478, "step": 422 }, { "epoch": 0.2968421052631579, "grad_norm": 0.6237165927886963, "learning_rate": 0.0001985001630257581, "loss": 1.516, "step": 423 }, { "epoch": 0.29754385964912283, "grad_norm": 1.1509321928024292, "learning_rate": 0.00019843495272253017, "loss": 2.0267, "step": 424 }, { "epoch": 0.2982456140350877, "grad_norm": 0.700303852558136, "learning_rate": 0.00019836974241930228, "loss": 1.6524, "step": 425 }, { "epoch": 0.29894736842105263, "grad_norm": 0.6540369987487793, "learning_rate": 0.00019830453211607435, "loss": 1.4797, "step": 426 }, { "epoch": 0.29964912280701755, "grad_norm": 0.6748155355453491, "learning_rate": 0.00019823932181284643, "loss": 1.639, "step": 427 }, { "epoch": 0.3003508771929825, "grad_norm": 0.5861191749572754, "learning_rate": 0.00019817411150961853, "loss": 1.571, "step": 428 }, { "epoch": 0.30105263157894735, "grad_norm": 0.7370057702064514, "learning_rate": 0.00019810890120639064, "loss": 1.7409, "step": 429 }, { "epoch": 0.3017543859649123, "grad_norm": 0.7410442233085632, "learning_rate": 0.0001980436909031627, "loss": 1.7762, "step": 430 }, { "epoch": 0.3024561403508772, "grad_norm": 0.8091199994087219, "learning_rate": 0.0001979784805999348, "loss": 1.7323, "step": 431 }, { "epoch": 0.3031578947368421, "grad_norm": 0.7337295413017273, "learning_rate": 0.0001979132702967069, "loss": 1.6508, "step": 432 }, { "epoch": 0.3031578947368421, "eval_loss": 1.5880753993988037, "eval_runtime": 65.813, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 432 }, { "epoch": 0.303859649122807, "grad_norm": 0.6953486800193787, "learning_rate": 0.00019784805999347897, "loss": 1.5655, "step": 433 }, { "epoch": 0.3045614035087719, "grad_norm": 0.6846860647201538, "learning_rate": 0.00019778284969025107, "loss": 1.6046, "step": 434 }, { "epoch": 0.30526315789473685, "grad_norm": 0.7263966202735901, "learning_rate": 0.00019771763938702318, "loss": 1.7278, "step": 435 }, { "epoch": 0.3059649122807018, "grad_norm": 0.6760476231575012, "learning_rate": 0.00019765242908379525, "loss": 1.5458, "step": 436 }, { "epoch": 0.30666666666666664, "grad_norm": 0.6766071915626526, "learning_rate": 0.00019758721878056733, "loss": 1.4978, "step": 437 }, { "epoch": 0.30736842105263157, "grad_norm": 0.7111241221427917, "learning_rate": 0.00019752200847733943, "loss": 1.7675, "step": 438 }, { "epoch": 0.3080701754385965, "grad_norm": 0.6637362241744995, "learning_rate": 0.0001974567981741115, "loss": 1.6153, "step": 439 }, { "epoch": 0.3087719298245614, "grad_norm": 0.8304871916770935, "learning_rate": 0.0001973915878708836, "loss": 1.7427, "step": 440 }, { "epoch": 0.3094736842105263, "grad_norm": 0.6430342197418213, "learning_rate": 0.00019732637756765572, "loss": 1.6553, "step": 441 }, { "epoch": 0.3101754385964912, "grad_norm": 0.6601508855819702, "learning_rate": 0.0001972611672644278, "loss": 1.6706, "step": 442 }, { "epoch": 0.31087719298245614, "grad_norm": 0.6771219372749329, "learning_rate": 0.00019719595696119987, "loss": 1.7072, "step": 443 }, { "epoch": 0.31157894736842107, "grad_norm": 0.6689156889915466, "learning_rate": 0.00019713074665797197, "loss": 1.4808, "step": 444 }, { "epoch": 0.312280701754386, "grad_norm": 0.6653517484664917, "learning_rate": 0.00019706553635474405, "loss": 1.67, "step": 445 }, { "epoch": 0.31298245614035086, "grad_norm": 0.661916971206665, "learning_rate": 0.00019700032605151615, "loss": 1.5439, "step": 446 }, { "epoch": 0.3136842105263158, "grad_norm": 0.6444401144981384, "learning_rate": 0.00019693511574828826, "loss": 1.54, "step": 447 }, { "epoch": 0.3143859649122807, "grad_norm": 0.6796846389770508, "learning_rate": 0.00019686990544506033, "loss": 1.5673, "step": 448 }, { "epoch": 0.3143859649122807, "eval_loss": 1.5777168273925781, "eval_runtime": 65.8181, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 448 }, { "epoch": 0.31508771929824564, "grad_norm": 0.6782138347625732, "learning_rate": 0.0001968046951418324, "loss": 1.6511, "step": 449 }, { "epoch": 0.3157894736842105, "grad_norm": 0.6360114216804504, "learning_rate": 0.00019673948483860451, "loss": 1.5904, "step": 450 }, { "epoch": 0.31649122807017543, "grad_norm": 0.697213888168335, "learning_rate": 0.0001966742745353766, "loss": 1.5082, "step": 451 }, { "epoch": 0.31719298245614036, "grad_norm": 0.597529947757721, "learning_rate": 0.0001966090642321487, "loss": 1.4533, "step": 452 }, { "epoch": 0.3178947368421053, "grad_norm": 0.6409364938735962, "learning_rate": 0.0001965438539289208, "loss": 1.6345, "step": 453 }, { "epoch": 0.31859649122807016, "grad_norm": 0.6285794973373413, "learning_rate": 0.00019647864362569285, "loss": 1.4713, "step": 454 }, { "epoch": 0.3192982456140351, "grad_norm": 0.6725212931632996, "learning_rate": 0.00019641343332246495, "loss": 1.6356, "step": 455 }, { "epoch": 0.32, "grad_norm": 0.6402524709701538, "learning_rate": 0.00019634822301923705, "loss": 1.612, "step": 456 }, { "epoch": 0.32070175438596493, "grad_norm": 0.6168628334999084, "learning_rate": 0.00019628301271600913, "loss": 1.6238, "step": 457 }, { "epoch": 0.3214035087719298, "grad_norm": 0.6333017945289612, "learning_rate": 0.00019621780241278123, "loss": 1.5203, "step": 458 }, { "epoch": 0.32210526315789473, "grad_norm": 0.7710492610931396, "learning_rate": 0.0001961525921095533, "loss": 1.8244, "step": 459 }, { "epoch": 0.32280701754385965, "grad_norm": 0.6140996813774109, "learning_rate": 0.0001960873818063254, "loss": 1.5385, "step": 460 }, { "epoch": 0.3235087719298246, "grad_norm": 0.6964908838272095, "learning_rate": 0.0001960221715030975, "loss": 1.7616, "step": 461 }, { "epoch": 0.32421052631578945, "grad_norm": 0.6219298839569092, "learning_rate": 0.0001959569611998696, "loss": 1.5711, "step": 462 }, { "epoch": 0.3249122807017544, "grad_norm": 0.6307212114334106, "learning_rate": 0.00019589175089664167, "loss": 1.5841, "step": 463 }, { "epoch": 0.3256140350877193, "grad_norm": 0.666330099105835, "learning_rate": 0.00019582654059341378, "loss": 1.5402, "step": 464 }, { "epoch": 0.3256140350877193, "eval_loss": 1.5765098333358765, "eval_runtime": 65.8198, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 464 }, { "epoch": 0.3263157894736842, "grad_norm": 0.6581262946128845, "learning_rate": 0.00019576133029018585, "loss": 1.6774, "step": 465 }, { "epoch": 0.3270175438596491, "grad_norm": 0.6524697542190552, "learning_rate": 0.00019569611998695796, "loss": 1.5371, "step": 466 }, { "epoch": 0.327719298245614, "grad_norm": 0.6187402009963989, "learning_rate": 0.00019563090968373003, "loss": 1.6082, "step": 467 }, { "epoch": 0.32842105263157895, "grad_norm": 0.5938166975975037, "learning_rate": 0.00019556569938050214, "loss": 1.5066, "step": 468 }, { "epoch": 0.3291228070175439, "grad_norm": 0.6310122013092041, "learning_rate": 0.00019550048907727424, "loss": 1.5888, "step": 469 }, { "epoch": 0.3298245614035088, "grad_norm": 0.5949949026107788, "learning_rate": 0.00019543527877404632, "loss": 1.661, "step": 470 }, { "epoch": 0.33052631578947367, "grad_norm": 0.6215965151786804, "learning_rate": 0.0001953700684708184, "loss": 1.584, "step": 471 }, { "epoch": 0.3312280701754386, "grad_norm": 0.6249874234199524, "learning_rate": 0.0001953048581675905, "loss": 1.6597, "step": 472 }, { "epoch": 0.3319298245614035, "grad_norm": 0.6643415689468384, "learning_rate": 0.00019523964786436257, "loss": 1.6478, "step": 473 }, { "epoch": 0.33263157894736844, "grad_norm": 0.726338267326355, "learning_rate": 0.00019517443756113468, "loss": 1.7329, "step": 474 }, { "epoch": 0.3333333333333333, "grad_norm": 0.6735034584999084, "learning_rate": 0.00019510922725790678, "loss": 1.6233, "step": 475 }, { "epoch": 0.33403508771929824, "grad_norm": 0.7442544102668762, "learning_rate": 0.00019504401695467886, "loss": 1.6331, "step": 476 }, { "epoch": 0.33473684210526317, "grad_norm": 0.5987380743026733, "learning_rate": 0.00019497880665145093, "loss": 1.5729, "step": 477 }, { "epoch": 0.3354385964912281, "grad_norm": 0.6671352386474609, "learning_rate": 0.00019491359634822304, "loss": 1.5155, "step": 478 }, { "epoch": 0.33614035087719296, "grad_norm": 0.6072157025337219, "learning_rate": 0.0001948483860449951, "loss": 1.5184, "step": 479 }, { "epoch": 0.3368421052631579, "grad_norm": 0.7700802683830261, "learning_rate": 0.00019478317574176722, "loss": 1.517, "step": 480 }, { "epoch": 0.3368421052631579, "eval_loss": 1.5660690069198608, "eval_runtime": 65.8213, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 480 }, { "epoch": 0.3375438596491228, "grad_norm": 0.5709801316261292, "learning_rate": 0.00019471796543853932, "loss": 1.5906, "step": 481 }, { "epoch": 0.33824561403508774, "grad_norm": 0.6920676827430725, "learning_rate": 0.00019465275513531137, "loss": 1.6332, "step": 482 }, { "epoch": 0.3389473684210526, "grad_norm": 0.7384718656539917, "learning_rate": 0.00019458754483208347, "loss": 1.7847, "step": 483 }, { "epoch": 0.33964912280701753, "grad_norm": 0.6829721331596375, "learning_rate": 0.00019452233452885558, "loss": 1.5141, "step": 484 }, { "epoch": 0.34035087719298246, "grad_norm": 0.6028706431388855, "learning_rate": 0.00019445712422562765, "loss": 1.4237, "step": 485 }, { "epoch": 0.3410526315789474, "grad_norm": 0.7215585708618164, "learning_rate": 0.00019439191392239976, "loss": 1.6725, "step": 486 }, { "epoch": 0.34175438596491226, "grad_norm": 0.6875351667404175, "learning_rate": 0.00019432670361917183, "loss": 1.5586, "step": 487 }, { "epoch": 0.3424561403508772, "grad_norm": 0.6807013154029846, "learning_rate": 0.0001942614933159439, "loss": 1.7029, "step": 488 }, { "epoch": 0.3431578947368421, "grad_norm": 0.690457820892334, "learning_rate": 0.00019419628301271601, "loss": 1.6097, "step": 489 }, { "epoch": 0.34385964912280703, "grad_norm": 0.6151444911956787, "learning_rate": 0.00019413107270948812, "loss": 1.4907, "step": 490 }, { "epoch": 0.34456140350877196, "grad_norm": 0.6475211381912231, "learning_rate": 0.0001940658624062602, "loss": 1.5384, "step": 491 }, { "epoch": 0.3452631578947368, "grad_norm": 0.8687899708747864, "learning_rate": 0.0001940006521030323, "loss": 1.6515, "step": 492 }, { "epoch": 0.34596491228070175, "grad_norm": 0.6889581680297852, "learning_rate": 0.00019393544179980437, "loss": 1.545, "step": 493 }, { "epoch": 0.3466666666666667, "grad_norm": 0.6693393588066101, "learning_rate": 0.00019387023149657645, "loss": 1.5921, "step": 494 }, { "epoch": 0.3473684210526316, "grad_norm": 0.6513168811798096, "learning_rate": 0.00019380502119334855, "loss": 1.6333, "step": 495 }, { "epoch": 0.3480701754385965, "grad_norm": 0.7428777813911438, "learning_rate": 0.00019373981089012066, "loss": 1.5885, "step": 496 }, { "epoch": 0.3480701754385965, "eval_loss": 1.5670621395111084, "eval_runtime": 65.8226, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 496 }, { "epoch": 0.3487719298245614, "grad_norm": 0.7555594444274902, "learning_rate": 0.00019367460058689273, "loss": 1.419, "step": 497 }, { "epoch": 0.3494736842105263, "grad_norm": 0.6310045123100281, "learning_rate": 0.00019360939028366484, "loss": 1.577, "step": 498 }, { "epoch": 0.35017543859649125, "grad_norm": 0.6770079135894775, "learning_rate": 0.00019354417998043691, "loss": 1.7458, "step": 499 }, { "epoch": 0.3508771929824561, "grad_norm": 0.6079325675964355, "learning_rate": 0.000193478969677209, "loss": 1.7585, "step": 500 }, { "epoch": 0.35157894736842105, "grad_norm": 0.6061223149299622, "learning_rate": 0.0001934137593739811, "loss": 1.5363, "step": 501 }, { "epoch": 0.35228070175438597, "grad_norm": 0.6173948049545288, "learning_rate": 0.0001933485490707532, "loss": 1.5801, "step": 502 }, { "epoch": 0.3529824561403509, "grad_norm": 0.6323658227920532, "learning_rate": 0.00019328333876752528, "loss": 1.6302, "step": 503 }, { "epoch": 0.35368421052631577, "grad_norm": 0.6596127152442932, "learning_rate": 0.00019321812846429738, "loss": 1.6922, "step": 504 }, { "epoch": 0.3543859649122807, "grad_norm": 0.6541072726249695, "learning_rate": 0.00019315291816106946, "loss": 1.851, "step": 505 }, { "epoch": 0.3550877192982456, "grad_norm": 0.6097167134284973, "learning_rate": 0.00019308770785784153, "loss": 1.6088, "step": 506 }, { "epoch": 0.35578947368421054, "grad_norm": 0.6567264795303345, "learning_rate": 0.00019302249755461364, "loss": 1.8079, "step": 507 }, { "epoch": 0.3564912280701754, "grad_norm": 0.7155901789665222, "learning_rate": 0.00019295728725138574, "loss": 1.6275, "step": 508 }, { "epoch": 0.35719298245614034, "grad_norm": 0.6436221599578857, "learning_rate": 0.00019289207694815782, "loss": 1.5421, "step": 509 }, { "epoch": 0.35789473684210527, "grad_norm": 0.6261144280433655, "learning_rate": 0.0001928268666449299, "loss": 1.522, "step": 510 }, { "epoch": 0.3585964912280702, "grad_norm": 0.6645616888999939, "learning_rate": 0.000192761656341702, "loss": 1.7127, "step": 511 }, { "epoch": 0.35929824561403506, "grad_norm": 0.6452895402908325, "learning_rate": 0.0001926964460384741, "loss": 1.5806, "step": 512 }, { "epoch": 0.35929824561403506, "eval_loss": 1.562566876411438, "eval_runtime": 65.83, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 512 }, { "epoch": 0.36, "grad_norm": 0.6003766655921936, "learning_rate": 0.00019263123573524618, "loss": 1.4519, "step": 513 }, { "epoch": 0.3607017543859649, "grad_norm": 0.6487770080566406, "learning_rate": 0.00019256602543201828, "loss": 1.4407, "step": 514 }, { "epoch": 0.36140350877192984, "grad_norm": 0.6501699686050415, "learning_rate": 0.00019250081512879036, "loss": 1.6538, "step": 515 }, { "epoch": 0.36210526315789476, "grad_norm": 0.5992392301559448, "learning_rate": 0.00019243560482556243, "loss": 1.4916, "step": 516 }, { "epoch": 0.36280701754385963, "grad_norm": 0.7445614337921143, "learning_rate": 0.00019237039452233454, "loss": 1.6994, "step": 517 }, { "epoch": 0.36350877192982456, "grad_norm": 0.6477187275886536, "learning_rate": 0.00019230518421910664, "loss": 1.5603, "step": 518 }, { "epoch": 0.3642105263157895, "grad_norm": 0.6093468070030212, "learning_rate": 0.00019223997391587872, "loss": 1.4722, "step": 519 }, { "epoch": 0.3649122807017544, "grad_norm": 0.5677714347839355, "learning_rate": 0.00019217476361265082, "loss": 1.3723, "step": 520 }, { "epoch": 0.3656140350877193, "grad_norm": 0.5741240382194519, "learning_rate": 0.0001921095533094229, "loss": 1.476, "step": 521 }, { "epoch": 0.3663157894736842, "grad_norm": 0.6821523308753967, "learning_rate": 0.00019204434300619497, "loss": 1.7527, "step": 522 }, { "epoch": 0.36701754385964913, "grad_norm": 0.6139315962791443, "learning_rate": 0.00019197913270296708, "loss": 1.6129, "step": 523 }, { "epoch": 0.36771929824561406, "grad_norm": 0.6433385610580444, "learning_rate": 0.00019191392239973918, "loss": 1.629, "step": 524 }, { "epoch": 0.3684210526315789, "grad_norm": 0.5924102067947388, "learning_rate": 0.00019184871209651126, "loss": 1.4027, "step": 525 }, { "epoch": 0.36912280701754385, "grad_norm": 0.6233017444610596, "learning_rate": 0.00019178350179328336, "loss": 1.4539, "step": 526 }, { "epoch": 0.3698245614035088, "grad_norm": 0.6184253692626953, "learning_rate": 0.00019171829149005544, "loss": 1.4777, "step": 527 }, { "epoch": 0.3705263157894737, "grad_norm": 0.6504309177398682, "learning_rate": 0.00019165308118682751, "loss": 1.6766, "step": 528 }, { "epoch": 0.3705263157894737, "eval_loss": 1.558809757232666, "eval_runtime": 65.8215, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 528 }, { "epoch": 0.3712280701754386, "grad_norm": 0.664543092250824, "learning_rate": 0.00019158787088359962, "loss": 1.5572, "step": 529 }, { "epoch": 0.3719298245614035, "grad_norm": 0.60514897108078, "learning_rate": 0.00019152266058037172, "loss": 1.4646, "step": 530 }, { "epoch": 0.3726315789473684, "grad_norm": 0.7878003716468811, "learning_rate": 0.0001914574502771438, "loss": 1.6571, "step": 531 }, { "epoch": 0.37333333333333335, "grad_norm": 0.6283036470413208, "learning_rate": 0.0001913922399739159, "loss": 1.7073, "step": 532 }, { "epoch": 0.3740350877192982, "grad_norm": 0.6045493483543396, "learning_rate": 0.00019132702967068798, "loss": 1.5617, "step": 533 }, { "epoch": 0.37473684210526315, "grad_norm": 0.6325587034225464, "learning_rate": 0.00019126181936746005, "loss": 1.5482, "step": 534 }, { "epoch": 0.37543859649122807, "grad_norm": 0.6224280595779419, "learning_rate": 0.00019119660906423216, "loss": 1.7127, "step": 535 }, { "epoch": 0.376140350877193, "grad_norm": 0.600073516368866, "learning_rate": 0.00019113139876100426, "loss": 1.5568, "step": 536 }, { "epoch": 0.37684210526315787, "grad_norm": 0.7658207416534424, "learning_rate": 0.00019106618845777634, "loss": 1.7231, "step": 537 }, { "epoch": 0.3775438596491228, "grad_norm": 0.6435375213623047, "learning_rate": 0.00019100097815454841, "loss": 1.5649, "step": 538 }, { "epoch": 0.3782456140350877, "grad_norm": 0.6197004318237305, "learning_rate": 0.00019093576785132052, "loss": 1.4954, "step": 539 }, { "epoch": 0.37894736842105264, "grad_norm": 0.5986965298652649, "learning_rate": 0.0001908705575480926, "loss": 1.5437, "step": 540 }, { "epoch": 0.37964912280701757, "grad_norm": 0.6383313536643982, "learning_rate": 0.0001908053472448647, "loss": 1.6166, "step": 541 }, { "epoch": 0.38035087719298244, "grad_norm": 0.6442448496818542, "learning_rate": 0.0001907401369416368, "loss": 1.4989, "step": 542 }, { "epoch": 0.38105263157894737, "grad_norm": 0.6369329690933228, "learning_rate": 0.00019067492663840888, "loss": 1.6539, "step": 543 }, { "epoch": 0.3817543859649123, "grad_norm": 0.6250851154327393, "learning_rate": 0.00019060971633518096, "loss": 1.5615, "step": 544 }, { "epoch": 0.3817543859649123, "eval_loss": 1.5545685291290283, "eval_runtime": 65.8186, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 544 }, { "epoch": 0.3824561403508772, "grad_norm": 0.6362627148628235, "learning_rate": 0.00019054450603195306, "loss": 1.4881, "step": 545 }, { "epoch": 0.3831578947368421, "grad_norm": 0.6226513981819153, "learning_rate": 0.00019047929572872514, "loss": 1.5369, "step": 546 }, { "epoch": 0.383859649122807, "grad_norm": 0.6183695197105408, "learning_rate": 0.00019041408542549724, "loss": 1.7097, "step": 547 }, { "epoch": 0.38456140350877194, "grad_norm": 0.6272971630096436, "learning_rate": 0.00019034887512226934, "loss": 1.487, "step": 548 }, { "epoch": 0.38526315789473686, "grad_norm": 0.6172906756401062, "learning_rate": 0.00019028366481904142, "loss": 1.5378, "step": 549 }, { "epoch": 0.38596491228070173, "grad_norm": 0.6093366742134094, "learning_rate": 0.0001902184545158135, "loss": 1.4842, "step": 550 }, { "epoch": 0.38666666666666666, "grad_norm": 0.6335508227348328, "learning_rate": 0.0001901532442125856, "loss": 1.6128, "step": 551 }, { "epoch": 0.3873684210526316, "grad_norm": 0.6018494963645935, "learning_rate": 0.00019008803390935768, "loss": 1.5839, "step": 552 }, { "epoch": 0.3880701754385965, "grad_norm": 0.5925776362419128, "learning_rate": 0.00019002282360612978, "loss": 1.5709, "step": 553 }, { "epoch": 0.3887719298245614, "grad_norm": 0.6472690105438232, "learning_rate": 0.00018995761330290188, "loss": 1.6439, "step": 554 }, { "epoch": 0.3894736842105263, "grad_norm": 0.6460500955581665, "learning_rate": 0.00018989240299967396, "loss": 1.6058, "step": 555 }, { "epoch": 0.39017543859649123, "grad_norm": 0.6104738116264343, "learning_rate": 0.00018982719269644604, "loss": 1.551, "step": 556 }, { "epoch": 0.39087719298245616, "grad_norm": 0.67680823802948, "learning_rate": 0.00018976198239321814, "loss": 1.5126, "step": 557 }, { "epoch": 0.391578947368421, "grad_norm": 0.7177671194076538, "learning_rate": 0.00018969677208999024, "loss": 1.5908, "step": 558 }, { "epoch": 0.39228070175438595, "grad_norm": 0.6100648045539856, "learning_rate": 0.00018963156178676232, "loss": 1.523, "step": 559 }, { "epoch": 0.3929824561403509, "grad_norm": 0.6486205458641052, "learning_rate": 0.0001895663514835344, "loss": 1.5571, "step": 560 }, { "epoch": 0.3929824561403509, "eval_loss": 1.5493638515472412, "eval_runtime": 65.8193, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 560 }, { "epoch": 0.3936842105263158, "grad_norm": 0.6289941668510437, "learning_rate": 0.0001895011411803065, "loss": 1.8942, "step": 561 }, { "epoch": 0.39438596491228073, "grad_norm": 0.6127089262008667, "learning_rate": 0.00018943593087707858, "loss": 1.6828, "step": 562 }, { "epoch": 0.3950877192982456, "grad_norm": 0.5892358422279358, "learning_rate": 0.00018937072057385068, "loss": 1.5594, "step": 563 }, { "epoch": 0.3957894736842105, "grad_norm": 0.6225244402885437, "learning_rate": 0.00018930551027062278, "loss": 1.5743, "step": 564 }, { "epoch": 0.39649122807017545, "grad_norm": 0.6709104180335999, "learning_rate": 0.00018924029996739486, "loss": 1.7466, "step": 565 }, { "epoch": 0.3971929824561404, "grad_norm": 0.5733152627944946, "learning_rate": 0.00018917508966416694, "loss": 1.5174, "step": 566 }, { "epoch": 0.39789473684210525, "grad_norm": 0.6619423031806946, "learning_rate": 0.00018910987936093904, "loss": 1.6237, "step": 567 }, { "epoch": 0.39859649122807017, "grad_norm": 0.6386052370071411, "learning_rate": 0.00018904466905771112, "loss": 1.6313, "step": 568 }, { "epoch": 0.3992982456140351, "grad_norm": 0.5983960628509521, "learning_rate": 0.00018897945875448322, "loss": 1.6067, "step": 569 }, { "epoch": 0.4, "grad_norm": 0.6718977093696594, "learning_rate": 0.00018891424845125532, "loss": 1.6198, "step": 570 }, { "epoch": 0.4007017543859649, "grad_norm": 0.6380419731140137, "learning_rate": 0.0001888490381480274, "loss": 1.6623, "step": 571 }, { "epoch": 0.4014035087719298, "grad_norm": 0.7434234619140625, "learning_rate": 0.00018878382784479948, "loss": 1.6051, "step": 572 }, { "epoch": 0.40210526315789474, "grad_norm": 1.0555202960968018, "learning_rate": 0.00018871861754157158, "loss": 1.5526, "step": 573 }, { "epoch": 0.40280701754385967, "grad_norm": 0.6471709609031677, "learning_rate": 0.00018865340723834366, "loss": 1.7675, "step": 574 }, { "epoch": 0.40350877192982454, "grad_norm": 0.6232377886772156, "learning_rate": 0.00018858819693511576, "loss": 1.6476, "step": 575 }, { "epoch": 0.40421052631578946, "grad_norm": 0.5847394466400146, "learning_rate": 0.00018852298663188787, "loss": 1.557, "step": 576 }, { "epoch": 0.40421052631578946, "eval_loss": 1.551783561706543, "eval_runtime": 65.822, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 576 }, { "epoch": 0.4049122807017544, "grad_norm": 0.5818471908569336, "learning_rate": 0.00018845777632865994, "loss": 1.3809, "step": 577 }, { "epoch": 0.4056140350877193, "grad_norm": 0.6755268573760986, "learning_rate": 0.00018839256602543202, "loss": 1.5893, "step": 578 }, { "epoch": 0.4063157894736842, "grad_norm": 0.6453381180763245, "learning_rate": 0.00018832735572220412, "loss": 1.6204, "step": 579 }, { "epoch": 0.4070175438596491, "grad_norm": 0.7065298557281494, "learning_rate": 0.0001882621454189762, "loss": 1.537, "step": 580 }, { "epoch": 0.40771929824561404, "grad_norm": 0.6140729784965515, "learning_rate": 0.0001881969351157483, "loss": 1.563, "step": 581 }, { "epoch": 0.40842105263157896, "grad_norm": 0.5719225406646729, "learning_rate": 0.0001881317248125204, "loss": 1.4882, "step": 582 }, { "epoch": 0.40912280701754383, "grad_norm": 0.5734068155288696, "learning_rate": 0.00018806651450929246, "loss": 1.5229, "step": 583 }, { "epoch": 0.40982456140350876, "grad_norm": 0.6970155239105225, "learning_rate": 0.00018800130420606456, "loss": 1.683, "step": 584 }, { "epoch": 0.4105263157894737, "grad_norm": 0.5708716511726379, "learning_rate": 0.00018793609390283666, "loss": 1.5434, "step": 585 }, { "epoch": 0.4112280701754386, "grad_norm": 0.5651978254318237, "learning_rate": 0.00018787088359960874, "loss": 1.4525, "step": 586 }, { "epoch": 0.41192982456140353, "grad_norm": 0.5802776217460632, "learning_rate": 0.00018780567329638084, "loss": 1.5912, "step": 587 }, { "epoch": 0.4126315789473684, "grad_norm": 0.5708468556404114, "learning_rate": 0.00018774046299315292, "loss": 1.6611, "step": 588 }, { "epoch": 0.41333333333333333, "grad_norm": 0.5940225124359131, "learning_rate": 0.000187675252689925, "loss": 1.5761, "step": 589 }, { "epoch": 0.41403508771929826, "grad_norm": 0.6333124041557312, "learning_rate": 0.0001876100423866971, "loss": 1.8525, "step": 590 }, { "epoch": 0.4147368421052632, "grad_norm": 0.58876633644104, "learning_rate": 0.0001875448320834692, "loss": 1.6957, "step": 591 }, { "epoch": 0.41543859649122805, "grad_norm": 0.6868162155151367, "learning_rate": 0.00018747962178024128, "loss": 1.7813, "step": 592 }, { "epoch": 0.41543859649122805, "eval_loss": 1.5467450618743896, "eval_runtime": 65.8154, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 592 }, { "epoch": 0.416140350877193, "grad_norm": 0.6514833569526672, "learning_rate": 0.00018741441147701338, "loss": 1.5013, "step": 593 }, { "epoch": 0.4168421052631579, "grad_norm": 0.5811694860458374, "learning_rate": 0.00018734920117378546, "loss": 1.4355, "step": 594 }, { "epoch": 0.41754385964912283, "grad_norm": 0.6684769988059998, "learning_rate": 0.00018728399087055754, "loss": 1.6277, "step": 595 }, { "epoch": 0.4182456140350877, "grad_norm": 0.5779550671577454, "learning_rate": 0.00018721878056732964, "loss": 1.4807, "step": 596 }, { "epoch": 0.4189473684210526, "grad_norm": 0.6052579283714294, "learning_rate": 0.00018715357026410174, "loss": 1.6156, "step": 597 }, { "epoch": 0.41964912280701755, "grad_norm": 0.5815612077713013, "learning_rate": 0.00018708835996087385, "loss": 1.4894, "step": 598 }, { "epoch": 0.4203508771929825, "grad_norm": 0.7463383078575134, "learning_rate": 0.00018702314965764592, "loss": 1.7411, "step": 599 }, { "epoch": 0.42105263157894735, "grad_norm": 0.5905210375785828, "learning_rate": 0.000186957939354418, "loss": 1.4691, "step": 600 }, { "epoch": 0.42175438596491227, "grad_norm": 0.6143250465393066, "learning_rate": 0.0001868927290511901, "loss": 1.6379, "step": 601 }, { "epoch": 0.4224561403508772, "grad_norm": 0.639672577381134, "learning_rate": 0.00018682751874796218, "loss": 1.7274, "step": 602 }, { "epoch": 0.4231578947368421, "grad_norm": 0.5884619355201721, "learning_rate": 0.00018676230844473428, "loss": 1.6699, "step": 603 }, { "epoch": 0.423859649122807, "grad_norm": 0.5850658416748047, "learning_rate": 0.0001866970981415064, "loss": 1.472, "step": 604 }, { "epoch": 0.4245614035087719, "grad_norm": 0.5972256660461426, "learning_rate": 0.00018663188783827846, "loss": 1.5549, "step": 605 }, { "epoch": 0.42526315789473684, "grad_norm": 0.6392509341239929, "learning_rate": 0.00018656667753505054, "loss": 1.6254, "step": 606 }, { "epoch": 0.42596491228070177, "grad_norm": 0.5675361752510071, "learning_rate": 0.00018650146723182264, "loss": 1.4659, "step": 607 }, { "epoch": 0.4266666666666667, "grad_norm": 0.5884192585945129, "learning_rate": 0.00018643625692859472, "loss": 1.6733, "step": 608 }, { "epoch": 0.4266666666666667, "eval_loss": 1.5471911430358887, "eval_runtime": 65.8118, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 608 }, { "epoch": 0.42736842105263156, "grad_norm": 0.6033952832221985, "learning_rate": 0.00018637104662536682, "loss": 1.5142, "step": 609 }, { "epoch": 0.4280701754385965, "grad_norm": 0.5712110996246338, "learning_rate": 0.00018630583632213893, "loss": 1.4051, "step": 610 }, { "epoch": 0.4287719298245614, "grad_norm": 0.6735403537750244, "learning_rate": 0.00018624062601891098, "loss": 1.6533, "step": 611 }, { "epoch": 0.42947368421052634, "grad_norm": 0.6753754615783691, "learning_rate": 0.00018617541571568308, "loss": 1.6929, "step": 612 }, { "epoch": 0.4301754385964912, "grad_norm": 0.6321624517440796, "learning_rate": 0.00018611020541245518, "loss": 1.3529, "step": 613 }, { "epoch": 0.43087719298245614, "grad_norm": 0.6168909668922424, "learning_rate": 0.00018604499510922726, "loss": 1.5379, "step": 614 }, { "epoch": 0.43157894736842106, "grad_norm": 0.6143945455551147, "learning_rate": 0.00018597978480599937, "loss": 1.648, "step": 615 }, { "epoch": 0.432280701754386, "grad_norm": 0.5989311933517456, "learning_rate": 0.00018591457450277144, "loss": 1.6382, "step": 616 }, { "epoch": 0.43298245614035086, "grad_norm": 0.5536453127861023, "learning_rate": 0.00018584936419954352, "loss": 1.5205, "step": 617 }, { "epoch": 0.4336842105263158, "grad_norm": 0.6195278167724609, "learning_rate": 0.00018578415389631562, "loss": 1.8171, "step": 618 }, { "epoch": 0.4343859649122807, "grad_norm": 0.6607913374900818, "learning_rate": 0.00018571894359308773, "loss": 1.899, "step": 619 }, { "epoch": 0.43508771929824563, "grad_norm": 0.5749854445457458, "learning_rate": 0.0001856537332898598, "loss": 1.4943, "step": 620 }, { "epoch": 0.4357894736842105, "grad_norm": 0.5668680667877197, "learning_rate": 0.0001855885229866319, "loss": 1.5978, "step": 621 }, { "epoch": 0.43649122807017543, "grad_norm": 0.6476480960845947, "learning_rate": 0.00018552331268340398, "loss": 1.6358, "step": 622 }, { "epoch": 0.43719298245614036, "grad_norm": 0.5916937589645386, "learning_rate": 0.00018545810238017606, "loss": 1.4205, "step": 623 }, { "epoch": 0.4378947368421053, "grad_norm": 0.6173386573791504, "learning_rate": 0.00018539289207694816, "loss": 1.6463, "step": 624 }, { "epoch": 0.4378947368421053, "eval_loss": 1.5416231155395508, "eval_runtime": 65.818, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 624 }, { "epoch": 0.43859649122807015, "grad_norm": 0.5593937039375305, "learning_rate": 0.00018532768177372027, "loss": 1.407, "step": 625 }, { "epoch": 0.4392982456140351, "grad_norm": 0.5996303558349609, "learning_rate": 0.00018526247147049234, "loss": 1.552, "step": 626 }, { "epoch": 0.44, "grad_norm": 0.6196181774139404, "learning_rate": 0.00018519726116726445, "loss": 1.6621, "step": 627 }, { "epoch": 0.44070175438596493, "grad_norm": 0.6332850456237793, "learning_rate": 0.00018513205086403652, "loss": 1.6289, "step": 628 }, { "epoch": 0.4414035087719298, "grad_norm": 0.6012607216835022, "learning_rate": 0.0001850668405608086, "loss": 1.5916, "step": 629 }, { "epoch": 0.4421052631578947, "grad_norm": 0.626471757888794, "learning_rate": 0.0001850016302575807, "loss": 1.5286, "step": 630 }, { "epoch": 0.44280701754385965, "grad_norm": 0.566214919090271, "learning_rate": 0.0001849364199543528, "loss": 1.6354, "step": 631 }, { "epoch": 0.4435087719298246, "grad_norm": 0.696916937828064, "learning_rate": 0.00018487120965112488, "loss": 1.6399, "step": 632 }, { "epoch": 0.4442105263157895, "grad_norm": 0.6671330332756042, "learning_rate": 0.000184805999347897, "loss": 1.6901, "step": 633 }, { "epoch": 0.44491228070175437, "grad_norm": 0.5657070875167847, "learning_rate": 0.00018474078904466906, "loss": 1.5787, "step": 634 }, { "epoch": 0.4456140350877193, "grad_norm": 0.5785401463508606, "learning_rate": 0.00018467557874144114, "loss": 1.5458, "step": 635 }, { "epoch": 0.4463157894736842, "grad_norm": 0.5852059721946716, "learning_rate": 0.00018461036843821324, "loss": 1.4753, "step": 636 }, { "epoch": 0.44701754385964915, "grad_norm": 0.5778117775917053, "learning_rate": 0.00018454515813498535, "loss": 1.4977, "step": 637 }, { "epoch": 0.447719298245614, "grad_norm": 0.5789088010787964, "learning_rate": 0.00018447994783175742, "loss": 1.4049, "step": 638 }, { "epoch": 0.44842105263157894, "grad_norm": 0.6468783020973206, "learning_rate": 0.0001844147375285295, "loss": 1.6141, "step": 639 }, { "epoch": 0.44912280701754387, "grad_norm": 0.5919803977012634, "learning_rate": 0.0001843495272253016, "loss": 1.5942, "step": 640 }, { "epoch": 0.44912280701754387, "eval_loss": 1.541072130203247, "eval_runtime": 65.8182, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 640 }, { "epoch": 0.4498245614035088, "grad_norm": 0.6125659346580505, "learning_rate": 0.0001842843169220737, "loss": 1.6123, "step": 641 }, { "epoch": 0.45052631578947366, "grad_norm": 0.5989143252372742, "learning_rate": 0.00018421910661884578, "loss": 1.6512, "step": 642 }, { "epoch": 0.4512280701754386, "grad_norm": 0.5576602816581726, "learning_rate": 0.0001841538963156179, "loss": 1.4937, "step": 643 }, { "epoch": 0.4519298245614035, "grad_norm": 0.5692896842956543, "learning_rate": 0.00018408868601238996, "loss": 1.5135, "step": 644 }, { "epoch": 0.45263157894736844, "grad_norm": 0.6551368832588196, "learning_rate": 0.00018402347570916204, "loss": 1.6289, "step": 645 }, { "epoch": 0.4533333333333333, "grad_norm": 0.5667216777801514, "learning_rate": 0.00018395826540593414, "loss": 1.4464, "step": 646 }, { "epoch": 0.45403508771929824, "grad_norm": 0.6243787407875061, "learning_rate": 0.00018389305510270625, "loss": 1.5212, "step": 647 }, { "epoch": 0.45473684210526316, "grad_norm": 0.6148850917816162, "learning_rate": 0.00018382784479947832, "loss": 1.5747, "step": 648 }, { "epoch": 0.4554385964912281, "grad_norm": 0.7043919563293457, "learning_rate": 0.00018376263449625043, "loss": 1.7336, "step": 649 }, { "epoch": 0.45614035087719296, "grad_norm": 0.9238395690917969, "learning_rate": 0.0001836974241930225, "loss": 1.5943, "step": 650 }, { "epoch": 0.4568421052631579, "grad_norm": 0.585370659828186, "learning_rate": 0.00018363221388979458, "loss": 1.5692, "step": 651 }, { "epoch": 0.4575438596491228, "grad_norm": 0.5653206706047058, "learning_rate": 0.00018356700358656668, "loss": 1.4652, "step": 652 }, { "epoch": 0.45824561403508773, "grad_norm": 0.5872370600700378, "learning_rate": 0.0001835017932833388, "loss": 1.4941, "step": 653 }, { "epoch": 0.4589473684210526, "grad_norm": 0.6421022415161133, "learning_rate": 0.00018343658298011087, "loss": 1.5426, "step": 654 }, { "epoch": 0.45964912280701753, "grad_norm": 0.5456698536872864, "learning_rate": 0.00018337137267688297, "loss": 1.6212, "step": 655 }, { "epoch": 0.46035087719298246, "grad_norm": 0.5874689817428589, "learning_rate": 0.00018330616237365505, "loss": 1.4861, "step": 656 }, { "epoch": 0.46035087719298246, "eval_loss": 1.5380834341049194, "eval_runtime": 65.8163, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 656 }, { "epoch": 0.4610526315789474, "grad_norm": 0.580893337726593, "learning_rate": 0.00018324095207042712, "loss": 1.6461, "step": 657 }, { "epoch": 0.4617543859649123, "grad_norm": 0.6541386842727661, "learning_rate": 0.00018317574176719923, "loss": 1.5621, "step": 658 }, { "epoch": 0.4624561403508772, "grad_norm": 0.632037341594696, "learning_rate": 0.00018311053146397133, "loss": 1.6965, "step": 659 }, { "epoch": 0.4631578947368421, "grad_norm": 0.5768094658851624, "learning_rate": 0.0001830453211607434, "loss": 1.5764, "step": 660 }, { "epoch": 0.463859649122807, "grad_norm": 0.5521290302276611, "learning_rate": 0.0001829801108575155, "loss": 1.5702, "step": 661 }, { "epoch": 0.46456140350877195, "grad_norm": 0.6131693124771118, "learning_rate": 0.00018291490055428759, "loss": 1.602, "step": 662 }, { "epoch": 0.4652631578947368, "grad_norm": 0.604216456413269, "learning_rate": 0.00018284969025105966, "loss": 1.5523, "step": 663 }, { "epoch": 0.46596491228070175, "grad_norm": 0.6202670335769653, "learning_rate": 0.00018278447994783177, "loss": 1.7659, "step": 664 }, { "epoch": 0.4666666666666667, "grad_norm": 0.5929352045059204, "learning_rate": 0.00018271926964460387, "loss": 1.6761, "step": 665 }, { "epoch": 0.4673684210526316, "grad_norm": 0.5873444080352783, "learning_rate": 0.00018265405934137595, "loss": 1.6168, "step": 666 }, { "epoch": 0.46807017543859647, "grad_norm": 0.5841799378395081, "learning_rate": 0.00018258884903814802, "loss": 1.7046, "step": 667 }, { "epoch": 0.4687719298245614, "grad_norm": 0.5926886796951294, "learning_rate": 0.00018252363873492013, "loss": 1.4938, "step": 668 }, { "epoch": 0.4694736842105263, "grad_norm": 0.7415512204170227, "learning_rate": 0.0001824584284316922, "loss": 1.6204, "step": 669 }, { "epoch": 0.47017543859649125, "grad_norm": 0.5922508239746094, "learning_rate": 0.0001823932181284643, "loss": 1.664, "step": 670 }, { "epoch": 0.4708771929824561, "grad_norm": 0.5960471034049988, "learning_rate": 0.0001823280078252364, "loss": 1.5935, "step": 671 }, { "epoch": 0.47157894736842104, "grad_norm": 0.5473923087120056, "learning_rate": 0.0001822627975220085, "loss": 1.4349, "step": 672 }, { "epoch": 0.47157894736842104, "eval_loss": 1.5352416038513184, "eval_runtime": 65.8127, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 672 }, { "epoch": 0.47228070175438597, "grad_norm": 0.5602027773857117, "learning_rate": 0.00018219758721878056, "loss": 1.4993, "step": 673 }, { "epoch": 0.4729824561403509, "grad_norm": 0.6154656410217285, "learning_rate": 0.00018213237691555267, "loss": 1.5969, "step": 674 }, { "epoch": 0.47368421052631576, "grad_norm": 0.575515866279602, "learning_rate": 0.00018206716661232474, "loss": 1.4955, "step": 675 }, { "epoch": 0.4743859649122807, "grad_norm": 0.5683473348617554, "learning_rate": 0.00018200195630909685, "loss": 1.3993, "step": 676 }, { "epoch": 0.4750877192982456, "grad_norm": 0.7852767109870911, "learning_rate": 0.00018193674600586895, "loss": 1.7568, "step": 677 }, { "epoch": 0.47578947368421054, "grad_norm": 0.5597898364067078, "learning_rate": 0.00018187153570264103, "loss": 1.6164, "step": 678 }, { "epoch": 0.47649122807017547, "grad_norm": 0.6870754361152649, "learning_rate": 0.0001818063253994131, "loss": 1.6376, "step": 679 }, { "epoch": 0.47719298245614034, "grad_norm": 0.5542984008789062, "learning_rate": 0.0001817411150961852, "loss": 1.3359, "step": 680 }, { "epoch": 0.47789473684210526, "grad_norm": 0.6313662528991699, "learning_rate": 0.00018167590479295728, "loss": 1.7731, "step": 681 }, { "epoch": 0.4785964912280702, "grad_norm": 0.6204605102539062, "learning_rate": 0.0001816106944897294, "loss": 1.5159, "step": 682 }, { "epoch": 0.4792982456140351, "grad_norm": 0.5514792203903198, "learning_rate": 0.0001815454841865015, "loss": 1.5445, "step": 683 }, { "epoch": 0.48, "grad_norm": 0.5846177935600281, "learning_rate": 0.00018148027388327357, "loss": 1.4208, "step": 684 }, { "epoch": 0.4807017543859649, "grad_norm": 0.6128745079040527, "learning_rate": 0.00018141506358004564, "loss": 1.5772, "step": 685 }, { "epoch": 0.48140350877192983, "grad_norm": 0.5910210609436035, "learning_rate": 0.00018134985327681775, "loss": 1.4868, "step": 686 }, { "epoch": 0.48210526315789476, "grad_norm": 0.5644072890281677, "learning_rate": 0.00018128464297358985, "loss": 1.4557, "step": 687 }, { "epoch": 0.48280701754385963, "grad_norm": 0.5924608707427979, "learning_rate": 0.00018121943267036193, "loss": 1.5633, "step": 688 }, { "epoch": 0.48280701754385963, "eval_loss": 1.5303419828414917, "eval_runtime": 65.824, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 688 }, { "epoch": 0.48350877192982455, "grad_norm": 0.5801166296005249, "learning_rate": 0.00018115422236713403, "loss": 1.4972, "step": 689 }, { "epoch": 0.4842105263157895, "grad_norm": 0.6036468148231506, "learning_rate": 0.0001810890120639061, "loss": 1.6831, "step": 690 }, { "epoch": 0.4849122807017544, "grad_norm": 0.6442534923553467, "learning_rate": 0.00018102380176067818, "loss": 1.6221, "step": 691 }, { "epoch": 0.4856140350877193, "grad_norm": 0.6120734214782715, "learning_rate": 0.0001809585914574503, "loss": 1.6361, "step": 692 }, { "epoch": 0.4863157894736842, "grad_norm": 0.5898231863975525, "learning_rate": 0.0001808933811542224, "loss": 1.6407, "step": 693 }, { "epoch": 0.4870175438596491, "grad_norm": 0.582275390625, "learning_rate": 0.00018082817085099447, "loss": 1.4808, "step": 694 }, { "epoch": 0.48771929824561405, "grad_norm": 0.6247991919517517, "learning_rate": 0.00018076296054776655, "loss": 1.6383, "step": 695 }, { "epoch": 0.4884210526315789, "grad_norm": 0.6986836194992065, "learning_rate": 0.00018069775024453865, "loss": 1.5245, "step": 696 }, { "epoch": 0.48912280701754385, "grad_norm": 0.5784492492675781, "learning_rate": 0.00018063253994131073, "loss": 1.6115, "step": 697 }, { "epoch": 0.4898245614035088, "grad_norm": 0.5713099241256714, "learning_rate": 0.00018056732963808283, "loss": 1.5654, "step": 698 }, { "epoch": 0.4905263157894737, "grad_norm": 0.5952334403991699, "learning_rate": 0.00018050211933485493, "loss": 1.8133, "step": 699 }, { "epoch": 0.49122807017543857, "grad_norm": 0.5787906646728516, "learning_rate": 0.000180436909031627, "loss": 1.4218, "step": 700 }, { "epoch": 0.4919298245614035, "grad_norm": 0.6370477676391602, "learning_rate": 0.00018037169872839909, "loss": 1.7299, "step": 701 }, { "epoch": 0.4926315789473684, "grad_norm": 0.5841493010520935, "learning_rate": 0.0001803064884251712, "loss": 1.5517, "step": 702 }, { "epoch": 0.49333333333333335, "grad_norm": 0.6087077856063843, "learning_rate": 0.00018024127812194327, "loss": 1.6384, "step": 703 }, { "epoch": 0.49403508771929827, "grad_norm": 0.6383419036865234, "learning_rate": 0.00018017606781871537, "loss": 1.7665, "step": 704 }, { "epoch": 0.49403508771929827, "eval_loss": 1.527280330657959, "eval_runtime": 65.8118, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 704 }, { "epoch": 0.49473684210526314, "grad_norm": 0.6977135539054871, "learning_rate": 0.00018011085751548747, "loss": 1.5899, "step": 705 }, { "epoch": 0.49543859649122807, "grad_norm": 0.5904549956321716, "learning_rate": 0.00018004564721225955, "loss": 1.5393, "step": 706 }, { "epoch": 0.496140350877193, "grad_norm": 0.5704999566078186, "learning_rate": 0.00017998043690903163, "loss": 1.5729, "step": 707 }, { "epoch": 0.4968421052631579, "grad_norm": 0.5474374294281006, "learning_rate": 0.00017991522660580373, "loss": 1.2948, "step": 708 }, { "epoch": 0.4975438596491228, "grad_norm": 0.6330710649490356, "learning_rate": 0.0001798500163025758, "loss": 1.6156, "step": 709 }, { "epoch": 0.4982456140350877, "grad_norm": 0.6297155618667603, "learning_rate": 0.0001797848059993479, "loss": 1.5489, "step": 710 }, { "epoch": 0.49894736842105264, "grad_norm": 0.5558180212974548, "learning_rate": 0.00017971959569612001, "loss": 1.5011, "step": 711 }, { "epoch": 0.49964912280701756, "grad_norm": 0.6174421310424805, "learning_rate": 0.0001796543853928921, "loss": 1.6834, "step": 712 }, { "epoch": 0.5003508771929824, "grad_norm": 0.5773081183433533, "learning_rate": 0.00017958917508966417, "loss": 1.6233, "step": 713 }, { "epoch": 0.5010526315789474, "grad_norm": 0.6402484178543091, "learning_rate": 0.00017952396478643627, "loss": 1.5929, "step": 714 }, { "epoch": 0.5017543859649123, "grad_norm": 0.5291132926940918, "learning_rate": 0.00017945875448320835, "loss": 1.5221, "step": 715 }, { "epoch": 0.5024561403508772, "grad_norm": 0.5437853336334229, "learning_rate": 0.00017939354417998045, "loss": 1.4954, "step": 716 }, { "epoch": 0.5031578947368421, "grad_norm": 0.6234774589538574, "learning_rate": 0.00017932833387675255, "loss": 1.624, "step": 717 }, { "epoch": 0.503859649122807, "grad_norm": 0.5570104122161865, "learning_rate": 0.0001792631235735246, "loss": 1.4633, "step": 718 }, { "epoch": 0.5045614035087719, "grad_norm": 0.5618067979812622, "learning_rate": 0.0001791979132702967, "loss": 1.4789, "step": 719 }, { "epoch": 0.5052631578947369, "grad_norm": 0.6089984178543091, "learning_rate": 0.0001791327029670688, "loss": 1.577, "step": 720 }, { "epoch": 0.5052631578947369, "eval_loss": 1.525032877922058, "eval_runtime": 65.8364, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 720 }, { "epoch": 0.5059649122807017, "grad_norm": 0.610413134098053, "learning_rate": 0.0001790674926638409, "loss": 1.4928, "step": 721 }, { "epoch": 0.5066666666666667, "grad_norm": 0.5450130701065063, "learning_rate": 0.000179002282360613, "loss": 1.4951, "step": 722 }, { "epoch": 0.5073684210526316, "grad_norm": 0.5493000745773315, "learning_rate": 0.00017893707205738507, "loss": 1.4024, "step": 723 }, { "epoch": 0.5080701754385964, "grad_norm": 0.5977637767791748, "learning_rate": 0.00017887186175415714, "loss": 1.4602, "step": 724 }, { "epoch": 0.5087719298245614, "grad_norm": 0.5435031652450562, "learning_rate": 0.00017880665145092925, "loss": 1.4946, "step": 725 }, { "epoch": 0.5094736842105263, "grad_norm": 0.5711027383804321, "learning_rate": 0.00017874144114770135, "loss": 1.4472, "step": 726 }, { "epoch": 0.5101754385964913, "grad_norm": 0.645209789276123, "learning_rate": 0.00017867623084447343, "loss": 1.5524, "step": 727 }, { "epoch": 0.5108771929824562, "grad_norm": 0.5783587098121643, "learning_rate": 0.00017861102054124553, "loss": 1.4792, "step": 728 }, { "epoch": 0.511578947368421, "grad_norm": 0.6700847148895264, "learning_rate": 0.0001785458102380176, "loss": 1.573, "step": 729 }, { "epoch": 0.512280701754386, "grad_norm": 0.5822173357009888, "learning_rate": 0.0001784805999347897, "loss": 1.6207, "step": 730 }, { "epoch": 0.5129824561403509, "grad_norm": 0.5961790680885315, "learning_rate": 0.0001784153896315618, "loss": 1.5081, "step": 731 }, { "epoch": 0.5136842105263157, "grad_norm": 0.6975812315940857, "learning_rate": 0.0001783501793283339, "loss": 1.6259, "step": 732 }, { "epoch": 0.5143859649122807, "grad_norm": 0.6292494535446167, "learning_rate": 0.000178284969025106, "loss": 1.5812, "step": 733 }, { "epoch": 0.5150877192982456, "grad_norm": 0.5761540532112122, "learning_rate": 0.00017821975872187807, "loss": 1.7144, "step": 734 }, { "epoch": 0.5157894736842106, "grad_norm": 0.6378830671310425, "learning_rate": 0.00017815454841865015, "loss": 1.6065, "step": 735 }, { "epoch": 0.5164912280701754, "grad_norm": 0.6107819080352783, "learning_rate": 0.00017808933811542225, "loss": 1.5608, "step": 736 }, { "epoch": 0.5164912280701754, "eval_loss": 1.5236992835998535, "eval_runtime": 65.8176, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 736 }, { "epoch": 0.5171929824561403, "grad_norm": 0.5445180535316467, "learning_rate": 0.00017802412781219433, "loss": 1.4636, "step": 737 }, { "epoch": 0.5178947368421053, "grad_norm": 0.6609688401222229, "learning_rate": 0.00017795891750896643, "loss": 1.76, "step": 738 }, { "epoch": 0.5185964912280702, "grad_norm": 0.5446789264678955, "learning_rate": 0.00017789370720573854, "loss": 1.4515, "step": 739 }, { "epoch": 0.519298245614035, "grad_norm": 0.5666095614433289, "learning_rate": 0.0001778284969025106, "loss": 1.5856, "step": 740 }, { "epoch": 0.52, "grad_norm": 0.5357215404510498, "learning_rate": 0.0001777632865992827, "loss": 1.3726, "step": 741 }, { "epoch": 0.5207017543859649, "grad_norm": 0.6015555262565613, "learning_rate": 0.0001776980762960548, "loss": 1.6748, "step": 742 }, { "epoch": 0.5214035087719299, "grad_norm": 0.6359645128250122, "learning_rate": 0.00017763286599282687, "loss": 1.6565, "step": 743 }, { "epoch": 0.5221052631578947, "grad_norm": 0.5518999099731445, "learning_rate": 0.00017756765568959897, "loss": 1.5381, "step": 744 }, { "epoch": 0.5228070175438596, "grad_norm": 0.6048967242240906, "learning_rate": 0.00017750244538637108, "loss": 1.7166, "step": 745 }, { "epoch": 0.5235087719298246, "grad_norm": 0.5766078233718872, "learning_rate": 0.00017743723508314313, "loss": 1.5463, "step": 746 }, { "epoch": 0.5242105263157895, "grad_norm": 0.6058130860328674, "learning_rate": 0.00017737202477991523, "loss": 1.48, "step": 747 }, { "epoch": 0.5249122807017544, "grad_norm": 0.6253607869148254, "learning_rate": 0.00017730681447668733, "loss": 1.7468, "step": 748 }, { "epoch": 0.5256140350877193, "grad_norm": 0.6203944087028503, "learning_rate": 0.0001772416041734594, "loss": 1.7315, "step": 749 }, { "epoch": 0.5263157894736842, "grad_norm": 0.5887273550033569, "learning_rate": 0.0001771763938702315, "loss": 1.4822, "step": 750 }, { "epoch": 0.5270175438596492, "grad_norm": 0.57369065284729, "learning_rate": 0.0001771111835670036, "loss": 1.5151, "step": 751 }, { "epoch": 0.527719298245614, "grad_norm": 0.5566834211349487, "learning_rate": 0.00017704597326377567, "loss": 1.4643, "step": 752 }, { "epoch": 0.527719298245614, "eval_loss": 1.5187686681747437, "eval_runtime": 65.8236, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 752 }, { "epoch": 0.5284210526315789, "grad_norm": 0.5496149659156799, "learning_rate": 0.00017698076296054777, "loss": 1.532, "step": 753 }, { "epoch": 0.5291228070175439, "grad_norm": 0.5647898316383362, "learning_rate": 0.00017691555265731987, "loss": 1.4526, "step": 754 }, { "epoch": 0.5298245614035088, "grad_norm": 0.5696295499801636, "learning_rate": 0.00017685034235409195, "loss": 1.6486, "step": 755 }, { "epoch": 0.5305263157894737, "grad_norm": 0.5893814563751221, "learning_rate": 0.00017678513205086405, "loss": 1.6327, "step": 756 }, { "epoch": 0.5312280701754386, "grad_norm": 0.6877564787864685, "learning_rate": 0.00017671992174763613, "loss": 1.6792, "step": 757 }, { "epoch": 0.5319298245614035, "grad_norm": 0.5969341993331909, "learning_rate": 0.0001766547114444082, "loss": 1.5682, "step": 758 }, { "epoch": 0.5326315789473685, "grad_norm": 0.6004941463470459, "learning_rate": 0.0001765895011411803, "loss": 1.6344, "step": 759 }, { "epoch": 0.5333333333333333, "grad_norm": 0.6401600241661072, "learning_rate": 0.00017652429083795241, "loss": 1.5643, "step": 760 }, { "epoch": 0.5340350877192982, "grad_norm": 0.5405153036117554, "learning_rate": 0.0001764590805347245, "loss": 1.4815, "step": 761 }, { "epoch": 0.5347368421052632, "grad_norm": 0.5682723522186279, "learning_rate": 0.0001763938702314966, "loss": 1.4766, "step": 762 }, { "epoch": 0.535438596491228, "grad_norm": 0.5525510311126709, "learning_rate": 0.00017632865992826867, "loss": 1.5216, "step": 763 }, { "epoch": 0.536140350877193, "grad_norm": 0.5611911416053772, "learning_rate": 0.00017626344962504075, "loss": 1.496, "step": 764 }, { "epoch": 0.5368421052631579, "grad_norm": 0.6040037274360657, "learning_rate": 0.00017619823932181285, "loss": 1.6535, "step": 765 }, { "epoch": 0.5375438596491228, "grad_norm": 0.5436661839485168, "learning_rate": 0.00017613302901858495, "loss": 1.4809, "step": 766 }, { "epoch": 0.5382456140350877, "grad_norm": 0.5569854378700256, "learning_rate": 0.00017606781871535703, "loss": 1.4051, "step": 767 }, { "epoch": 0.5389473684210526, "grad_norm": 0.6644231081008911, "learning_rate": 0.00017600260841212914, "loss": 1.5405, "step": 768 }, { "epoch": 0.5389473684210526, "eval_loss": 1.5208289623260498, "eval_runtime": 65.8291, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 768 }, { "epoch": 0.5396491228070175, "grad_norm": 0.5710811614990234, "learning_rate": 0.0001759373981089012, "loss": 1.5314, "step": 769 }, { "epoch": 0.5403508771929825, "grad_norm": 0.6440165042877197, "learning_rate": 0.0001758721878056733, "loss": 1.6322, "step": 770 }, { "epoch": 0.5410526315789473, "grad_norm": 0.5761967897415161, "learning_rate": 0.0001758069775024454, "loss": 1.4376, "step": 771 }, { "epoch": 0.5417543859649123, "grad_norm": 0.6123760938644409, "learning_rate": 0.0001757417671992175, "loss": 1.7388, "step": 772 }, { "epoch": 0.5424561403508772, "grad_norm": 0.5664061307907104, "learning_rate": 0.0001756765568959896, "loss": 1.5854, "step": 773 }, { "epoch": 0.5431578947368421, "grad_norm": 0.6331567764282227, "learning_rate": 0.00017561134659276165, "loss": 1.6317, "step": 774 }, { "epoch": 0.543859649122807, "grad_norm": 0.5856006741523743, "learning_rate": 0.00017554613628953375, "loss": 1.5722, "step": 775 }, { "epoch": 0.5445614035087719, "grad_norm": 0.5387688279151917, "learning_rate": 0.00017548092598630586, "loss": 1.392, "step": 776 }, { "epoch": 0.5452631578947369, "grad_norm": 0.5793190002441406, "learning_rate": 0.00017541571568307793, "loss": 1.6352, "step": 777 }, { "epoch": 0.5459649122807018, "grad_norm": 0.5738780498504639, "learning_rate": 0.00017535050537985004, "loss": 1.5358, "step": 778 }, { "epoch": 0.5466666666666666, "grad_norm": 0.6526691913604736, "learning_rate": 0.0001752852950766221, "loss": 1.5713, "step": 779 }, { "epoch": 0.5473684210526316, "grad_norm": 0.6025631427764893, "learning_rate": 0.0001752200847733942, "loss": 1.4541, "step": 780 }, { "epoch": 0.5480701754385965, "grad_norm": 0.6034874320030212, "learning_rate": 0.0001751548744701663, "loss": 1.6534, "step": 781 }, { "epoch": 0.5487719298245614, "grad_norm": 0.6024832725524902, "learning_rate": 0.0001750896641669384, "loss": 1.5627, "step": 782 }, { "epoch": 0.5494736842105263, "grad_norm": 0.5390087366104126, "learning_rate": 0.00017502445386371047, "loss": 1.3788, "step": 783 }, { "epoch": 0.5501754385964912, "grad_norm": 0.525527834892273, "learning_rate": 0.00017495924356048258, "loss": 1.4681, "step": 784 }, { "epoch": 0.5501754385964912, "eval_loss": 1.5185675621032715, "eval_runtime": 65.815, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 784 }, { "epoch": 0.5508771929824562, "grad_norm": 0.6452107429504395, "learning_rate": 0.00017489403325725465, "loss": 1.513, "step": 785 }, { "epoch": 0.5515789473684211, "grad_norm": 0.6124709248542786, "learning_rate": 0.00017482882295402673, "loss": 1.5456, "step": 786 }, { "epoch": 0.5522807017543859, "grad_norm": 0.5367670059204102, "learning_rate": 0.00017476361265079883, "loss": 1.4651, "step": 787 }, { "epoch": 0.5529824561403509, "grad_norm": 0.5831142067909241, "learning_rate": 0.00017469840234757094, "loss": 1.5251, "step": 788 }, { "epoch": 0.5536842105263158, "grad_norm": 0.5668409466743469, "learning_rate": 0.000174633192044343, "loss": 1.5844, "step": 789 }, { "epoch": 0.5543859649122806, "grad_norm": 0.5777933597564697, "learning_rate": 0.00017456798174111512, "loss": 1.5298, "step": 790 }, { "epoch": 0.5550877192982456, "grad_norm": 0.543132483959198, "learning_rate": 0.0001745027714378872, "loss": 1.4033, "step": 791 }, { "epoch": 0.5557894736842105, "grad_norm": 0.5492923855781555, "learning_rate": 0.00017443756113465927, "loss": 1.446, "step": 792 }, { "epoch": 0.5564912280701755, "grad_norm": 0.618206799030304, "learning_rate": 0.00017437235083143137, "loss": 1.7058, "step": 793 }, { "epoch": 0.5571929824561404, "grad_norm": 0.5354285836219788, "learning_rate": 0.00017430714052820348, "loss": 1.4998, "step": 794 }, { "epoch": 0.5578947368421052, "grad_norm": 0.6591756343841553, "learning_rate": 0.00017424193022497555, "loss": 1.5517, "step": 795 }, { "epoch": 0.5585964912280702, "grad_norm": 0.5492257475852966, "learning_rate": 0.00017417671992174766, "loss": 1.5808, "step": 796 }, { "epoch": 0.5592982456140351, "grad_norm": 0.5378224849700928, "learning_rate": 0.00017411150961851973, "loss": 1.3213, "step": 797 }, { "epoch": 0.56, "grad_norm": 0.5966405272483826, "learning_rate": 0.0001740462993152918, "loss": 1.5977, "step": 798 }, { "epoch": 0.5607017543859649, "grad_norm": 0.6543579697608948, "learning_rate": 0.00017398108901206391, "loss": 1.6415, "step": 799 }, { "epoch": 0.5614035087719298, "grad_norm": 0.5304388403892517, "learning_rate": 0.00017391587870883602, "loss": 1.4048, "step": 800 }, { "epoch": 0.5614035087719298, "eval_loss": 1.5146254301071167, "eval_runtime": 65.8096, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 800 }, { "epoch": 0.5621052631578948, "grad_norm": 0.6125853061676025, "learning_rate": 0.0001738506684056081, "loss": 1.6309, "step": 801 }, { "epoch": 0.5628070175438596, "grad_norm": 0.6070045828819275, "learning_rate": 0.00017378545810238017, "loss": 1.4413, "step": 802 }, { "epoch": 0.5635087719298245, "grad_norm": 0.5895262956619263, "learning_rate": 0.00017372024779915227, "loss": 1.667, "step": 803 }, { "epoch": 0.5642105263157895, "grad_norm": 0.6319059729576111, "learning_rate": 0.00017365503749592435, "loss": 1.5311, "step": 804 }, { "epoch": 0.5649122807017544, "grad_norm": 0.5745180249214172, "learning_rate": 0.00017358982719269645, "loss": 1.484, "step": 805 }, { "epoch": 0.5656140350877193, "grad_norm": 0.6210883259773254, "learning_rate": 0.00017352461688946856, "loss": 1.5607, "step": 806 }, { "epoch": 0.5663157894736842, "grad_norm": 0.587558388710022, "learning_rate": 0.00017345940658624064, "loss": 1.5772, "step": 807 }, { "epoch": 0.5670175438596491, "grad_norm": 0.5990099310874939, "learning_rate": 0.0001733941962830127, "loss": 1.6222, "step": 808 }, { "epoch": 0.5677192982456141, "grad_norm": 0.5812878608703613, "learning_rate": 0.00017332898597978482, "loss": 1.5036, "step": 809 }, { "epoch": 0.5684210526315789, "grad_norm": 0.5086621642112732, "learning_rate": 0.0001732637756765569, "loss": 1.4253, "step": 810 }, { "epoch": 0.5691228070175438, "grad_norm": 0.5782451033592224, "learning_rate": 0.000173198565373329, "loss": 1.4621, "step": 811 }, { "epoch": 0.5698245614035088, "grad_norm": 0.5717040300369263, "learning_rate": 0.0001731333550701011, "loss": 1.4619, "step": 812 }, { "epoch": 0.5705263157894737, "grad_norm": 0.5647134780883789, "learning_rate": 0.00017306814476687318, "loss": 1.5424, "step": 813 }, { "epoch": 0.5712280701754386, "grad_norm": 0.5829737186431885, "learning_rate": 0.00017300293446364525, "loss": 1.5983, "step": 814 }, { "epoch": 0.5719298245614035, "grad_norm": 0.5456719398498535, "learning_rate": 0.00017293772416041736, "loss": 1.3606, "step": 815 }, { "epoch": 0.5726315789473684, "grad_norm": 0.5875656008720398, "learning_rate": 0.00017287251385718943, "loss": 1.5985, "step": 816 }, { "epoch": 0.5726315789473684, "eval_loss": 1.5116257667541504, "eval_runtime": 65.8158, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 816 }, { "epoch": 0.5733333333333334, "grad_norm": 0.559064507484436, "learning_rate": 0.00017280730355396154, "loss": 1.453, "step": 817 }, { "epoch": 0.5740350877192982, "grad_norm": 0.6365301609039307, "learning_rate": 0.00017274209325073364, "loss": 1.5755, "step": 818 }, { "epoch": 0.5747368421052632, "grad_norm": 0.5754192471504211, "learning_rate": 0.00017267688294750572, "loss": 1.5588, "step": 819 }, { "epoch": 0.5754385964912281, "grad_norm": 0.5575870871543884, "learning_rate": 0.0001726116726442778, "loss": 1.4572, "step": 820 }, { "epoch": 0.576140350877193, "grad_norm": 0.5524823665618896, "learning_rate": 0.0001725464623410499, "loss": 1.3925, "step": 821 }, { "epoch": 0.5768421052631579, "grad_norm": 0.5722379684448242, "learning_rate": 0.000172481252037822, "loss": 1.6741, "step": 822 }, { "epoch": 0.5775438596491228, "grad_norm": 0.5893756151199341, "learning_rate": 0.00017241604173459408, "loss": 1.6967, "step": 823 }, { "epoch": 0.5782456140350877, "grad_norm": 0.5391820073127747, "learning_rate": 0.00017235083143136615, "loss": 1.4864, "step": 824 }, { "epoch": 0.5789473684210527, "grad_norm": 0.6047136783599854, "learning_rate": 0.00017228562112813826, "loss": 1.4992, "step": 825 }, { "epoch": 0.5796491228070175, "grad_norm": 0.6159479022026062, "learning_rate": 0.00017222041082491033, "loss": 1.5205, "step": 826 }, { "epoch": 0.5803508771929825, "grad_norm": 0.5666090846061707, "learning_rate": 0.00017215520052168244, "loss": 1.4691, "step": 827 }, { "epoch": 0.5810526315789474, "grad_norm": 0.5275477766990662, "learning_rate": 0.00017208999021845454, "loss": 1.438, "step": 828 }, { "epoch": 0.5817543859649122, "grad_norm": 0.7284709811210632, "learning_rate": 0.00017202477991522662, "loss": 1.7399, "step": 829 }, { "epoch": 0.5824561403508772, "grad_norm": 0.5413610935211182, "learning_rate": 0.0001719595696119987, "loss": 1.4889, "step": 830 }, { "epoch": 0.5831578947368421, "grad_norm": 0.5636853575706482, "learning_rate": 0.0001718943593087708, "loss": 1.5964, "step": 831 }, { "epoch": 0.583859649122807, "grad_norm": 0.5630745887756348, "learning_rate": 0.00017182914900554287, "loss": 1.4497, "step": 832 }, { "epoch": 0.583859649122807, "eval_loss": 1.5093554258346558, "eval_runtime": 65.819, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 832 }, { "epoch": 0.584561403508772, "grad_norm": 0.6607223153114319, "learning_rate": 0.00017176393870231498, "loss": 1.7784, "step": 833 }, { "epoch": 0.5852631578947368, "grad_norm": 0.5987035632133484, "learning_rate": 0.00017169872839908708, "loss": 1.5878, "step": 834 }, { "epoch": 0.5859649122807018, "grad_norm": 0.5963811874389648, "learning_rate": 0.00017163351809585916, "loss": 1.5685, "step": 835 }, { "epoch": 0.5866666666666667, "grad_norm": 0.5232172608375549, "learning_rate": 0.00017156830779263123, "loss": 1.4733, "step": 836 }, { "epoch": 0.5873684210526315, "grad_norm": 0.5516242980957031, "learning_rate": 0.00017150309748940334, "loss": 1.5596, "step": 837 }, { "epoch": 0.5880701754385965, "grad_norm": 0.5559471845626831, "learning_rate": 0.00017143788718617541, "loss": 1.4657, "step": 838 }, { "epoch": 0.5887719298245614, "grad_norm": 0.5816804766654968, "learning_rate": 0.00017137267688294752, "loss": 1.6739, "step": 839 }, { "epoch": 0.5894736842105263, "grad_norm": 0.5993914604187012, "learning_rate": 0.00017130746657971962, "loss": 1.6398, "step": 840 }, { "epoch": 0.5901754385964912, "grad_norm": 0.5788246989250183, "learning_rate": 0.0001712422562764917, "loss": 1.5669, "step": 841 }, { "epoch": 0.5908771929824561, "grad_norm": 0.5899240970611572, "learning_rate": 0.00017117704597326377, "loss": 1.4871, "step": 842 }, { "epoch": 0.5915789473684211, "grad_norm": 0.5780290365219116, "learning_rate": 0.00017111183567003588, "loss": 1.5552, "step": 843 }, { "epoch": 0.592280701754386, "grad_norm": 0.5420917272567749, "learning_rate": 0.00017104662536680795, "loss": 1.4824, "step": 844 }, { "epoch": 0.5929824561403508, "grad_norm": 0.6106814742088318, "learning_rate": 0.00017098141506358006, "loss": 1.8767, "step": 845 }, { "epoch": 0.5936842105263158, "grad_norm": 0.5530657768249512, "learning_rate": 0.00017091620476035216, "loss": 1.6259, "step": 846 }, { "epoch": 0.5943859649122807, "grad_norm": 0.574223518371582, "learning_rate": 0.0001708509944571242, "loss": 1.6187, "step": 847 }, { "epoch": 0.5950877192982457, "grad_norm": 0.5438134074211121, "learning_rate": 0.00017078578415389632, "loss": 1.5533, "step": 848 }, { "epoch": 0.5950877192982457, "eval_loss": 1.5093555450439453, "eval_runtime": 65.8197, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 848 }, { "epoch": 0.5957894736842105, "grad_norm": 0.5994896292686462, "learning_rate": 0.00017072057385066842, "loss": 1.4683, "step": 849 }, { "epoch": 0.5964912280701754, "grad_norm": 0.6583132147789001, "learning_rate": 0.0001706553635474405, "loss": 1.634, "step": 850 }, { "epoch": 0.5971929824561404, "grad_norm": 0.624688982963562, "learning_rate": 0.0001705901532442126, "loss": 1.6646, "step": 851 }, { "epoch": 0.5978947368421053, "grad_norm": 0.5684359073638916, "learning_rate": 0.00017052494294098468, "loss": 1.5436, "step": 852 }, { "epoch": 0.5985964912280701, "grad_norm": 0.5592233538627625, "learning_rate": 0.00017045973263775675, "loss": 1.6981, "step": 853 }, { "epoch": 0.5992982456140351, "grad_norm": 0.5056375861167908, "learning_rate": 0.00017039452233452886, "loss": 1.4218, "step": 854 }, { "epoch": 0.6, "grad_norm": 0.5858005881309509, "learning_rate": 0.00017032931203130096, "loss": 1.5371, "step": 855 }, { "epoch": 0.600701754385965, "grad_norm": 0.6134957671165466, "learning_rate": 0.00017026410172807304, "loss": 1.7414, "step": 856 }, { "epoch": 0.6014035087719298, "grad_norm": 0.5650342106819153, "learning_rate": 0.00017019889142484514, "loss": 1.5325, "step": 857 }, { "epoch": 0.6021052631578947, "grad_norm": 0.5831318497657776, "learning_rate": 0.00017013368112161722, "loss": 1.4538, "step": 858 }, { "epoch": 0.6028070175438597, "grad_norm": 0.623827338218689, "learning_rate": 0.0001700684708183893, "loss": 1.6178, "step": 859 }, { "epoch": 0.6035087719298246, "grad_norm": 0.5647066831588745, "learning_rate": 0.0001700032605151614, "loss": 1.56, "step": 860 }, { "epoch": 0.6042105263157894, "grad_norm": 0.5488954186439514, "learning_rate": 0.0001699380502119335, "loss": 1.4221, "step": 861 }, { "epoch": 0.6049122807017544, "grad_norm": 0.6256019473075867, "learning_rate": 0.0001698728399087056, "loss": 1.7373, "step": 862 }, { "epoch": 0.6056140350877193, "grad_norm": 0.5957275629043579, "learning_rate": 0.00016980762960547768, "loss": 1.6558, "step": 863 }, { "epoch": 0.6063157894736843, "grad_norm": 0.6093901991844177, "learning_rate": 0.00016974241930224976, "loss": 1.5241, "step": 864 }, { "epoch": 0.6063157894736843, "eval_loss": 1.5030306577682495, "eval_runtime": 65.808, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 864 }, { "epoch": 0.6070175438596491, "grad_norm": 0.525657594203949, "learning_rate": 0.00016967720899902186, "loss": 1.5039, "step": 865 }, { "epoch": 0.607719298245614, "grad_norm": 0.567750871181488, "learning_rate": 0.00016961199869579394, "loss": 1.5397, "step": 866 }, { "epoch": 0.608421052631579, "grad_norm": 0.5366592407226562, "learning_rate": 0.00016954678839256604, "loss": 1.3953, "step": 867 }, { "epoch": 0.6091228070175438, "grad_norm": 0.5503570437431335, "learning_rate": 0.00016948157808933814, "loss": 1.5606, "step": 868 }, { "epoch": 0.6098245614035088, "grad_norm": 0.6069238781929016, "learning_rate": 0.00016941636778611022, "loss": 1.3876, "step": 869 }, { "epoch": 0.6105263157894737, "grad_norm": 0.5629158020019531, "learning_rate": 0.0001693511574828823, "loss": 1.4622, "step": 870 }, { "epoch": 0.6112280701754386, "grad_norm": 0.5860754251480103, "learning_rate": 0.0001692859471796544, "loss": 1.7248, "step": 871 }, { "epoch": 0.6119298245614035, "grad_norm": 0.6175757646560669, "learning_rate": 0.00016922073687642648, "loss": 1.6173, "step": 872 }, { "epoch": 0.6126315789473684, "grad_norm": 0.6086801290512085, "learning_rate": 0.00016915552657319858, "loss": 1.5478, "step": 873 }, { "epoch": 0.6133333333333333, "grad_norm": 0.5938102602958679, "learning_rate": 0.00016909031626997068, "loss": 1.4938, "step": 874 }, { "epoch": 0.6140350877192983, "grad_norm": 0.6241201162338257, "learning_rate": 0.00016902510596674273, "loss": 1.5636, "step": 875 }, { "epoch": 0.6147368421052631, "grad_norm": 0.5963270664215088, "learning_rate": 0.00016895989566351484, "loss": 1.6402, "step": 876 }, { "epoch": 0.6154385964912281, "grad_norm": 0.5238795280456543, "learning_rate": 0.00016889468536028694, "loss": 1.4751, "step": 877 }, { "epoch": 0.616140350877193, "grad_norm": 0.5442639589309692, "learning_rate": 0.00016882947505705902, "loss": 1.5709, "step": 878 }, { "epoch": 0.6168421052631579, "grad_norm": 0.5986355543136597, "learning_rate": 0.00016876426475383112, "loss": 1.4654, "step": 879 }, { "epoch": 0.6175438596491228, "grad_norm": 0.5739176869392395, "learning_rate": 0.0001686990544506032, "loss": 1.7478, "step": 880 }, { "epoch": 0.6175438596491228, "eval_loss": 1.5049394369125366, "eval_runtime": 65.8173, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 880 }, { "epoch": 0.6182456140350877, "grad_norm": 0.6870349645614624, "learning_rate": 0.00016863384414737527, "loss": 1.9007, "step": 881 }, { "epoch": 0.6189473684210526, "grad_norm": 0.5778204202651978, "learning_rate": 0.00016856863384414738, "loss": 1.7457, "step": 882 }, { "epoch": 0.6196491228070176, "grad_norm": 0.5384524464607239, "learning_rate": 0.00016850342354091948, "loss": 1.4726, "step": 883 }, { "epoch": 0.6203508771929824, "grad_norm": 0.5484864711761475, "learning_rate": 0.00016843821323769156, "loss": 1.4399, "step": 884 }, { "epoch": 0.6210526315789474, "grad_norm": 0.5394752025604248, "learning_rate": 0.00016837300293446366, "loss": 1.3455, "step": 885 }, { "epoch": 0.6217543859649123, "grad_norm": 0.5838941931724548, "learning_rate": 0.00016830779263123574, "loss": 1.5321, "step": 886 }, { "epoch": 0.6224561403508772, "grad_norm": 0.5188430547714233, "learning_rate": 0.00016824258232800782, "loss": 1.5631, "step": 887 }, { "epoch": 0.6231578947368421, "grad_norm": 0.5126955509185791, "learning_rate": 0.00016817737202477992, "loss": 1.3519, "step": 888 }, { "epoch": 0.623859649122807, "grad_norm": 0.6394215226173401, "learning_rate": 0.00016811216172155202, "loss": 1.6372, "step": 889 }, { "epoch": 0.624561403508772, "grad_norm": 0.5780743956565857, "learning_rate": 0.0001680469514183241, "loss": 1.5928, "step": 890 }, { "epoch": 0.6252631578947369, "grad_norm": 0.5558223724365234, "learning_rate": 0.0001679817411150962, "loss": 1.4386, "step": 891 }, { "epoch": 0.6259649122807017, "grad_norm": 0.5539360046386719, "learning_rate": 0.00016791653081186828, "loss": 1.5675, "step": 892 }, { "epoch": 0.6266666666666667, "grad_norm": 0.5505474209785461, "learning_rate": 0.00016785132050864036, "loss": 1.5776, "step": 893 }, { "epoch": 0.6273684210526316, "grad_norm": 0.5593991875648499, "learning_rate": 0.00016778611020541246, "loss": 1.47, "step": 894 }, { "epoch": 0.6280701754385964, "grad_norm": 0.5745978355407715, "learning_rate": 0.00016772089990218456, "loss": 1.6263, "step": 895 }, { "epoch": 0.6287719298245614, "grad_norm": 0.5401878952980042, "learning_rate": 0.00016765568959895664, "loss": 1.6015, "step": 896 }, { "epoch": 0.6287719298245614, "eval_loss": 1.5046435594558716, "eval_runtime": 65.8198, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 896 }, { "epoch": 0.6294736842105263, "grad_norm": 0.5463538765907288, "learning_rate": 0.00016759047929572874, "loss": 1.581, "step": 897 }, { "epoch": 0.6301754385964913, "grad_norm": 0.5799575448036194, "learning_rate": 0.00016752526899250082, "loss": 1.5817, "step": 898 }, { "epoch": 0.6308771929824561, "grad_norm": 0.5627912282943726, "learning_rate": 0.0001674600586892729, "loss": 1.5976, "step": 899 }, { "epoch": 0.631578947368421, "grad_norm": 0.6262831687927246, "learning_rate": 0.000167394848386045, "loss": 1.7332, "step": 900 }, { "epoch": 0.632280701754386, "grad_norm": 0.5846352577209473, "learning_rate": 0.0001673296380828171, "loss": 1.7163, "step": 901 }, { "epoch": 0.6329824561403509, "grad_norm": 0.5737934708595276, "learning_rate": 0.00016726442777958918, "loss": 1.4545, "step": 902 }, { "epoch": 0.6336842105263157, "grad_norm": 0.5596430897712708, "learning_rate": 0.00016719921747636126, "loss": 1.5425, "step": 903 }, { "epoch": 0.6343859649122807, "grad_norm": 0.5810723304748535, "learning_rate": 0.00016713400717313336, "loss": 1.5922, "step": 904 }, { "epoch": 0.6350877192982456, "grad_norm": 0.5351682305335999, "learning_rate": 0.00016706879686990546, "loss": 1.5849, "step": 905 }, { "epoch": 0.6357894736842106, "grad_norm": 0.5739153027534485, "learning_rate": 0.00016700358656667754, "loss": 1.6135, "step": 906 }, { "epoch": 0.6364912280701754, "grad_norm": 0.5510746240615845, "learning_rate": 0.00016693837626344964, "loss": 1.6439, "step": 907 }, { "epoch": 0.6371929824561403, "grad_norm": 0.5561052560806274, "learning_rate": 0.00016687316596022172, "loss": 1.4106, "step": 908 }, { "epoch": 0.6378947368421053, "grad_norm": 0.638592004776001, "learning_rate": 0.0001668079556569938, "loss": 1.3999, "step": 909 }, { "epoch": 0.6385964912280702, "grad_norm": 0.6152625679969788, "learning_rate": 0.0001667427453537659, "loss": 1.5004, "step": 910 }, { "epoch": 0.639298245614035, "grad_norm": 0.5640016198158264, "learning_rate": 0.000166677535050538, "loss": 1.5702, "step": 911 }, { "epoch": 0.64, "grad_norm": 0.6054765582084656, "learning_rate": 0.00016661232474731008, "loss": 1.6257, "step": 912 }, { "epoch": 0.64, "eval_loss": 1.5025749206542969, "eval_runtime": 65.8157, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 912 }, { "epoch": 0.6407017543859649, "grad_norm": 0.5775781273841858, "learning_rate": 0.00016654711444408218, "loss": 1.3984, "step": 913 }, { "epoch": 0.6414035087719299, "grad_norm": 0.5553260445594788, "learning_rate": 0.00016648190414085426, "loss": 1.5165, "step": 914 }, { "epoch": 0.6421052631578947, "grad_norm": 0.6113903522491455, "learning_rate": 0.00016641669383762634, "loss": 1.5917, "step": 915 }, { "epoch": 0.6428070175438596, "grad_norm": 0.5284942388534546, "learning_rate": 0.00016635148353439844, "loss": 1.4109, "step": 916 }, { "epoch": 0.6435087719298246, "grad_norm": 0.5685892105102539, "learning_rate": 0.00016628627323117054, "loss": 1.6065, "step": 917 }, { "epoch": 0.6442105263157895, "grad_norm": 0.566496729850769, "learning_rate": 0.00016622106292794262, "loss": 1.4374, "step": 918 }, { "epoch": 0.6449122807017544, "grad_norm": 0.5806846618652344, "learning_rate": 0.00016615585262471472, "loss": 1.5604, "step": 919 }, { "epoch": 0.6456140350877193, "grad_norm": 0.5764017701148987, "learning_rate": 0.0001660906423214868, "loss": 1.4271, "step": 920 }, { "epoch": 0.6463157894736842, "grad_norm": 0.5892585515975952, "learning_rate": 0.00016602543201825888, "loss": 1.6242, "step": 921 }, { "epoch": 0.6470175438596492, "grad_norm": 0.555974543094635, "learning_rate": 0.00016596022171503098, "loss": 1.5055, "step": 922 }, { "epoch": 0.647719298245614, "grad_norm": 0.606529712677002, "learning_rate": 0.00016589501141180309, "loss": 1.4248, "step": 923 }, { "epoch": 0.6484210526315789, "grad_norm": 0.5273278951644897, "learning_rate": 0.00016582980110857516, "loss": 1.4133, "step": 924 }, { "epoch": 0.6491228070175439, "grad_norm": 0.5186945796012878, "learning_rate": 0.00016576459080534727, "loss": 1.4271, "step": 925 }, { "epoch": 0.6498245614035087, "grad_norm": 0.5691914558410645, "learning_rate": 0.00016569938050211934, "loss": 1.7164, "step": 926 }, { "epoch": 0.6505263157894737, "grad_norm": 0.606552243232727, "learning_rate": 0.00016563417019889142, "loss": 1.7122, "step": 927 }, { "epoch": 0.6512280701754386, "grad_norm": 0.6035021543502808, "learning_rate": 0.00016556895989566352, "loss": 1.7009, "step": 928 }, { "epoch": 0.6512280701754386, "eval_loss": 1.502448320388794, "eval_runtime": 65.8169, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 928 }, { "epoch": 0.6519298245614035, "grad_norm": 0.5463997721672058, "learning_rate": 0.00016550374959243563, "loss": 1.514, "step": 929 }, { "epoch": 0.6526315789473685, "grad_norm": 0.5846124291419983, "learning_rate": 0.0001654385392892077, "loss": 1.6484, "step": 930 }, { "epoch": 0.6533333333333333, "grad_norm": 0.626053512096405, "learning_rate": 0.00016537332898597978, "loss": 1.6848, "step": 931 }, { "epoch": 0.6540350877192982, "grad_norm": 0.5468665361404419, "learning_rate": 0.00016530811868275188, "loss": 1.4271, "step": 932 }, { "epoch": 0.6547368421052632, "grad_norm": 0.50266432762146, "learning_rate": 0.00016524290837952396, "loss": 1.4162, "step": 933 }, { "epoch": 0.655438596491228, "grad_norm": 0.560589611530304, "learning_rate": 0.00016517769807629606, "loss": 1.6611, "step": 934 }, { "epoch": 0.656140350877193, "grad_norm": 0.562118411064148, "learning_rate": 0.00016511248777306817, "loss": 1.6201, "step": 935 }, { "epoch": 0.6568421052631579, "grad_norm": 0.5693764686584473, "learning_rate": 0.00016504727746984024, "loss": 1.5921, "step": 936 }, { "epoch": 0.6575438596491228, "grad_norm": 0.5732945203781128, "learning_rate": 0.00016498206716661232, "loss": 1.6354, "step": 937 }, { "epoch": 0.6582456140350877, "grad_norm": 0.6125863194465637, "learning_rate": 0.00016491685686338442, "loss": 1.7785, "step": 938 }, { "epoch": 0.6589473684210526, "grad_norm": 0.6188894510269165, "learning_rate": 0.0001648516465601565, "loss": 1.619, "step": 939 }, { "epoch": 0.6596491228070176, "grad_norm": 0.5613630414009094, "learning_rate": 0.0001647864362569286, "loss": 1.4906, "step": 940 }, { "epoch": 0.6603508771929825, "grad_norm": 0.5600090026855469, "learning_rate": 0.0001647212259537007, "loss": 1.617, "step": 941 }, { "epoch": 0.6610526315789473, "grad_norm": 0.5143298506736755, "learning_rate": 0.00016465601565047278, "loss": 1.5192, "step": 942 }, { "epoch": 0.6617543859649123, "grad_norm": 0.5696310997009277, "learning_rate": 0.00016459080534724486, "loss": 1.4547, "step": 943 }, { "epoch": 0.6624561403508772, "grad_norm": 0.5769887566566467, "learning_rate": 0.00016452559504401696, "loss": 1.5939, "step": 944 }, { "epoch": 0.6624561403508772, "eval_loss": 1.498206615447998, "eval_runtime": 65.8178, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 944 }, { "epoch": 0.6631578947368421, "grad_norm": 0.5561100840568542, "learning_rate": 0.00016446038474078904, "loss": 1.4277, "step": 945 }, { "epoch": 0.663859649122807, "grad_norm": 0.5511457920074463, "learning_rate": 0.00016439517443756114, "loss": 1.4776, "step": 946 }, { "epoch": 0.6645614035087719, "grad_norm": 0.5476071238517761, "learning_rate": 0.00016432996413433325, "loss": 1.5147, "step": 947 }, { "epoch": 0.6652631578947369, "grad_norm": 0.5653707981109619, "learning_rate": 0.00016426475383110532, "loss": 1.4417, "step": 948 }, { "epoch": 0.6659649122807018, "grad_norm": 0.5701811909675598, "learning_rate": 0.0001641995435278774, "loss": 1.8139, "step": 949 }, { "epoch": 0.6666666666666666, "grad_norm": 0.5435730218887329, "learning_rate": 0.0001641343332246495, "loss": 1.5197, "step": 950 }, { "epoch": 0.6673684210526316, "grad_norm": 0.5706349611282349, "learning_rate": 0.0001640691229214216, "loss": 1.6172, "step": 951 }, { "epoch": 0.6680701754385965, "grad_norm": 0.5611158013343811, "learning_rate": 0.00016400391261819368, "loss": 1.4286, "step": 952 }, { "epoch": 0.6687719298245614, "grad_norm": 0.5222998857498169, "learning_rate": 0.0001639387023149658, "loss": 1.6405, "step": 953 }, { "epoch": 0.6694736842105263, "grad_norm": 0.5275372862815857, "learning_rate": 0.00016387349201173786, "loss": 1.2823, "step": 954 }, { "epoch": 0.6701754385964912, "grad_norm": 0.5924704670906067, "learning_rate": 0.00016380828170850994, "loss": 1.6365, "step": 955 }, { "epoch": 0.6708771929824562, "grad_norm": 0.545463502407074, "learning_rate": 0.00016374307140528204, "loss": 1.5378, "step": 956 }, { "epoch": 0.671578947368421, "grad_norm": 0.5322976112365723, "learning_rate": 0.00016367786110205415, "loss": 1.5932, "step": 957 }, { "epoch": 0.6722807017543859, "grad_norm": 0.5663872361183167, "learning_rate": 0.00016361265079882622, "loss": 1.5976, "step": 958 }, { "epoch": 0.6729824561403509, "grad_norm": 0.5648203492164612, "learning_rate": 0.0001635474404955983, "loss": 1.4686, "step": 959 }, { "epoch": 0.6736842105263158, "grad_norm": 0.5893710851669312, "learning_rate": 0.0001634822301923704, "loss": 1.5302, "step": 960 }, { "epoch": 0.6736842105263158, "eval_loss": 1.4991371631622314, "eval_runtime": 65.8285, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 960 }, { "epoch": 0.6743859649122808, "grad_norm": 0.5324001312255859, "learning_rate": 0.00016341701988914248, "loss": 1.446, "step": 961 }, { "epoch": 0.6750877192982456, "grad_norm": 0.5884497165679932, "learning_rate": 0.00016335180958591459, "loss": 1.4613, "step": 962 }, { "epoch": 0.6757894736842105, "grad_norm": 0.5939708352088928, "learning_rate": 0.0001632865992826867, "loss": 1.6161, "step": 963 }, { "epoch": 0.6764912280701755, "grad_norm": 0.5566709041595459, "learning_rate": 0.00016322138897945877, "loss": 1.5556, "step": 964 }, { "epoch": 0.6771929824561403, "grad_norm": 0.6111564040184021, "learning_rate": 0.00016315617867623084, "loss": 1.3816, "step": 965 }, { "epoch": 0.6778947368421052, "grad_norm": 0.5475108027458191, "learning_rate": 0.00016309096837300295, "loss": 1.6345, "step": 966 }, { "epoch": 0.6785964912280702, "grad_norm": 0.5949693918228149, "learning_rate": 0.00016302575806977502, "loss": 1.5853, "step": 967 }, { "epoch": 0.6792982456140351, "grad_norm": 0.5877975821495056, "learning_rate": 0.00016296054776654713, "loss": 1.518, "step": 968 }, { "epoch": 0.68, "grad_norm": 0.5159136056900024, "learning_rate": 0.00016289533746331923, "loss": 1.3704, "step": 969 }, { "epoch": 0.6807017543859649, "grad_norm": 0.524744987487793, "learning_rate": 0.0001628301271600913, "loss": 1.4703, "step": 970 }, { "epoch": 0.6814035087719298, "grad_norm": 0.5552459359169006, "learning_rate": 0.00016276491685686338, "loss": 1.6027, "step": 971 }, { "epoch": 0.6821052631578948, "grad_norm": 0.5636270046234131, "learning_rate": 0.00016269970655363549, "loss": 1.6626, "step": 972 }, { "epoch": 0.6828070175438596, "grad_norm": 0.5966308116912842, "learning_rate": 0.00016263449625040756, "loss": 1.6241, "step": 973 }, { "epoch": 0.6835087719298245, "grad_norm": 0.5635896921157837, "learning_rate": 0.00016256928594717967, "loss": 1.5761, "step": 974 }, { "epoch": 0.6842105263157895, "grad_norm": 0.5518362522125244, "learning_rate": 0.00016250407564395177, "loss": 1.6343, "step": 975 }, { "epoch": 0.6849122807017544, "grad_norm": 0.5717785954475403, "learning_rate": 0.00016243886534072385, "loss": 1.4848, "step": 976 }, { "epoch": 0.6849122807017544, "eval_loss": 1.4960129261016846, "eval_runtime": 65.817, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 976 }, { "epoch": 0.6856140350877193, "grad_norm": 0.5702008605003357, "learning_rate": 0.00016237365503749592, "loss": 1.5119, "step": 977 }, { "epoch": 0.6863157894736842, "grad_norm": 0.6073006987571716, "learning_rate": 0.00016230844473426803, "loss": 1.6117, "step": 978 }, { "epoch": 0.6870175438596491, "grad_norm": 0.5566091537475586, "learning_rate": 0.0001622432344310401, "loss": 1.5986, "step": 979 }, { "epoch": 0.6877192982456141, "grad_norm": 0.5638344287872314, "learning_rate": 0.0001621780241278122, "loss": 1.7903, "step": 980 }, { "epoch": 0.6884210526315789, "grad_norm": 0.6073338985443115, "learning_rate": 0.0001621128138245843, "loss": 1.6161, "step": 981 }, { "epoch": 0.6891228070175439, "grad_norm": 0.5801055431365967, "learning_rate": 0.00016204760352135636, "loss": 1.4165, "step": 982 }, { "epoch": 0.6898245614035088, "grad_norm": 0.5726273059844971, "learning_rate": 0.00016198239321812846, "loss": 1.342, "step": 983 }, { "epoch": 0.6905263157894737, "grad_norm": 0.544964075088501, "learning_rate": 0.00016191718291490057, "loss": 1.4982, "step": 984 }, { "epoch": 0.6912280701754386, "grad_norm": 0.49850529432296753, "learning_rate": 0.00016185197261167264, "loss": 1.3981, "step": 985 }, { "epoch": 0.6919298245614035, "grad_norm": 0.538500964641571, "learning_rate": 0.00016178676230844475, "loss": 1.4734, "step": 986 }, { "epoch": 0.6926315789473684, "grad_norm": 0.5548259019851685, "learning_rate": 0.00016172155200521682, "loss": 1.4736, "step": 987 }, { "epoch": 0.6933333333333334, "grad_norm": 0.6729204058647156, "learning_rate": 0.0001616563417019889, "loss": 1.6213, "step": 988 }, { "epoch": 0.6940350877192982, "grad_norm": 0.5247981548309326, "learning_rate": 0.000161591131398761, "loss": 1.4662, "step": 989 }, { "epoch": 0.6947368421052632, "grad_norm": 0.524615466594696, "learning_rate": 0.0001615259210955331, "loss": 1.4209, "step": 990 }, { "epoch": 0.6954385964912281, "grad_norm": 0.5118274092674255, "learning_rate": 0.00016146071079230518, "loss": 1.5292, "step": 991 }, { "epoch": 0.696140350877193, "grad_norm": 0.5734535455703735, "learning_rate": 0.0001613955004890773, "loss": 1.5098, "step": 992 }, { "epoch": 0.696140350877193, "eval_loss": 1.4955847263336182, "eval_runtime": 65.8105, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 992 }, { "epoch": 0.6968421052631579, "grad_norm": 0.5919320583343506, "learning_rate": 0.00016133029018584936, "loss": 1.7173, "step": 993 }, { "epoch": 0.6975438596491228, "grad_norm": 0.5363337993621826, "learning_rate": 0.00016126507988262147, "loss": 1.504, "step": 994 }, { "epoch": 0.6982456140350877, "grad_norm": 0.5321435928344727, "learning_rate": 0.00016119986957939354, "loss": 1.4921, "step": 995 }, { "epoch": 0.6989473684210527, "grad_norm": 0.5315688848495483, "learning_rate": 0.00016113465927616565, "loss": 1.4819, "step": 996 }, { "epoch": 0.6996491228070175, "grad_norm": 0.5532930493354797, "learning_rate": 0.00016106944897293775, "loss": 1.6247, "step": 997 }, { "epoch": 0.7003508771929825, "grad_norm": 0.9368827939033508, "learning_rate": 0.00016100423866970983, "loss": 1.5972, "step": 998 }, { "epoch": 0.7010526315789474, "grad_norm": 0.568902313709259, "learning_rate": 0.0001609390283664819, "loss": 1.5289, "step": 999 }, { "epoch": 0.7017543859649122, "grad_norm": 0.5488641858100891, "learning_rate": 0.000160873818063254, "loss": 1.5368, "step": 1000 }, { "epoch": 0.7024561403508772, "grad_norm": 0.5821620225906372, "learning_rate": 0.00016080860776002609, "loss": 1.428, "step": 1001 }, { "epoch": 0.7031578947368421, "grad_norm": 0.5383942127227783, "learning_rate": 0.0001607433974567982, "loss": 1.5006, "step": 1002 }, { "epoch": 0.703859649122807, "grad_norm": 0.5646656155586243, "learning_rate": 0.0001606781871535703, "loss": 1.4727, "step": 1003 }, { "epoch": 0.7045614035087719, "grad_norm": 0.5503569841384888, "learning_rate": 0.00016061297685034237, "loss": 1.4368, "step": 1004 }, { "epoch": 0.7052631578947368, "grad_norm": 0.5584920644760132, "learning_rate": 0.00016054776654711445, "loss": 1.4192, "step": 1005 }, { "epoch": 0.7059649122807018, "grad_norm": 0.6551129817962646, "learning_rate": 0.00016048255624388655, "loss": 1.4767, "step": 1006 }, { "epoch": 0.7066666666666667, "grad_norm": 0.5503270030021667, "learning_rate": 0.00016041734594065863, "loss": 1.5995, "step": 1007 }, { "epoch": 0.7073684210526315, "grad_norm": 0.5789254903793335, "learning_rate": 0.00016035213563743073, "loss": 1.5481, "step": 1008 }, { "epoch": 0.7073684210526315, "eval_loss": 1.4927951097488403, "eval_runtime": 65.8235, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1008 }, { "epoch": 0.7080701754385965, "grad_norm": 0.5707395672798157, "learning_rate": 0.00016028692533420283, "loss": 1.5991, "step": 1009 }, { "epoch": 0.7087719298245614, "grad_norm": 0.5429443717002869, "learning_rate": 0.00016022171503097488, "loss": 1.7095, "step": 1010 }, { "epoch": 0.7094736842105264, "grad_norm": 0.5435569286346436, "learning_rate": 0.00016015650472774699, "loss": 1.6229, "step": 1011 }, { "epoch": 0.7101754385964912, "grad_norm": 0.5059309601783752, "learning_rate": 0.0001600912944245191, "loss": 1.3694, "step": 1012 }, { "epoch": 0.7108771929824561, "grad_norm": 0.5629433393478394, "learning_rate": 0.00016002608412129117, "loss": 1.4364, "step": 1013 }, { "epoch": 0.7115789473684211, "grad_norm": 0.5851459503173828, "learning_rate": 0.00015996087381806327, "loss": 1.7478, "step": 1014 }, { "epoch": 0.712280701754386, "grad_norm": 0.5546910166740417, "learning_rate": 0.00015989566351483535, "loss": 1.4787, "step": 1015 }, { "epoch": 0.7129824561403508, "grad_norm": 0.5300161242485046, "learning_rate": 0.00015983045321160742, "loss": 1.604, "step": 1016 }, { "epoch": 0.7136842105263158, "grad_norm": 0.530249834060669, "learning_rate": 0.00015976524290837953, "loss": 1.4358, "step": 1017 }, { "epoch": 0.7143859649122807, "grad_norm": 0.5750465393066406, "learning_rate": 0.00015970003260515163, "loss": 1.5122, "step": 1018 }, { "epoch": 0.7150877192982457, "grad_norm": 0.5770545601844788, "learning_rate": 0.0001596348223019237, "loss": 1.6742, "step": 1019 }, { "epoch": 0.7157894736842105, "grad_norm": 0.5350867509841919, "learning_rate": 0.0001595696119986958, "loss": 1.5172, "step": 1020 }, { "epoch": 0.7164912280701754, "grad_norm": 0.5668665766716003, "learning_rate": 0.0001595044016954679, "loss": 1.5197, "step": 1021 }, { "epoch": 0.7171929824561404, "grad_norm": 0.5812252759933472, "learning_rate": 0.00015943919139223996, "loss": 1.4834, "step": 1022 }, { "epoch": 0.7178947368421053, "grad_norm": 0.5577958822250366, "learning_rate": 0.00015937398108901207, "loss": 1.502, "step": 1023 }, { "epoch": 0.7185964912280701, "grad_norm": 0.6446884870529175, "learning_rate": 0.00015930877078578417, "loss": 1.5989, "step": 1024 }, { "epoch": 0.7185964912280701, "eval_loss": 1.4908775091171265, "eval_runtime": 65.8142, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1024 }, { "epoch": 0.7192982456140351, "grad_norm": 0.5790585875511169, "learning_rate": 0.00015924356048255625, "loss": 1.4928, "step": 1025 }, { "epoch": 0.72, "grad_norm": 0.6687883138656616, "learning_rate": 0.00015917835017932835, "loss": 1.7637, "step": 1026 }, { "epoch": 0.720701754385965, "grad_norm": 0.5783678293228149, "learning_rate": 0.00015911313987610043, "loss": 1.5109, "step": 1027 }, { "epoch": 0.7214035087719298, "grad_norm": 0.5479987263679504, "learning_rate": 0.0001590479295728725, "loss": 1.5421, "step": 1028 }, { "epoch": 0.7221052631578947, "grad_norm": 0.5299309492111206, "learning_rate": 0.0001589827192696446, "loss": 1.5937, "step": 1029 }, { "epoch": 0.7228070175438597, "grad_norm": 0.5468580722808838, "learning_rate": 0.0001589175089664167, "loss": 1.5796, "step": 1030 }, { "epoch": 0.7235087719298245, "grad_norm": 0.5445224046707153, "learning_rate": 0.0001588522986631888, "loss": 1.5441, "step": 1031 }, { "epoch": 0.7242105263157895, "grad_norm": 0.5666026473045349, "learning_rate": 0.0001587870883599609, "loss": 1.6686, "step": 1032 }, { "epoch": 0.7249122807017544, "grad_norm": 0.5585528016090393, "learning_rate": 0.00015872187805673297, "loss": 1.4832, "step": 1033 }, { "epoch": 0.7256140350877193, "grad_norm": 0.5503672957420349, "learning_rate": 0.00015865666775350504, "loss": 1.7069, "step": 1034 }, { "epoch": 0.7263157894736842, "grad_norm": 0.5814653635025024, "learning_rate": 0.00015859145745027715, "loss": 1.5782, "step": 1035 }, { "epoch": 0.7270175438596491, "grad_norm": 0.5572834014892578, "learning_rate": 0.00015852624714704925, "loss": 1.561, "step": 1036 }, { "epoch": 0.727719298245614, "grad_norm": 0.5603299140930176, "learning_rate": 0.00015846103684382136, "loss": 1.5671, "step": 1037 }, { "epoch": 0.728421052631579, "grad_norm": 0.5151978731155396, "learning_rate": 0.0001583958265405934, "loss": 1.4051, "step": 1038 }, { "epoch": 0.7291228070175438, "grad_norm": 0.5274546146392822, "learning_rate": 0.0001583306162373655, "loss": 1.4727, "step": 1039 }, { "epoch": 0.7298245614035088, "grad_norm": 0.6518547534942627, "learning_rate": 0.0001582654059341376, "loss": 1.6066, "step": 1040 }, { "epoch": 0.7298245614035088, "eval_loss": 1.4875272512435913, "eval_runtime": 65.824, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1040 }, { "epoch": 0.7305263157894737, "grad_norm": 0.5850233435630798, "learning_rate": 0.0001582001956309097, "loss": 1.5607, "step": 1041 }, { "epoch": 0.7312280701754386, "grad_norm": 0.5441792607307434, "learning_rate": 0.0001581349853276818, "loss": 1.4531, "step": 1042 }, { "epoch": 0.7319298245614035, "grad_norm": 0.5598700642585754, "learning_rate": 0.00015806977502445387, "loss": 1.5173, "step": 1043 }, { "epoch": 0.7326315789473684, "grad_norm": 0.541962206363678, "learning_rate": 0.00015800456472122595, "loss": 1.5192, "step": 1044 }, { "epoch": 0.7333333333333333, "grad_norm": 0.5679405331611633, "learning_rate": 0.00015793935441799805, "loss": 1.5861, "step": 1045 }, { "epoch": 0.7340350877192983, "grad_norm": 0.5342012643814087, "learning_rate": 0.00015787414411477015, "loss": 1.4804, "step": 1046 }, { "epoch": 0.7347368421052631, "grad_norm": 0.558336079120636, "learning_rate": 0.00015780893381154223, "loss": 1.692, "step": 1047 }, { "epoch": 0.7354385964912281, "grad_norm": 0.6123239994049072, "learning_rate": 0.00015774372350831433, "loss": 1.5616, "step": 1048 }, { "epoch": 0.736140350877193, "grad_norm": 0.5656084418296814, "learning_rate": 0.0001576785132050864, "loss": 1.6112, "step": 1049 }, { "epoch": 0.7368421052631579, "grad_norm": 0.6985616683959961, "learning_rate": 0.00015761330290185849, "loss": 1.9988, "step": 1050 }, { "epoch": 0.7375438596491228, "grad_norm": 0.5800986289978027, "learning_rate": 0.0001575480925986306, "loss": 1.4884, "step": 1051 }, { "epoch": 0.7382456140350877, "grad_norm": 0.5741161108016968, "learning_rate": 0.0001574828822954027, "loss": 1.5634, "step": 1052 }, { "epoch": 0.7389473684210527, "grad_norm": 0.5880879759788513, "learning_rate": 0.00015741767199217477, "loss": 1.692, "step": 1053 }, { "epoch": 0.7396491228070176, "grad_norm": 0.6097129583358765, "learning_rate": 0.00015735246168894687, "loss": 1.5888, "step": 1054 }, { "epoch": 0.7403508771929824, "grad_norm": 0.5516387224197388, "learning_rate": 0.00015728725138571895, "loss": 1.4508, "step": 1055 }, { "epoch": 0.7410526315789474, "grad_norm": 0.5876122713088989, "learning_rate": 0.00015722204108249103, "loss": 1.6533, "step": 1056 }, { "epoch": 0.7410526315789474, "eval_loss": 1.4865460395812988, "eval_runtime": 65.8281, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 1056 }, { "epoch": 0.7417543859649123, "grad_norm": 0.5413911938667297, "learning_rate": 0.00015715683077926313, "loss": 1.4968, "step": 1057 }, { "epoch": 0.7424561403508771, "grad_norm": 0.6163926124572754, "learning_rate": 0.00015709162047603523, "loss": 1.6022, "step": 1058 }, { "epoch": 0.7431578947368421, "grad_norm": 0.5413510203361511, "learning_rate": 0.0001570264101728073, "loss": 1.5278, "step": 1059 }, { "epoch": 0.743859649122807, "grad_norm": 0.5358436107635498, "learning_rate": 0.00015696119986957941, "loss": 1.4292, "step": 1060 }, { "epoch": 0.744561403508772, "grad_norm": 0.5715653896331787, "learning_rate": 0.0001568959895663515, "loss": 1.5562, "step": 1061 }, { "epoch": 0.7452631578947368, "grad_norm": 0.5911796689033508, "learning_rate": 0.00015683077926312357, "loss": 1.7022, "step": 1062 }, { "epoch": 0.7459649122807017, "grad_norm": 0.5555885434150696, "learning_rate": 0.00015676556895989567, "loss": 1.6049, "step": 1063 }, { "epoch": 0.7466666666666667, "grad_norm": 0.6018003225326538, "learning_rate": 0.00015670035865666777, "loss": 1.7073, "step": 1064 }, { "epoch": 0.7473684210526316, "grad_norm": 0.5916467308998108, "learning_rate": 0.00015663514835343985, "loss": 1.4903, "step": 1065 }, { "epoch": 0.7480701754385964, "grad_norm": 0.6027873158454895, "learning_rate": 0.00015656993805021193, "loss": 1.5576, "step": 1066 }, { "epoch": 0.7487719298245614, "grad_norm": 0.5180323719978333, "learning_rate": 0.00015650472774698403, "loss": 1.521, "step": 1067 }, { "epoch": 0.7494736842105263, "grad_norm": 0.590090811252594, "learning_rate": 0.0001564395174437561, "loss": 1.7412, "step": 1068 }, { "epoch": 0.7501754385964913, "grad_norm": 0.5812859535217285, "learning_rate": 0.0001563743071405282, "loss": 1.5461, "step": 1069 }, { "epoch": 0.7508771929824561, "grad_norm": 0.5998135805130005, "learning_rate": 0.00015630909683730031, "loss": 1.4829, "step": 1070 }, { "epoch": 0.751578947368421, "grad_norm": 0.5476358532905579, "learning_rate": 0.0001562438865340724, "loss": 1.482, "step": 1071 }, { "epoch": 0.752280701754386, "grad_norm": 0.5477606654167175, "learning_rate": 0.00015617867623084447, "loss": 1.5789, "step": 1072 }, { "epoch": 0.752280701754386, "eval_loss": 1.4898402690887451, "eval_runtime": 65.8179, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1072 }, { "epoch": 0.7529824561403509, "grad_norm": 0.5781694650650024, "learning_rate": 0.00015611346592761657, "loss": 1.4515, "step": 1073 }, { "epoch": 0.7536842105263157, "grad_norm": 0.6200684309005737, "learning_rate": 0.00015604825562438865, "loss": 1.5479, "step": 1074 }, { "epoch": 0.7543859649122807, "grad_norm": 0.5113518238067627, "learning_rate": 0.00015598304532116075, "loss": 1.4253, "step": 1075 }, { "epoch": 0.7550877192982456, "grad_norm": 0.522424578666687, "learning_rate": 0.00015591783501793286, "loss": 1.3532, "step": 1076 }, { "epoch": 0.7557894736842106, "grad_norm": 0.5146982669830322, "learning_rate": 0.00015585262471470493, "loss": 1.536, "step": 1077 }, { "epoch": 0.7564912280701754, "grad_norm": 0.5637925267219543, "learning_rate": 0.000155787414411477, "loss": 1.4207, "step": 1078 }, { "epoch": 0.7571929824561403, "grad_norm": 0.5368340015411377, "learning_rate": 0.0001557222041082491, "loss": 1.3692, "step": 1079 }, { "epoch": 0.7578947368421053, "grad_norm": 0.5588705539703369, "learning_rate": 0.00015565699380502122, "loss": 1.5344, "step": 1080 }, { "epoch": 0.7585964912280702, "grad_norm": 0.6020300388336182, "learning_rate": 0.0001555917835017933, "loss": 1.453, "step": 1081 }, { "epoch": 0.7592982456140351, "grad_norm": 0.5892632007598877, "learning_rate": 0.0001555265731985654, "loss": 1.5785, "step": 1082 }, { "epoch": 0.76, "grad_norm": 0.5373041033744812, "learning_rate": 0.00015546136289533747, "loss": 1.5181, "step": 1083 }, { "epoch": 0.7607017543859649, "grad_norm": 0.5612801313400269, "learning_rate": 0.00015539615259210955, "loss": 1.5133, "step": 1084 }, { "epoch": 0.7614035087719299, "grad_norm": 0.5535155534744263, "learning_rate": 0.00015533094228888165, "loss": 1.5604, "step": 1085 }, { "epoch": 0.7621052631578947, "grad_norm": 0.5867001414299011, "learning_rate": 0.00015526573198565376, "loss": 1.6541, "step": 1086 }, { "epoch": 0.7628070175438596, "grad_norm": 0.5619101524353027, "learning_rate": 0.00015520052168242583, "loss": 1.4713, "step": 1087 }, { "epoch": 0.7635087719298246, "grad_norm": 0.5588623881340027, "learning_rate": 0.00015513531137919794, "loss": 1.7281, "step": 1088 }, { "epoch": 0.7635087719298246, "eval_loss": 1.483282208442688, "eval_runtime": 65.8183, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1088 }, { "epoch": 0.7642105263157895, "grad_norm": 0.6085705757141113, "learning_rate": 0.00015507010107597, "loss": 1.4906, "step": 1089 }, { "epoch": 0.7649122807017544, "grad_norm": 0.6493943929672241, "learning_rate": 0.0001550048907727421, "loss": 1.6637, "step": 1090 }, { "epoch": 0.7656140350877193, "grad_norm": 0.5284192562103271, "learning_rate": 0.0001549396804695142, "loss": 1.6389, "step": 1091 }, { "epoch": 0.7663157894736842, "grad_norm": 0.6089284420013428, "learning_rate": 0.0001548744701662863, "loss": 1.6897, "step": 1092 }, { "epoch": 0.7670175438596492, "grad_norm": 0.5539531111717224, "learning_rate": 0.00015480925986305837, "loss": 1.475, "step": 1093 }, { "epoch": 0.767719298245614, "grad_norm": 0.5698765516281128, "learning_rate": 0.00015474404955983045, "loss": 1.5391, "step": 1094 }, { "epoch": 0.7684210526315789, "grad_norm": 0.5583752393722534, "learning_rate": 0.00015467883925660255, "loss": 1.6242, "step": 1095 }, { "epoch": 0.7691228070175439, "grad_norm": 0.5880687236785889, "learning_rate": 0.00015461362895337463, "loss": 1.5979, "step": 1096 }, { "epoch": 0.7698245614035087, "grad_norm": 0.5982322096824646, "learning_rate": 0.00015454841865014673, "loss": 1.5994, "step": 1097 }, { "epoch": 0.7705263157894737, "grad_norm": 0.538945198059082, "learning_rate": 0.00015448320834691884, "loss": 1.4817, "step": 1098 }, { "epoch": 0.7712280701754386, "grad_norm": 0.5434532761573792, "learning_rate": 0.00015441799804369091, "loss": 1.535, "step": 1099 }, { "epoch": 0.7719298245614035, "grad_norm": 0.5411986708641052, "learning_rate": 0.000154352787740463, "loss": 1.5505, "step": 1100 }, { "epoch": 0.7726315789473684, "grad_norm": 0.5366385579109192, "learning_rate": 0.0001542875774372351, "loss": 1.3835, "step": 1101 }, { "epoch": 0.7733333333333333, "grad_norm": 0.5405188798904419, "learning_rate": 0.00015422236713400717, "loss": 1.5404, "step": 1102 }, { "epoch": 0.7740350877192983, "grad_norm": 0.554742157459259, "learning_rate": 0.00015415715683077927, "loss": 1.6214, "step": 1103 }, { "epoch": 0.7747368421052632, "grad_norm": 0.7033591270446777, "learning_rate": 0.00015409194652755138, "loss": 1.4765, "step": 1104 }, { "epoch": 0.7747368421052632, "eval_loss": 1.4819482564926147, "eval_runtime": 65.8224, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1104 }, { "epoch": 0.775438596491228, "grad_norm": 0.5729840993881226, "learning_rate": 0.00015402673622432345, "loss": 1.5598, "step": 1105 }, { "epoch": 0.776140350877193, "grad_norm": 0.5700107216835022, "learning_rate": 0.00015396152592109553, "loss": 1.4698, "step": 1106 }, { "epoch": 0.7768421052631579, "grad_norm": 0.5663378834724426, "learning_rate": 0.00015389631561786763, "loss": 1.7365, "step": 1107 }, { "epoch": 0.7775438596491228, "grad_norm": 0.6087296605110168, "learning_rate": 0.0001538311053146397, "loss": 1.6399, "step": 1108 }, { "epoch": 0.7782456140350877, "grad_norm": 0.5511271953582764, "learning_rate": 0.00015376589501141181, "loss": 1.611, "step": 1109 }, { "epoch": 0.7789473684210526, "grad_norm": 0.5823012590408325, "learning_rate": 0.00015370068470818392, "loss": 1.5418, "step": 1110 }, { "epoch": 0.7796491228070176, "grad_norm": 0.49866241216659546, "learning_rate": 0.00015363547440495597, "loss": 1.498, "step": 1111 }, { "epoch": 0.7803508771929825, "grad_norm": 0.5712208151817322, "learning_rate": 0.00015357026410172807, "loss": 1.4375, "step": 1112 }, { "epoch": 0.7810526315789473, "grad_norm": 0.5406383872032166, "learning_rate": 0.00015350505379850018, "loss": 1.4731, "step": 1113 }, { "epoch": 0.7817543859649123, "grad_norm": 0.5105082988739014, "learning_rate": 0.00015343984349527225, "loss": 1.3458, "step": 1114 }, { "epoch": 0.7824561403508772, "grad_norm": 0.5568353533744812, "learning_rate": 0.00015337463319204436, "loss": 1.505, "step": 1115 }, { "epoch": 0.783157894736842, "grad_norm": 0.5458256006240845, "learning_rate": 0.00015330942288881643, "loss": 1.3912, "step": 1116 }, { "epoch": 0.783859649122807, "grad_norm": 0.5321551561355591, "learning_rate": 0.0001532442125855885, "loss": 1.4934, "step": 1117 }, { "epoch": 0.7845614035087719, "grad_norm": 0.6144965887069702, "learning_rate": 0.0001531790022823606, "loss": 1.5461, "step": 1118 }, { "epoch": 0.7852631578947369, "grad_norm": 0.5564832091331482, "learning_rate": 0.00015311379197913272, "loss": 1.5027, "step": 1119 }, { "epoch": 0.7859649122807018, "grad_norm": 0.5085788369178772, "learning_rate": 0.0001530485816759048, "loss": 1.4565, "step": 1120 }, { "epoch": 0.7859649122807018, "eval_loss": 1.4831657409667969, "eval_runtime": 65.8141, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1120 }, { "epoch": 0.7866666666666666, "grad_norm": 0.5764320492744446, "learning_rate": 0.0001529833713726769, "loss": 1.5563, "step": 1121 }, { "epoch": 0.7873684210526316, "grad_norm": 0.5041537880897522, "learning_rate": 0.00015291816106944897, "loss": 1.4558, "step": 1122 }, { "epoch": 0.7880701754385965, "grad_norm": 0.6431847810745239, "learning_rate": 0.00015285295076622108, "loss": 1.3501, "step": 1123 }, { "epoch": 0.7887719298245615, "grad_norm": 0.5563727021217346, "learning_rate": 0.00015278774046299315, "loss": 1.57, "step": 1124 }, { "epoch": 0.7894736842105263, "grad_norm": 0.5844342708587646, "learning_rate": 0.00015272253015976526, "loss": 1.5672, "step": 1125 }, { "epoch": 0.7901754385964912, "grad_norm": 0.5447474122047424, "learning_rate": 0.00015265731985653736, "loss": 1.526, "step": 1126 }, { "epoch": 0.7908771929824562, "grad_norm": 0.6564974784851074, "learning_rate": 0.00015259210955330944, "loss": 1.6514, "step": 1127 }, { "epoch": 0.791578947368421, "grad_norm": 0.5566999316215515, "learning_rate": 0.0001525268992500815, "loss": 1.5545, "step": 1128 }, { "epoch": 0.7922807017543859, "grad_norm": 0.5866788625717163, "learning_rate": 0.00015246168894685362, "loss": 1.4701, "step": 1129 }, { "epoch": 0.7929824561403509, "grad_norm": 0.5782862901687622, "learning_rate": 0.0001523964786436257, "loss": 1.6178, "step": 1130 }, { "epoch": 0.7936842105263158, "grad_norm": 0.5793597102165222, "learning_rate": 0.0001523312683403978, "loss": 1.6869, "step": 1131 }, { "epoch": 0.7943859649122808, "grad_norm": 0.5253363847732544, "learning_rate": 0.0001522660580371699, "loss": 1.5678, "step": 1132 }, { "epoch": 0.7950877192982456, "grad_norm": 0.5230212211608887, "learning_rate": 0.00015220084773394198, "loss": 1.3684, "step": 1133 }, { "epoch": 0.7957894736842105, "grad_norm": 0.5810248851776123, "learning_rate": 0.00015213563743071405, "loss": 1.5635, "step": 1134 }, { "epoch": 0.7964912280701755, "grad_norm": 0.5745883584022522, "learning_rate": 0.00015207042712748616, "loss": 1.6645, "step": 1135 }, { "epoch": 0.7971929824561403, "grad_norm": 0.5333226323127747, "learning_rate": 0.00015200521682425823, "loss": 1.5256, "step": 1136 }, { "epoch": 0.7971929824561403, "eval_loss": 1.4812346696853638, "eval_runtime": 65.8251, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1136 }, { "epoch": 0.7978947368421052, "grad_norm": 0.5619837641716003, "learning_rate": 0.00015194000652103034, "loss": 1.7075, "step": 1137 }, { "epoch": 0.7985964912280702, "grad_norm": 0.5636736154556274, "learning_rate": 0.00015187479621780244, "loss": 1.4744, "step": 1138 }, { "epoch": 0.7992982456140351, "grad_norm": 0.5512598156929016, "learning_rate": 0.0001518095859145745, "loss": 1.5102, "step": 1139 }, { "epoch": 0.8, "grad_norm": 0.5332244038581848, "learning_rate": 0.0001517443756113466, "loss": 1.3847, "step": 1140 }, { "epoch": 0.8007017543859649, "grad_norm": 0.5320185422897339, "learning_rate": 0.0001516791653081187, "loss": 1.5481, "step": 1141 }, { "epoch": 0.8014035087719298, "grad_norm": 0.6351203322410583, "learning_rate": 0.00015161395500489077, "loss": 1.6086, "step": 1142 }, { "epoch": 0.8021052631578948, "grad_norm": 0.535847544670105, "learning_rate": 0.00015154874470166288, "loss": 1.2876, "step": 1143 }, { "epoch": 0.8028070175438596, "grad_norm": 0.5218328237533569, "learning_rate": 0.00015148353439843495, "loss": 1.5208, "step": 1144 }, { "epoch": 0.8035087719298246, "grad_norm": 0.5772022604942322, "learning_rate": 0.00015141832409520703, "loss": 1.5512, "step": 1145 }, { "epoch": 0.8042105263157895, "grad_norm": 0.5807579159736633, "learning_rate": 0.00015135311379197913, "loss": 1.3461, "step": 1146 }, { "epoch": 0.8049122807017544, "grad_norm": 0.5659253001213074, "learning_rate": 0.00015128790348875124, "loss": 1.6402, "step": 1147 }, { "epoch": 0.8056140350877193, "grad_norm": 0.559472918510437, "learning_rate": 0.00015122269318552331, "loss": 1.452, "step": 1148 }, { "epoch": 0.8063157894736842, "grad_norm": 0.5749139189720154, "learning_rate": 0.00015115748288229542, "loss": 1.4536, "step": 1149 }, { "epoch": 0.8070175438596491, "grad_norm": 0.5923994183540344, "learning_rate": 0.0001510922725790675, "loss": 1.6637, "step": 1150 }, { "epoch": 0.8077192982456141, "grad_norm": 0.5595855712890625, "learning_rate": 0.00015102706227583957, "loss": 1.4804, "step": 1151 }, { "epoch": 0.8084210526315789, "grad_norm": 0.6050682663917542, "learning_rate": 0.00015096185197261167, "loss": 1.5159, "step": 1152 }, { "epoch": 0.8084210526315789, "eval_loss": 1.4866607189178467, "eval_runtime": 65.8175, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1152 }, { "epoch": 0.8091228070175439, "grad_norm": 0.5412077903747559, "learning_rate": 0.00015089664166938378, "loss": 1.4693, "step": 1153 }, { "epoch": 0.8098245614035088, "grad_norm": 0.5422680377960205, "learning_rate": 0.00015083143136615586, "loss": 1.4135, "step": 1154 }, { "epoch": 0.8105263157894737, "grad_norm": 0.538861870765686, "learning_rate": 0.00015076622106292796, "loss": 1.4366, "step": 1155 }, { "epoch": 0.8112280701754386, "grad_norm": 0.5232272744178772, "learning_rate": 0.00015070101075970004, "loss": 1.4161, "step": 1156 }, { "epoch": 0.8119298245614035, "grad_norm": 0.5336839556694031, "learning_rate": 0.0001506358004564721, "loss": 1.5015, "step": 1157 }, { "epoch": 0.8126315789473684, "grad_norm": 0.6205464005470276, "learning_rate": 0.00015057059015324422, "loss": 1.4938, "step": 1158 }, { "epoch": 0.8133333333333334, "grad_norm": 0.5690969824790955, "learning_rate": 0.00015050537985001632, "loss": 1.8682, "step": 1159 }, { "epoch": 0.8140350877192982, "grad_norm": 0.5797418355941772, "learning_rate": 0.0001504401695467884, "loss": 1.4562, "step": 1160 }, { "epoch": 0.8147368421052632, "grad_norm": 0.48879551887512207, "learning_rate": 0.0001503749592435605, "loss": 1.2611, "step": 1161 }, { "epoch": 0.8154385964912281, "grad_norm": 0.5584746599197388, "learning_rate": 0.00015030974894033258, "loss": 1.5335, "step": 1162 }, { "epoch": 0.8161403508771929, "grad_norm": 0.5313799977302551, "learning_rate": 0.00015024453863710465, "loss": 1.4481, "step": 1163 }, { "epoch": 0.8168421052631579, "grad_norm": 0.6279599666595459, "learning_rate": 0.00015017932833387676, "loss": 1.7126, "step": 1164 }, { "epoch": 0.8175438596491228, "grad_norm": 0.5306081175804138, "learning_rate": 0.00015011411803064886, "loss": 1.4508, "step": 1165 }, { "epoch": 0.8182456140350877, "grad_norm": 0.5519396662712097, "learning_rate": 0.00015004890772742094, "loss": 1.5628, "step": 1166 }, { "epoch": 0.8189473684210526, "grad_norm": 0.5092682242393494, "learning_rate": 0.000149983697424193, "loss": 1.4497, "step": 1167 }, { "epoch": 0.8196491228070175, "grad_norm": 0.622856855392456, "learning_rate": 0.00014991848712096512, "loss": 1.6556, "step": 1168 }, { "epoch": 0.8196491228070175, "eval_loss": 1.4816622734069824, "eval_runtime": 65.8199, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1168 }, { "epoch": 0.8203508771929825, "grad_norm": 0.565819501876831, "learning_rate": 0.00014985327681773722, "loss": 1.4736, "step": 1169 }, { "epoch": 0.8210526315789474, "grad_norm": 0.5267189741134644, "learning_rate": 0.0001497880665145093, "loss": 1.5036, "step": 1170 }, { "epoch": 0.8217543859649122, "grad_norm": 0.550203263759613, "learning_rate": 0.0001497228562112814, "loss": 1.5681, "step": 1171 }, { "epoch": 0.8224561403508772, "grad_norm": 0.5300328135490417, "learning_rate": 0.00014965764590805348, "loss": 1.529, "step": 1172 }, { "epoch": 0.8231578947368421, "grad_norm": 0.5399124026298523, "learning_rate": 0.00014959243560482555, "loss": 1.582, "step": 1173 }, { "epoch": 0.8238596491228071, "grad_norm": 0.48800745606422424, "learning_rate": 0.00014952722530159766, "loss": 1.5299, "step": 1174 }, { "epoch": 0.8245614035087719, "grad_norm": 0.4974161386489868, "learning_rate": 0.00014946201499836976, "loss": 1.3491, "step": 1175 }, { "epoch": 0.8252631578947368, "grad_norm": 0.5481404662132263, "learning_rate": 0.00014939680469514184, "loss": 1.6571, "step": 1176 }, { "epoch": 0.8259649122807018, "grad_norm": 0.58710777759552, "learning_rate": 0.00014933159439191394, "loss": 1.5581, "step": 1177 }, { "epoch": 0.8266666666666667, "grad_norm": 0.5547757148742676, "learning_rate": 0.00014926638408868602, "loss": 1.4927, "step": 1178 }, { "epoch": 0.8273684210526315, "grad_norm": 0.5317093729972839, "learning_rate": 0.0001492011737854581, "loss": 1.5128, "step": 1179 }, { "epoch": 0.8280701754385965, "grad_norm": 0.5430612564086914, "learning_rate": 0.0001491359634822302, "loss": 1.5092, "step": 1180 }, { "epoch": 0.8287719298245614, "grad_norm": 0.5511969923973083, "learning_rate": 0.0001490707531790023, "loss": 1.4893, "step": 1181 }, { "epoch": 0.8294736842105264, "grad_norm": 0.5654397010803223, "learning_rate": 0.00014900554287577438, "loss": 1.5597, "step": 1182 }, { "epoch": 0.8301754385964912, "grad_norm": 0.5461217164993286, "learning_rate": 0.00014894033257254648, "loss": 1.4581, "step": 1183 }, { "epoch": 0.8308771929824561, "grad_norm": 0.5668187141418457, "learning_rate": 0.00014887512226931856, "loss": 1.6018, "step": 1184 }, { "epoch": 0.8308771929824561, "eval_loss": 1.479556918144226, "eval_runtime": 65.8182, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1184 }, { "epoch": 0.8315789473684211, "grad_norm": 0.5497478246688843, "learning_rate": 0.00014880991196609063, "loss": 1.5522, "step": 1185 }, { "epoch": 0.832280701754386, "grad_norm": 0.6065900921821594, "learning_rate": 0.00014874470166286274, "loss": 1.6163, "step": 1186 }, { "epoch": 0.8329824561403508, "grad_norm": 0.5533166527748108, "learning_rate": 0.00014867949135963484, "loss": 1.6228, "step": 1187 }, { "epoch": 0.8336842105263158, "grad_norm": 0.5917710065841675, "learning_rate": 0.00014861428105640692, "loss": 1.657, "step": 1188 }, { "epoch": 0.8343859649122807, "grad_norm": 0.5532718300819397, "learning_rate": 0.00014854907075317902, "loss": 1.5216, "step": 1189 }, { "epoch": 0.8350877192982457, "grad_norm": 0.5597209930419922, "learning_rate": 0.0001484838604499511, "loss": 1.5674, "step": 1190 }, { "epoch": 0.8357894736842105, "grad_norm": 0.5722250938415527, "learning_rate": 0.00014841865014672317, "loss": 1.6278, "step": 1191 }, { "epoch": 0.8364912280701754, "grad_norm": 0.5262444615364075, "learning_rate": 0.00014835343984349528, "loss": 1.5379, "step": 1192 }, { "epoch": 0.8371929824561404, "grad_norm": 0.5476327538490295, "learning_rate": 0.00014828822954026738, "loss": 1.5422, "step": 1193 }, { "epoch": 0.8378947368421052, "grad_norm": 0.5486054420471191, "learning_rate": 0.00014822301923703946, "loss": 1.5021, "step": 1194 }, { "epoch": 0.8385964912280702, "grad_norm": 0.5926556587219238, "learning_rate": 0.00014815780893381154, "loss": 1.5153, "step": 1195 }, { "epoch": 0.8392982456140351, "grad_norm": 0.5681668519973755, "learning_rate": 0.00014809259863058364, "loss": 1.5156, "step": 1196 }, { "epoch": 0.84, "grad_norm": 0.5313260555267334, "learning_rate": 0.00014802738832735572, "loss": 1.377, "step": 1197 }, { "epoch": 0.840701754385965, "grad_norm": 0.5032557249069214, "learning_rate": 0.00014796217802412782, "loss": 1.3982, "step": 1198 }, { "epoch": 0.8414035087719298, "grad_norm": 0.5287100672721863, "learning_rate": 0.00014789696772089992, "loss": 1.3817, "step": 1199 }, { "epoch": 0.8421052631578947, "grad_norm": 0.5340286493301392, "learning_rate": 0.000147831757417672, "loss": 1.5185, "step": 1200 }, { "epoch": 0.8421052631578947, "eval_loss": 1.4747413396835327, "eval_runtime": 65.8237, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1200 }, { "epoch": 0.8428070175438597, "grad_norm": 0.5634503960609436, "learning_rate": 0.00014776654711444408, "loss": 1.4618, "step": 1201 }, { "epoch": 0.8435087719298245, "grad_norm": 0.6031055450439453, "learning_rate": 0.00014770133681121618, "loss": 1.5232, "step": 1202 }, { "epoch": 0.8442105263157895, "grad_norm": 0.5010560750961304, "learning_rate": 0.00014763612650798826, "loss": 1.4447, "step": 1203 }, { "epoch": 0.8449122807017544, "grad_norm": 0.5494067668914795, "learning_rate": 0.00014757091620476036, "loss": 1.3937, "step": 1204 }, { "epoch": 0.8456140350877193, "grad_norm": 0.5259153842926025, "learning_rate": 0.00014750570590153246, "loss": 1.5177, "step": 1205 }, { "epoch": 0.8463157894736842, "grad_norm": 0.5892198085784912, "learning_rate": 0.00014744049559830454, "loss": 1.5682, "step": 1206 }, { "epoch": 0.8470175438596491, "grad_norm": 0.5536060929298401, "learning_rate": 0.00014737528529507662, "loss": 1.3775, "step": 1207 }, { "epoch": 0.847719298245614, "grad_norm": 0.5325253009796143, "learning_rate": 0.00014731007499184872, "loss": 1.484, "step": 1208 }, { "epoch": 0.848421052631579, "grad_norm": 0.5159037113189697, "learning_rate": 0.0001472448646886208, "loss": 1.3486, "step": 1209 }, { "epoch": 0.8491228070175438, "grad_norm": 0.5590388178825378, "learning_rate": 0.0001471796543853929, "loss": 1.4387, "step": 1210 }, { "epoch": 0.8498245614035088, "grad_norm": 0.5092251896858215, "learning_rate": 0.000147114444082165, "loss": 1.4466, "step": 1211 }, { "epoch": 0.8505263157894737, "grad_norm": 0.5708092451095581, "learning_rate": 0.00014704923377893708, "loss": 1.4014, "step": 1212 }, { "epoch": 0.8512280701754386, "grad_norm": 0.529369056224823, "learning_rate": 0.00014698402347570916, "loss": 1.5516, "step": 1213 }, { "epoch": 0.8519298245614035, "grad_norm": 0.5403317809104919, "learning_rate": 0.00014691881317248126, "loss": 1.4726, "step": 1214 }, { "epoch": 0.8526315789473684, "grad_norm": 0.5014476776123047, "learning_rate": 0.00014685360286925336, "loss": 1.334, "step": 1215 }, { "epoch": 0.8533333333333334, "grad_norm": 0.5399596691131592, "learning_rate": 0.00014678839256602544, "loss": 1.5272, "step": 1216 }, { "epoch": 0.8533333333333334, "eval_loss": 1.4750645160675049, "eval_runtime": 65.812, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1216 }, { "epoch": 0.8540350877192983, "grad_norm": 0.5082270503044128, "learning_rate": 0.00014672318226279754, "loss": 1.5707, "step": 1217 }, { "epoch": 0.8547368421052631, "grad_norm": 0.540999174118042, "learning_rate": 0.00014665797195956962, "loss": 1.379, "step": 1218 }, { "epoch": 0.8554385964912281, "grad_norm": 0.5463826060295105, "learning_rate": 0.0001465927616563417, "loss": 1.4966, "step": 1219 }, { "epoch": 0.856140350877193, "grad_norm": 0.5287704467773438, "learning_rate": 0.0001465275513531138, "loss": 1.5784, "step": 1220 }, { "epoch": 0.8568421052631578, "grad_norm": 0.534685492515564, "learning_rate": 0.0001464623410498859, "loss": 1.5562, "step": 1221 }, { "epoch": 0.8575438596491228, "grad_norm": 0.5836363434791565, "learning_rate": 0.00014639713074665798, "loss": 1.4794, "step": 1222 }, { "epoch": 0.8582456140350877, "grad_norm": 0.5317553877830505, "learning_rate": 0.00014633192044343006, "loss": 1.4291, "step": 1223 }, { "epoch": 0.8589473684210527, "grad_norm": 0.5595473647117615, "learning_rate": 0.00014626671014020216, "loss": 1.4724, "step": 1224 }, { "epoch": 0.8596491228070176, "grad_norm": 0.5390156507492065, "learning_rate": 0.00014620149983697424, "loss": 1.5976, "step": 1225 }, { "epoch": 0.8603508771929824, "grad_norm": 0.538212239742279, "learning_rate": 0.00014613628953374634, "loss": 1.4856, "step": 1226 }, { "epoch": 0.8610526315789474, "grad_norm": 0.5650454759597778, "learning_rate": 0.00014607107923051845, "loss": 1.6488, "step": 1227 }, { "epoch": 0.8617543859649123, "grad_norm": 0.6359328627586365, "learning_rate": 0.00014600586892729052, "loss": 1.8124, "step": 1228 }, { "epoch": 0.8624561403508771, "grad_norm": 0.5422840118408203, "learning_rate": 0.0001459406586240626, "loss": 1.5992, "step": 1229 }, { "epoch": 0.8631578947368421, "grad_norm": 0.5374161005020142, "learning_rate": 0.0001458754483208347, "loss": 1.4714, "step": 1230 }, { "epoch": 0.863859649122807, "grad_norm": 0.5436745882034302, "learning_rate": 0.00014581023801760678, "loss": 1.5375, "step": 1231 }, { "epoch": 0.864561403508772, "grad_norm": 0.5574515461921692, "learning_rate": 0.00014574502771437888, "loss": 1.4534, "step": 1232 }, { "epoch": 0.864561403508772, "eval_loss": 1.470949411392212, "eval_runtime": 65.8198, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1232 }, { "epoch": 0.8652631578947368, "grad_norm": 0.5197882652282715, "learning_rate": 0.00014567981741115099, "loss": 1.6359, "step": 1233 }, { "epoch": 0.8659649122807017, "grad_norm": 0.530919075012207, "learning_rate": 0.00014561460710792306, "loss": 1.4595, "step": 1234 }, { "epoch": 0.8666666666666667, "grad_norm": 0.5855174660682678, "learning_rate": 0.00014554939680469514, "loss": 1.5901, "step": 1235 }, { "epoch": 0.8673684210526316, "grad_norm": 0.5662688612937927, "learning_rate": 0.00014548418650146724, "loss": 1.4689, "step": 1236 }, { "epoch": 0.8680701754385964, "grad_norm": 0.5507927536964417, "learning_rate": 0.00014541897619823932, "loss": 1.5697, "step": 1237 }, { "epoch": 0.8687719298245614, "grad_norm": 0.5883938074111938, "learning_rate": 0.00014535376589501142, "loss": 1.6785, "step": 1238 }, { "epoch": 0.8694736842105263, "grad_norm": 0.5684956908226013, "learning_rate": 0.00014528855559178353, "loss": 1.5273, "step": 1239 }, { "epoch": 0.8701754385964913, "grad_norm": 0.5536518096923828, "learning_rate": 0.0001452233452885556, "loss": 1.413, "step": 1240 }, { "epoch": 0.8708771929824561, "grad_norm": 0.5402714610099792, "learning_rate": 0.00014515813498532768, "loss": 1.468, "step": 1241 }, { "epoch": 0.871578947368421, "grad_norm": 0.5639040470123291, "learning_rate": 0.00014509292468209978, "loss": 1.6112, "step": 1242 }, { "epoch": 0.872280701754386, "grad_norm": 0.5390848517417908, "learning_rate": 0.00014502771437887186, "loss": 1.5372, "step": 1243 }, { "epoch": 0.8729824561403509, "grad_norm": 0.5556238889694214, "learning_rate": 0.00014496250407564396, "loss": 1.5931, "step": 1244 }, { "epoch": 0.8736842105263158, "grad_norm": 0.5737256407737732, "learning_rate": 0.00014489729377241607, "loss": 1.4244, "step": 1245 }, { "epoch": 0.8743859649122807, "grad_norm": 0.5315448641777039, "learning_rate": 0.00014483208346918812, "loss": 1.5162, "step": 1246 }, { "epoch": 0.8750877192982456, "grad_norm": 0.5116593837738037, "learning_rate": 0.00014476687316596022, "loss": 1.4892, "step": 1247 }, { "epoch": 0.8757894736842106, "grad_norm": 0.6269446015357971, "learning_rate": 0.00014470166286273232, "loss": 1.7704, "step": 1248 }, { "epoch": 0.8757894736842106, "eval_loss": 1.4738192558288574, "eval_runtime": 65.8209, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1248 }, { "epoch": 0.8764912280701754, "grad_norm": 0.5659319758415222, "learning_rate": 0.0001446364525595044, "loss": 1.65, "step": 1249 }, { "epoch": 0.8771929824561403, "grad_norm": 0.5416236519813538, "learning_rate": 0.0001445712422562765, "loss": 1.5322, "step": 1250 }, { "epoch": 0.8778947368421053, "grad_norm": 0.557841956615448, "learning_rate": 0.00014450603195304858, "loss": 1.445, "step": 1251 }, { "epoch": 0.8785964912280702, "grad_norm": 0.543647825717926, "learning_rate": 0.00014444082164982066, "loss": 1.5263, "step": 1252 }, { "epoch": 0.8792982456140351, "grad_norm": 0.5999258160591125, "learning_rate": 0.00014437561134659276, "loss": 1.6295, "step": 1253 }, { "epoch": 0.88, "grad_norm": 0.4822601079940796, "learning_rate": 0.00014431040104336486, "loss": 1.4738, "step": 1254 }, { "epoch": 0.8807017543859649, "grad_norm": 0.5323823094367981, "learning_rate": 0.00014424519074013697, "loss": 1.5739, "step": 1255 }, { "epoch": 0.8814035087719299, "grad_norm": 0.5779425501823425, "learning_rate": 0.00014417998043690904, "loss": 1.4856, "step": 1256 }, { "epoch": 0.8821052631578947, "grad_norm": 0.547266960144043, "learning_rate": 0.00014411477013368112, "loss": 1.5976, "step": 1257 }, { "epoch": 0.8828070175438596, "grad_norm": 0.5390084981918335, "learning_rate": 0.00014404955983045322, "loss": 1.3723, "step": 1258 }, { "epoch": 0.8835087719298246, "grad_norm": 0.560590922832489, "learning_rate": 0.0001439843495272253, "loss": 1.5529, "step": 1259 }, { "epoch": 0.8842105263157894, "grad_norm": 0.6155678033828735, "learning_rate": 0.0001439191392239974, "loss": 1.6627, "step": 1260 }, { "epoch": 0.8849122807017544, "grad_norm": 0.5419203042984009, "learning_rate": 0.0001438539289207695, "loss": 1.5134, "step": 1261 }, { "epoch": 0.8856140350877193, "grad_norm": 0.5284544825553894, "learning_rate": 0.00014378871861754158, "loss": 1.5324, "step": 1262 }, { "epoch": 0.8863157894736842, "grad_norm": 0.5784335732460022, "learning_rate": 0.00014372350831431366, "loss": 1.4126, "step": 1263 }, { "epoch": 0.8870175438596491, "grad_norm": 0.5507122278213501, "learning_rate": 0.00014365829801108576, "loss": 1.4149, "step": 1264 }, { "epoch": 0.8870175438596491, "eval_loss": 1.4733964204788208, "eval_runtime": 65.8255, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1264 }, { "epoch": 0.887719298245614, "grad_norm": 0.537939190864563, "learning_rate": 0.00014359308770785784, "loss": 1.5286, "step": 1265 }, { "epoch": 0.888421052631579, "grad_norm": 0.5086240172386169, "learning_rate": 0.00014352787740462995, "loss": 1.4061, "step": 1266 }, { "epoch": 0.8891228070175439, "grad_norm": 0.5368987917900085, "learning_rate": 0.00014346266710140205, "loss": 1.6155, "step": 1267 }, { "epoch": 0.8898245614035087, "grad_norm": 0.549022376537323, "learning_rate": 0.00014339745679817413, "loss": 1.5586, "step": 1268 }, { "epoch": 0.8905263157894737, "grad_norm": 0.5592303276062012, "learning_rate": 0.0001433322464949462, "loss": 1.6811, "step": 1269 }, { "epoch": 0.8912280701754386, "grad_norm": 0.5138877034187317, "learning_rate": 0.0001432670361917183, "loss": 1.5199, "step": 1270 }, { "epoch": 0.8919298245614035, "grad_norm": 0.5340042114257812, "learning_rate": 0.00014320182588849038, "loss": 1.4878, "step": 1271 }, { "epoch": 0.8926315789473684, "grad_norm": 0.5377689599990845, "learning_rate": 0.00014313661558526249, "loss": 1.476, "step": 1272 }, { "epoch": 0.8933333333333333, "grad_norm": 0.5274060368537903, "learning_rate": 0.0001430714052820346, "loss": 1.4585, "step": 1273 }, { "epoch": 0.8940350877192983, "grad_norm": 0.5237327814102173, "learning_rate": 0.00014300619497880664, "loss": 1.4536, "step": 1274 }, { "epoch": 0.8947368421052632, "grad_norm": 0.5537549257278442, "learning_rate": 0.00014294098467557874, "loss": 1.5554, "step": 1275 }, { "epoch": 0.895438596491228, "grad_norm": 0.5253876447677612, "learning_rate": 0.00014287577437235085, "loss": 1.5553, "step": 1276 }, { "epoch": 0.896140350877193, "grad_norm": 0.514738142490387, "learning_rate": 0.00014281056406912292, "loss": 1.3378, "step": 1277 }, { "epoch": 0.8968421052631579, "grad_norm": 0.5640460252761841, "learning_rate": 0.00014274535376589503, "loss": 1.4867, "step": 1278 }, { "epoch": 0.8975438596491228, "grad_norm": 0.5499711036682129, "learning_rate": 0.0001426801434626671, "loss": 1.6168, "step": 1279 }, { "epoch": 0.8982456140350877, "grad_norm": 0.5841776132583618, "learning_rate": 0.00014261493315943918, "loss": 1.3695, "step": 1280 }, { "epoch": 0.8982456140350877, "eval_loss": 1.470279335975647, "eval_runtime": 65.82, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1280 }, { "epoch": 0.8989473684210526, "grad_norm": 0.6124866604804993, "learning_rate": 0.00014254972285621128, "loss": 1.5329, "step": 1281 }, { "epoch": 0.8996491228070176, "grad_norm": 0.5483736991882324, "learning_rate": 0.0001424845125529834, "loss": 1.4937, "step": 1282 }, { "epoch": 0.9003508771929825, "grad_norm": 0.5663347840309143, "learning_rate": 0.00014241930224975546, "loss": 1.5209, "step": 1283 }, { "epoch": 0.9010526315789473, "grad_norm": 0.5112165808677673, "learning_rate": 0.00014235409194652757, "loss": 1.473, "step": 1284 }, { "epoch": 0.9017543859649123, "grad_norm": 0.5460093021392822, "learning_rate": 0.00014228888164329964, "loss": 1.5318, "step": 1285 }, { "epoch": 0.9024561403508772, "grad_norm": 0.5104202032089233, "learning_rate": 0.00014222367134007172, "loss": 1.2813, "step": 1286 }, { "epoch": 0.9031578947368422, "grad_norm": 0.5612935423851013, "learning_rate": 0.00014215846103684382, "loss": 1.4802, "step": 1287 }, { "epoch": 0.903859649122807, "grad_norm": 0.630139172077179, "learning_rate": 0.00014209325073361593, "loss": 1.5457, "step": 1288 }, { "epoch": 0.9045614035087719, "grad_norm": 0.545143723487854, "learning_rate": 0.000142028040430388, "loss": 1.4164, "step": 1289 }, { "epoch": 0.9052631578947369, "grad_norm": 0.5486801862716675, "learning_rate": 0.0001419628301271601, "loss": 1.4483, "step": 1290 }, { "epoch": 0.9059649122807018, "grad_norm": 0.5840705037117004, "learning_rate": 0.00014189761982393218, "loss": 1.5334, "step": 1291 }, { "epoch": 0.9066666666666666, "grad_norm": 0.5083035230636597, "learning_rate": 0.00014183240952070426, "loss": 1.4347, "step": 1292 }, { "epoch": 0.9073684210526316, "grad_norm": 0.5289025902748108, "learning_rate": 0.00014176719921747636, "loss": 1.3791, "step": 1293 }, { "epoch": 0.9080701754385965, "grad_norm": 0.926874041557312, "learning_rate": 0.00014170198891424847, "loss": 1.6151, "step": 1294 }, { "epoch": 0.9087719298245615, "grad_norm": 0.5230058431625366, "learning_rate": 0.00014163677861102054, "loss": 1.5473, "step": 1295 }, { "epoch": 0.9094736842105263, "grad_norm": 0.5359719395637512, "learning_rate": 0.00014157156830779265, "loss": 1.5082, "step": 1296 }, { "epoch": 0.9094736842105263, "eval_loss": 1.469955325126648, "eval_runtime": 65.8256, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1296 }, { "epoch": 0.9101754385964912, "grad_norm": 0.5188238024711609, "learning_rate": 0.00014150635800456472, "loss": 1.5477, "step": 1297 }, { "epoch": 0.9108771929824562, "grad_norm": 0.5406233072280884, "learning_rate": 0.0001414411477013368, "loss": 1.4293, "step": 1298 }, { "epoch": 0.911578947368421, "grad_norm": 0.5747801065444946, "learning_rate": 0.0001413759373981089, "loss": 1.6004, "step": 1299 }, { "epoch": 0.9122807017543859, "grad_norm": 0.6011098027229309, "learning_rate": 0.000141310727094881, "loss": 1.6415, "step": 1300 }, { "epoch": 0.9129824561403509, "grad_norm": 0.558925211429596, "learning_rate": 0.0001412455167916531, "loss": 1.8118, "step": 1301 }, { "epoch": 0.9136842105263158, "grad_norm": 0.5044886469841003, "learning_rate": 0.00014118030648842516, "loss": 1.3841, "step": 1302 }, { "epoch": 0.9143859649122807, "grad_norm": 0.5560135841369629, "learning_rate": 0.00014111509618519726, "loss": 1.5118, "step": 1303 }, { "epoch": 0.9150877192982456, "grad_norm": 0.5280001163482666, "learning_rate": 0.00014104988588196937, "loss": 1.3482, "step": 1304 }, { "epoch": 0.9157894736842105, "grad_norm": 0.5125765800476074, "learning_rate": 0.00014098467557874145, "loss": 1.3188, "step": 1305 }, { "epoch": 0.9164912280701755, "grad_norm": 0.5489683747291565, "learning_rate": 0.00014091946527551355, "loss": 1.4921, "step": 1306 }, { "epoch": 0.9171929824561403, "grad_norm": 0.5081249475479126, "learning_rate": 0.00014085425497228563, "loss": 1.3464, "step": 1307 }, { "epoch": 0.9178947368421052, "grad_norm": 0.5606095194816589, "learning_rate": 0.0001407890446690577, "loss": 1.5814, "step": 1308 }, { "epoch": 0.9185964912280702, "grad_norm": 0.541929304599762, "learning_rate": 0.0001407238343658298, "loss": 1.4898, "step": 1309 }, { "epoch": 0.9192982456140351, "grad_norm": 0.5949207544326782, "learning_rate": 0.0001406586240626019, "loss": 1.5754, "step": 1310 }, { "epoch": 0.92, "grad_norm": 0.6357597708702087, "learning_rate": 0.00014059341375937399, "loss": 1.5619, "step": 1311 }, { "epoch": 0.9207017543859649, "grad_norm": 0.4799520671367645, "learning_rate": 0.0001405282034561461, "loss": 1.4275, "step": 1312 }, { "epoch": 0.9207017543859649, "eval_loss": 1.4672327041625977, "eval_runtime": 65.8127, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1312 }, { "epoch": 0.9214035087719298, "grad_norm": 0.5945150852203369, "learning_rate": 0.00014046299315291817, "loss": 1.5101, "step": 1313 }, { "epoch": 0.9221052631578948, "grad_norm": 0.5418935418128967, "learning_rate": 0.00014039778284969024, "loss": 1.3678, "step": 1314 }, { "epoch": 0.9228070175438596, "grad_norm": 0.5284458994865417, "learning_rate": 0.00014033257254646235, "loss": 1.4799, "step": 1315 }, { "epoch": 0.9235087719298246, "grad_norm": 0.5239183306694031, "learning_rate": 0.00014026736224323445, "loss": 1.4265, "step": 1316 }, { "epoch": 0.9242105263157895, "grad_norm": 0.5453318357467651, "learning_rate": 0.00014020215194000653, "loss": 1.6145, "step": 1317 }, { "epoch": 0.9249122807017544, "grad_norm": 0.5366153717041016, "learning_rate": 0.00014013694163677863, "loss": 1.5202, "step": 1318 }, { "epoch": 0.9256140350877193, "grad_norm": 0.5162118077278137, "learning_rate": 0.0001400717313335507, "loss": 1.4544, "step": 1319 }, { "epoch": 0.9263157894736842, "grad_norm": 0.522141695022583, "learning_rate": 0.00014000652103032278, "loss": 1.4928, "step": 1320 }, { "epoch": 0.9270175438596491, "grad_norm": 0.5702985525131226, "learning_rate": 0.00013994131072709489, "loss": 1.5626, "step": 1321 }, { "epoch": 0.927719298245614, "grad_norm": 0.5863943696022034, "learning_rate": 0.000139876100423867, "loss": 1.5545, "step": 1322 }, { "epoch": 0.9284210526315789, "grad_norm": 0.5446662306785583, "learning_rate": 0.00013981089012063907, "loss": 1.5399, "step": 1323 }, { "epoch": 0.9291228070175439, "grad_norm": 0.5459403395652771, "learning_rate": 0.00013974567981741117, "loss": 1.5286, "step": 1324 }, { "epoch": 0.9298245614035088, "grad_norm": 0.5816934108734131, "learning_rate": 0.00013968046951418325, "loss": 1.7282, "step": 1325 }, { "epoch": 0.9305263157894736, "grad_norm": 0.525581419467926, "learning_rate": 0.00013961525921095532, "loss": 1.4582, "step": 1326 }, { "epoch": 0.9312280701754386, "grad_norm": 0.5580085515975952, "learning_rate": 0.00013955004890772743, "loss": 1.3686, "step": 1327 }, { "epoch": 0.9319298245614035, "grad_norm": 0.5344337821006775, "learning_rate": 0.00013948483860449953, "loss": 1.4525, "step": 1328 }, { "epoch": 0.9319298245614035, "eval_loss": 1.4628639221191406, "eval_runtime": 65.8213, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1328 }, { "epoch": 0.9326315789473684, "grad_norm": 0.5185452699661255, "learning_rate": 0.0001394196283012716, "loss": 1.2977, "step": 1329 }, { "epoch": 0.9333333333333333, "grad_norm": 0.5791798233985901, "learning_rate": 0.00013935441799804368, "loss": 1.395, "step": 1330 }, { "epoch": 0.9340350877192982, "grad_norm": 0.617954432964325, "learning_rate": 0.0001392892076948158, "loss": 1.639, "step": 1331 }, { "epoch": 0.9347368421052632, "grad_norm": 0.5743063688278198, "learning_rate": 0.00013922399739158786, "loss": 1.5615, "step": 1332 }, { "epoch": 0.9354385964912281, "grad_norm": 0.60340416431427, "learning_rate": 0.00013915878708835997, "loss": 1.6969, "step": 1333 }, { "epoch": 0.9361403508771929, "grad_norm": 0.5096635818481445, "learning_rate": 0.00013909357678513207, "loss": 1.425, "step": 1334 }, { "epoch": 0.9368421052631579, "grad_norm": 0.5511636137962341, "learning_rate": 0.00013902836648190415, "loss": 1.5418, "step": 1335 }, { "epoch": 0.9375438596491228, "grad_norm": 0.5726730227470398, "learning_rate": 0.00013896315617867622, "loss": 1.4955, "step": 1336 }, { "epoch": 0.9382456140350878, "grad_norm": 0.5756725668907166, "learning_rate": 0.00013889794587544833, "loss": 1.4616, "step": 1337 }, { "epoch": 0.9389473684210526, "grad_norm": 0.5718767046928406, "learning_rate": 0.0001388327355722204, "loss": 1.5682, "step": 1338 }, { "epoch": 0.9396491228070175, "grad_norm": 0.5202625393867493, "learning_rate": 0.0001387675252689925, "loss": 1.585, "step": 1339 }, { "epoch": 0.9403508771929825, "grad_norm": 0.5889431238174438, "learning_rate": 0.0001387023149657646, "loss": 1.6715, "step": 1340 }, { "epoch": 0.9410526315789474, "grad_norm": 0.5576559901237488, "learning_rate": 0.0001386371046625367, "loss": 1.6181, "step": 1341 }, { "epoch": 0.9417543859649122, "grad_norm": 0.6135910749435425, "learning_rate": 0.00013857189435930876, "loss": 1.6069, "step": 1342 }, { "epoch": 0.9424561403508772, "grad_norm": 0.5227523446083069, "learning_rate": 0.00013850668405608087, "loss": 1.3836, "step": 1343 }, { "epoch": 0.9431578947368421, "grad_norm": 0.546610414981842, "learning_rate": 0.00013844147375285297, "loss": 1.473, "step": 1344 }, { "epoch": 0.9431578947368421, "eval_loss": 1.4644501209259033, "eval_runtime": 65.8163, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1344 }, { "epoch": 0.9438596491228071, "grad_norm": 0.5300866365432739, "learning_rate": 0.00013837626344962505, "loss": 1.5227, "step": 1345 }, { "epoch": 0.9445614035087719, "grad_norm": 0.5843913555145264, "learning_rate": 0.00013831105314639715, "loss": 1.6518, "step": 1346 }, { "epoch": 0.9452631578947368, "grad_norm": 0.5087118744850159, "learning_rate": 0.00013824584284316923, "loss": 1.4879, "step": 1347 }, { "epoch": 0.9459649122807018, "grad_norm": 0.5611976385116577, "learning_rate": 0.0001381806325399413, "loss": 1.5254, "step": 1348 }, { "epoch": 0.9466666666666667, "grad_norm": 0.5514966249465942, "learning_rate": 0.0001381154222367134, "loss": 1.4413, "step": 1349 }, { "epoch": 0.9473684210526315, "grad_norm": 0.5582779049873352, "learning_rate": 0.0001380502119334855, "loss": 1.6179, "step": 1350 }, { "epoch": 0.9480701754385965, "grad_norm": 0.5745506286621094, "learning_rate": 0.0001379850016302576, "loss": 1.5338, "step": 1351 }, { "epoch": 0.9487719298245614, "grad_norm": 0.548124372959137, "learning_rate": 0.0001379197913270297, "loss": 1.4274, "step": 1352 }, { "epoch": 0.9494736842105264, "grad_norm": 0.5343481302261353, "learning_rate": 0.00013785458102380177, "loss": 1.4293, "step": 1353 }, { "epoch": 0.9501754385964912, "grad_norm": 0.5399684309959412, "learning_rate": 0.00013778937072057385, "loss": 1.6639, "step": 1354 }, { "epoch": 0.9508771929824561, "grad_norm": 0.5420792102813721, "learning_rate": 0.00013772416041734595, "loss": 1.6034, "step": 1355 }, { "epoch": 0.9515789473684211, "grad_norm": 0.5921055674552917, "learning_rate": 0.00013765895011411805, "loss": 1.5864, "step": 1356 }, { "epoch": 0.952280701754386, "grad_norm": 0.59754878282547, "learning_rate": 0.00013759373981089013, "loss": 1.4384, "step": 1357 }, { "epoch": 0.9529824561403509, "grad_norm": 0.5537499189376831, "learning_rate": 0.0001375285295076622, "loss": 1.6341, "step": 1358 }, { "epoch": 0.9536842105263158, "grad_norm": 0.5400024056434631, "learning_rate": 0.0001374633192044343, "loss": 1.6376, "step": 1359 }, { "epoch": 0.9543859649122807, "grad_norm": 0.5441380143165588, "learning_rate": 0.00013739810890120639, "loss": 1.3068, "step": 1360 }, { "epoch": 0.9543859649122807, "eval_loss": 1.4653337001800537, "eval_runtime": 65.8227, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1360 }, { "epoch": 0.9550877192982457, "grad_norm": 0.5515937209129333, "learning_rate": 0.0001373328985979785, "loss": 1.5346, "step": 1361 }, { "epoch": 0.9557894736842105, "grad_norm": 0.5352932214736938, "learning_rate": 0.0001372676882947506, "loss": 1.4995, "step": 1362 }, { "epoch": 0.9564912280701754, "grad_norm": 0.5566843152046204, "learning_rate": 0.00013720247799152267, "loss": 1.5821, "step": 1363 }, { "epoch": 0.9571929824561404, "grad_norm": 0.5478880405426025, "learning_rate": 0.00013713726768829475, "loss": 1.5052, "step": 1364 }, { "epoch": 0.9578947368421052, "grad_norm": 0.547013521194458, "learning_rate": 0.00013707205738506685, "loss": 1.6191, "step": 1365 }, { "epoch": 0.9585964912280702, "grad_norm": 0.5814816951751709, "learning_rate": 0.00013700684708183893, "loss": 1.6168, "step": 1366 }, { "epoch": 0.9592982456140351, "grad_norm": 0.5655395984649658, "learning_rate": 0.00013694163677861103, "loss": 1.5193, "step": 1367 }, { "epoch": 0.96, "grad_norm": 0.5317264795303345, "learning_rate": 0.00013687642647538313, "loss": 1.5369, "step": 1368 }, { "epoch": 0.960701754385965, "grad_norm": 0.5512471795082092, "learning_rate": 0.0001368112161721552, "loss": 1.4861, "step": 1369 }, { "epoch": 0.9614035087719298, "grad_norm": 0.5386704802513123, "learning_rate": 0.0001367460058689273, "loss": 1.6125, "step": 1370 }, { "epoch": 0.9621052631578947, "grad_norm": 0.5215173959732056, "learning_rate": 0.0001366807955656994, "loss": 1.5365, "step": 1371 }, { "epoch": 0.9628070175438597, "grad_norm": 0.5143553018569946, "learning_rate": 0.00013661558526247147, "loss": 1.5275, "step": 1372 }, { "epoch": 0.9635087719298245, "grad_norm": 0.4954354465007782, "learning_rate": 0.00013655037495924357, "loss": 1.5203, "step": 1373 }, { "epoch": 0.9642105263157895, "grad_norm": 0.5579405426979065, "learning_rate": 0.00013648516465601567, "loss": 1.5393, "step": 1374 }, { "epoch": 0.9649122807017544, "grad_norm": 0.5346971154212952, "learning_rate": 0.00013641995435278775, "loss": 1.3443, "step": 1375 }, { "epoch": 0.9656140350877193, "grad_norm": 0.5336636304855347, "learning_rate": 0.00013635474404955983, "loss": 1.6275, "step": 1376 }, { "epoch": 0.9656140350877193, "eval_loss": 1.4607013463974, "eval_runtime": 65.8273, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.243, "step": 1376 }, { "epoch": 0.9663157894736842, "grad_norm": 0.5614655613899231, "learning_rate": 0.00013628953374633193, "loss": 1.4017, "step": 1377 }, { "epoch": 0.9670175438596491, "grad_norm": 0.6254684925079346, "learning_rate": 0.000136224323443104, "loss": 1.6443, "step": 1378 }, { "epoch": 0.9677192982456141, "grad_norm": 0.587593138217926, "learning_rate": 0.0001361591131398761, "loss": 1.604, "step": 1379 }, { "epoch": 0.968421052631579, "grad_norm": 0.5505099296569824, "learning_rate": 0.0001360939028366482, "loss": 1.5288, "step": 1380 }, { "epoch": 0.9691228070175438, "grad_norm": 0.556948721408844, "learning_rate": 0.00013602869253342026, "loss": 1.5344, "step": 1381 }, { "epoch": 0.9698245614035088, "grad_norm": 0.5183830857276917, "learning_rate": 0.00013596348223019237, "loss": 1.3491, "step": 1382 }, { "epoch": 0.9705263157894737, "grad_norm": 0.5576204657554626, "learning_rate": 0.00013589827192696447, "loss": 1.5426, "step": 1383 }, { "epoch": 0.9712280701754386, "grad_norm": 0.5600231289863586, "learning_rate": 0.00013583306162373655, "loss": 1.5411, "step": 1384 }, { "epoch": 0.9719298245614035, "grad_norm": 0.5508085489273071, "learning_rate": 0.00013576785132050865, "loss": 1.4882, "step": 1385 }, { "epoch": 0.9726315789473684, "grad_norm": 0.5481739640235901, "learning_rate": 0.00013570264101728073, "loss": 1.7498, "step": 1386 }, { "epoch": 0.9733333333333334, "grad_norm": 0.5554347038269043, "learning_rate": 0.00013563743071405283, "loss": 1.3597, "step": 1387 }, { "epoch": 0.9740350877192983, "grad_norm": 0.5449756979942322, "learning_rate": 0.0001355722204108249, "loss": 1.7922, "step": 1388 }, { "epoch": 0.9747368421052631, "grad_norm": 0.5169976353645325, "learning_rate": 0.000135507010107597, "loss": 1.4667, "step": 1389 }, { "epoch": 0.9754385964912281, "grad_norm": 0.5110458731651306, "learning_rate": 0.00013544179980436912, "loss": 1.5102, "step": 1390 }, { "epoch": 0.976140350877193, "grad_norm": 0.6121553778648376, "learning_rate": 0.0001353765895011412, "loss": 1.5965, "step": 1391 }, { "epoch": 0.9768421052631578, "grad_norm": 0.5615525245666504, "learning_rate": 0.00013531137919791327, "loss": 1.5691, "step": 1392 }, { "epoch": 0.9768421052631578, "eval_loss": 1.4648855924606323, "eval_runtime": 65.8158, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1392 }, { "epoch": 0.9775438596491228, "grad_norm": 0.48953384160995483, "learning_rate": 0.00013524616889468537, "loss": 1.3436, "step": 1393 }, { "epoch": 0.9782456140350877, "grad_norm": 0.5522525906562805, "learning_rate": 0.00013518095859145745, "loss": 1.5187, "step": 1394 }, { "epoch": 0.9789473684210527, "grad_norm": 0.5907886028289795, "learning_rate": 0.00013511574828822955, "loss": 1.5078, "step": 1395 }, { "epoch": 0.9796491228070175, "grad_norm": 0.5503443479537964, "learning_rate": 0.00013505053798500166, "loss": 1.4408, "step": 1396 }, { "epoch": 0.9803508771929824, "grad_norm": 0.5928301215171814, "learning_rate": 0.00013498532768177373, "loss": 1.5479, "step": 1397 }, { "epoch": 0.9810526315789474, "grad_norm": 0.5075883865356445, "learning_rate": 0.0001349201173785458, "loss": 1.4524, "step": 1398 }, { "epoch": 0.9817543859649123, "grad_norm": 0.5159602761268616, "learning_rate": 0.0001348549070753179, "loss": 1.3679, "step": 1399 }, { "epoch": 0.9824561403508771, "grad_norm": 0.5250809788703918, "learning_rate": 0.00013478969677209, "loss": 1.357, "step": 1400 }, { "epoch": 0.9831578947368421, "grad_norm": 0.5305306315422058, "learning_rate": 0.0001347244864688621, "loss": 1.4595, "step": 1401 }, { "epoch": 0.983859649122807, "grad_norm": 0.5421623587608337, "learning_rate": 0.0001346592761656342, "loss": 1.5184, "step": 1402 }, { "epoch": 0.984561403508772, "grad_norm": 0.5799669623374939, "learning_rate": 0.00013459406586240625, "loss": 1.4898, "step": 1403 }, { "epoch": 0.9852631578947368, "grad_norm": 0.5101059079170227, "learning_rate": 0.00013452885555917835, "loss": 1.3958, "step": 1404 }, { "epoch": 0.9859649122807017, "grad_norm": 0.6235257387161255, "learning_rate": 0.00013446364525595045, "loss": 1.5469, "step": 1405 }, { "epoch": 0.9866666666666667, "grad_norm": 0.5369657278060913, "learning_rate": 0.00013439843495272253, "loss": 1.5956, "step": 1406 }, { "epoch": 0.9873684210526316, "grad_norm": 0.576069712638855, "learning_rate": 0.00013433322464949463, "loss": 1.5538, "step": 1407 }, { "epoch": 0.9880701754385965, "grad_norm": 0.515951931476593, "learning_rate": 0.0001342680143462667, "loss": 1.5023, "step": 1408 }, { "epoch": 0.9880701754385965, "eval_loss": 1.4590816497802734, "eval_runtime": 65.8253, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1408 }, { "epoch": 0.9887719298245614, "grad_norm": 0.5294514298439026, "learning_rate": 0.0001342028040430388, "loss": 1.4488, "step": 1409 }, { "epoch": 0.9894736842105263, "grad_norm": 0.6072840094566345, "learning_rate": 0.0001341375937398109, "loss": 1.5603, "step": 1410 }, { "epoch": 0.9901754385964913, "grad_norm": 0.5944836735725403, "learning_rate": 0.000134072383436583, "loss": 1.5904, "step": 1411 }, { "epoch": 0.9908771929824561, "grad_norm": 0.5474660396575928, "learning_rate": 0.00013400717313335507, "loss": 1.4926, "step": 1412 }, { "epoch": 0.991578947368421, "grad_norm": 0.5764719247817993, "learning_rate": 0.00013394196283012717, "loss": 1.7046, "step": 1413 }, { "epoch": 0.992280701754386, "grad_norm": 0.5953339338302612, "learning_rate": 0.00013387675252689925, "loss": 1.5146, "step": 1414 }, { "epoch": 0.9929824561403509, "grad_norm": 0.5297874212265015, "learning_rate": 0.00013381154222367133, "loss": 1.476, "step": 1415 }, { "epoch": 0.9936842105263158, "grad_norm": 0.5325372815132141, "learning_rate": 0.00013374633192044343, "loss": 1.4169, "step": 1416 }, { "epoch": 0.9943859649122807, "grad_norm": 0.5061718821525574, "learning_rate": 0.00013368112161721553, "loss": 1.5318, "step": 1417 }, { "epoch": 0.9950877192982456, "grad_norm": 0.49733030796051025, "learning_rate": 0.0001336159113139876, "loss": 1.2793, "step": 1418 }, { "epoch": 0.9957894736842106, "grad_norm": 0.5987153053283691, "learning_rate": 0.00013355070101075972, "loss": 1.6366, "step": 1419 }, { "epoch": 0.9964912280701754, "grad_norm": 0.5505638122558594, "learning_rate": 0.0001334854907075318, "loss": 1.5346, "step": 1420 }, { "epoch": 0.9971929824561403, "grad_norm": 0.6379963159561157, "learning_rate": 0.00013342028040430387, "loss": 1.6315, "step": 1421 }, { "epoch": 0.9978947368421053, "grad_norm": 0.5537451505661011, "learning_rate": 0.00013335507010107597, "loss": 1.5021, "step": 1422 }, { "epoch": 0.9985964912280701, "grad_norm": 0.5499988794326782, "learning_rate": 0.00013328985979784808, "loss": 1.3855, "step": 1423 }, { "epoch": 0.9992982456140351, "grad_norm": 0.5662896037101746, "learning_rate": 0.00013322464949462015, "loss": 1.621, "step": 1424 }, { "epoch": 0.9992982456140351, "eval_loss": 1.4569982290267944, "eval_runtime": 65.8158, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.243, "step": 1424 } ], "logging_steps": 1, "max_steps": 1425, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 16, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.936883700662272e+16, "train_batch_size": 3, "trial_name": null, "trial_params": null }