{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 511, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019569471624266144, "grad_norm": 0.66796875, "learning_rate": 9.980430528375734e-06, "loss": 1.4461, "step": 1 }, { "epoch": 0.003913894324853229, "grad_norm": 0.6796875, "learning_rate": 9.960861056751468e-06, "loss": 1.3661, "step": 2 }, { "epoch": 0.005870841487279843, "grad_norm": 0.59765625, "learning_rate": 9.941291585127202e-06, "loss": 1.4274, "step": 3 }, { "epoch": 0.007827788649706457, "grad_norm": 0.486328125, "learning_rate": 9.921722113502935e-06, "loss": 1.3709, "step": 4 }, { "epoch": 0.009784735812133072, "grad_norm": 0.4609375, "learning_rate": 9.902152641878669e-06, "loss": 1.2909, "step": 5 }, { "epoch": 0.011741682974559686, "grad_norm": 0.466796875, "learning_rate": 9.882583170254404e-06, "loss": 1.4032, "step": 6 }, { "epoch": 0.0136986301369863, "grad_norm": 0.443359375, "learning_rate": 9.863013698630138e-06, "loss": 1.3758, "step": 7 }, { "epoch": 0.015655577299412915, "grad_norm": 0.37109375, "learning_rate": 9.843444227005872e-06, "loss": 1.2739, "step": 8 }, { "epoch": 0.01761252446183953, "grad_norm": 0.3359375, "learning_rate": 9.823874755381605e-06, "loss": 1.1982, "step": 9 }, { "epoch": 0.019569471624266144, "grad_norm": 0.328125, "learning_rate": 9.804305283757339e-06, "loss": 1.2455, "step": 10 }, { "epoch": 0.021526418786692758, "grad_norm": 0.330078125, "learning_rate": 9.784735812133073e-06, "loss": 1.2607, "step": 11 }, { "epoch": 0.023483365949119372, "grad_norm": 0.392578125, "learning_rate": 9.765166340508806e-06, "loss": 1.246, "step": 12 }, { "epoch": 0.025440313111545987, "grad_norm": 0.30078125, "learning_rate": 9.74559686888454e-06, "loss": 1.3415, "step": 13 }, { "epoch": 0.0273972602739726, "grad_norm": 0.33203125, "learning_rate": 9.726027397260275e-06, "loss": 1.2426, "step": 14 }, { "epoch": 0.029354207436399216, "grad_norm": 0.28125, "learning_rate": 9.706457925636007e-06, "loss": 1.1957, "step": 15 }, { "epoch": 0.03131115459882583, "grad_norm": 0.240234375, "learning_rate": 9.686888454011743e-06, "loss": 1.196, "step": 16 }, { "epoch": 0.033268101761252444, "grad_norm": 0.3359375, "learning_rate": 9.667318982387476e-06, "loss": 1.2096, "step": 17 }, { "epoch": 0.03522504892367906, "grad_norm": 0.2353515625, "learning_rate": 9.64774951076321e-06, "loss": 1.2594, "step": 18 }, { "epoch": 0.03718199608610567, "grad_norm": 0.228515625, "learning_rate": 9.628180039138944e-06, "loss": 1.169, "step": 19 }, { "epoch": 0.03913894324853229, "grad_norm": 0.2060546875, "learning_rate": 9.608610567514677e-06, "loss": 1.1443, "step": 20 }, { "epoch": 0.0410958904109589, "grad_norm": 0.2421875, "learning_rate": 9.589041095890411e-06, "loss": 1.1179, "step": 21 }, { "epoch": 0.043052837573385516, "grad_norm": 0.212890625, "learning_rate": 9.569471624266146e-06, "loss": 1.1259, "step": 22 }, { "epoch": 0.04500978473581213, "grad_norm": 0.201171875, "learning_rate": 9.549902152641878e-06, "loss": 1.154, "step": 23 }, { "epoch": 0.046966731898238745, "grad_norm": 0.2158203125, "learning_rate": 9.530332681017614e-06, "loss": 1.1163, "step": 24 }, { "epoch": 0.04892367906066536, "grad_norm": 0.21484375, "learning_rate": 9.510763209393347e-06, "loss": 1.0888, "step": 25 }, { "epoch": 0.050880626223091974, "grad_norm": 0.2109375, "learning_rate": 9.49119373776908e-06, "loss": 1.0933, "step": 26 }, { "epoch": 0.05283757338551859, "grad_norm": 0.2119140625, "learning_rate": 9.471624266144814e-06, "loss": 1.1238, "step": 27 }, { "epoch": 0.0547945205479452, "grad_norm": 0.1845703125, "learning_rate": 9.452054794520548e-06, "loss": 1.1178, "step": 28 }, { "epoch": 0.05675146771037182, "grad_norm": 0.19921875, "learning_rate": 9.432485322896282e-06, "loss": 1.1092, "step": 29 }, { "epoch": 0.05870841487279843, "grad_norm": 0.20703125, "learning_rate": 9.412915851272017e-06, "loss": 1.1798, "step": 30 }, { "epoch": 0.060665362035225046, "grad_norm": 0.189453125, "learning_rate": 9.393346379647749e-06, "loss": 1.0471, "step": 31 }, { "epoch": 0.06262230919765166, "grad_norm": 0.177734375, "learning_rate": 9.373776908023484e-06, "loss": 1.1234, "step": 32 }, { "epoch": 0.06457925636007827, "grad_norm": 0.1591796875, "learning_rate": 9.354207436399218e-06, "loss": 0.9983, "step": 33 }, { "epoch": 0.06653620352250489, "grad_norm": 0.1474609375, "learning_rate": 9.334637964774952e-06, "loss": 1.0824, "step": 34 }, { "epoch": 0.0684931506849315, "grad_norm": 0.169921875, "learning_rate": 9.315068493150685e-06, "loss": 1.0707, "step": 35 }, { "epoch": 0.07045009784735812, "grad_norm": 0.1533203125, "learning_rate": 9.295499021526419e-06, "loss": 1.0503, "step": 36 }, { "epoch": 0.07240704500978473, "grad_norm": 0.142578125, "learning_rate": 9.275929549902153e-06, "loss": 1.0387, "step": 37 }, { "epoch": 0.07436399217221135, "grad_norm": 0.158203125, "learning_rate": 9.256360078277888e-06, "loss": 1.0149, "step": 38 }, { "epoch": 0.07632093933463796, "grad_norm": 0.173828125, "learning_rate": 9.23679060665362e-06, "loss": 1.0664, "step": 39 }, { "epoch": 0.07827788649706457, "grad_norm": 0.1630859375, "learning_rate": 9.217221135029355e-06, "loss": 0.9682, "step": 40 }, { "epoch": 0.08023483365949119, "grad_norm": 0.1923828125, "learning_rate": 9.197651663405089e-06, "loss": 1.1108, "step": 41 }, { "epoch": 0.0821917808219178, "grad_norm": 0.16015625, "learning_rate": 9.178082191780823e-06, "loss": 1.0633, "step": 42 }, { "epoch": 0.08414872798434442, "grad_norm": 0.2060546875, "learning_rate": 9.158512720156556e-06, "loss": 0.9949, "step": 43 }, { "epoch": 0.08610567514677103, "grad_norm": 0.1376953125, "learning_rate": 9.13894324853229e-06, "loss": 0.9986, "step": 44 }, { "epoch": 0.08806262230919765, "grad_norm": 0.1953125, "learning_rate": 9.119373776908024e-06, "loss": 0.9542, "step": 45 }, { "epoch": 0.09001956947162426, "grad_norm": 0.1591796875, "learning_rate": 9.099804305283759e-06, "loss": 0.9736, "step": 46 }, { "epoch": 0.09197651663405088, "grad_norm": 0.16015625, "learning_rate": 9.080234833659491e-06, "loss": 1.0057, "step": 47 }, { "epoch": 0.09393346379647749, "grad_norm": 0.279296875, "learning_rate": 9.060665362035226e-06, "loss": 0.9697, "step": 48 }, { "epoch": 0.0958904109589041, "grad_norm": 0.1533203125, "learning_rate": 9.04109589041096e-06, "loss": 0.9534, "step": 49 }, { "epoch": 0.09784735812133072, "grad_norm": 0.14453125, "learning_rate": 9.021526418786694e-06, "loss": 0.9397, "step": 50 }, { "epoch": 0.09980430528375733, "grad_norm": 0.1416015625, "learning_rate": 9.001956947162427e-06, "loss": 0.9926, "step": 51 }, { "epoch": 0.10176125244618395, "grad_norm": 0.1455078125, "learning_rate": 8.982387475538161e-06, "loss": 0.9833, "step": 52 }, { "epoch": 0.10371819960861056, "grad_norm": 0.146484375, "learning_rate": 8.962818003913895e-06, "loss": 1.0001, "step": 53 }, { "epoch": 0.10567514677103718, "grad_norm": 0.1357421875, "learning_rate": 8.943248532289628e-06, "loss": 1.0054, "step": 54 }, { "epoch": 0.10763209393346379, "grad_norm": 0.1923828125, "learning_rate": 8.923679060665362e-06, "loss": 0.8902, "step": 55 }, { "epoch": 0.1095890410958904, "grad_norm": 0.15625, "learning_rate": 8.904109589041097e-06, "loss": 0.9753, "step": 56 }, { "epoch": 0.11154598825831702, "grad_norm": 0.1416015625, "learning_rate": 8.88454011741683e-06, "loss": 0.9192, "step": 57 }, { "epoch": 0.11350293542074363, "grad_norm": 0.1298828125, "learning_rate": 8.864970645792564e-06, "loss": 0.9731, "step": 58 }, { "epoch": 0.11545988258317025, "grad_norm": 0.140625, "learning_rate": 8.845401174168298e-06, "loss": 0.993, "step": 59 }, { "epoch": 0.11741682974559686, "grad_norm": 0.1171875, "learning_rate": 8.825831702544032e-06, "loss": 1.013, "step": 60 }, { "epoch": 0.11937377690802348, "grad_norm": 0.1357421875, "learning_rate": 8.806262230919765e-06, "loss": 0.9423, "step": 61 }, { "epoch": 0.12133072407045009, "grad_norm": 0.1357421875, "learning_rate": 8.786692759295499e-06, "loss": 0.9794, "step": 62 }, { "epoch": 0.1232876712328767, "grad_norm": 0.11572265625, "learning_rate": 8.767123287671233e-06, "loss": 0.9225, "step": 63 }, { "epoch": 0.12524461839530332, "grad_norm": 0.12158203125, "learning_rate": 8.747553816046968e-06, "loss": 0.9053, "step": 64 }, { "epoch": 0.12720156555772993, "grad_norm": 0.11767578125, "learning_rate": 8.7279843444227e-06, "loss": 0.9814, "step": 65 }, { "epoch": 0.12915851272015655, "grad_norm": 0.12890625, "learning_rate": 8.708414872798435e-06, "loss": 0.9611, "step": 66 }, { "epoch": 0.13111545988258316, "grad_norm": 0.12353515625, "learning_rate": 8.688845401174169e-06, "loss": 0.9405, "step": 67 }, { "epoch": 0.13307240704500978, "grad_norm": 0.146484375, "learning_rate": 8.669275929549903e-06, "loss": 0.8448, "step": 68 }, { "epoch": 0.1350293542074364, "grad_norm": 0.1171875, "learning_rate": 8.649706457925636e-06, "loss": 0.959, "step": 69 }, { "epoch": 0.136986301369863, "grad_norm": 0.130859375, "learning_rate": 8.63013698630137e-06, "loss": 0.978, "step": 70 }, { "epoch": 0.13894324853228962, "grad_norm": 0.12255859375, "learning_rate": 8.610567514677104e-06, "loss": 0.9405, "step": 71 }, { "epoch": 0.14090019569471623, "grad_norm": 0.11865234375, "learning_rate": 8.590998043052839e-06, "loss": 0.9356, "step": 72 }, { "epoch": 0.14285714285714285, "grad_norm": 0.1435546875, "learning_rate": 8.571428571428571e-06, "loss": 0.903, "step": 73 }, { "epoch": 0.14481409001956946, "grad_norm": 0.1337890625, "learning_rate": 8.551859099804306e-06, "loss": 0.9432, "step": 74 }, { "epoch": 0.14677103718199608, "grad_norm": 0.1220703125, "learning_rate": 8.53228962818004e-06, "loss": 0.9193, "step": 75 }, { "epoch": 0.1487279843444227, "grad_norm": 0.1337890625, "learning_rate": 8.512720156555774e-06, "loss": 0.9166, "step": 76 }, { "epoch": 0.1506849315068493, "grad_norm": 0.1279296875, "learning_rate": 8.493150684931507e-06, "loss": 0.9898, "step": 77 }, { "epoch": 0.15264187866927592, "grad_norm": 0.15234375, "learning_rate": 8.473581213307241e-06, "loss": 0.8916, "step": 78 }, { "epoch": 0.15459882583170254, "grad_norm": 0.1533203125, "learning_rate": 8.454011741682975e-06, "loss": 0.8989, "step": 79 }, { "epoch": 0.15655577299412915, "grad_norm": 0.1181640625, "learning_rate": 8.43444227005871e-06, "loss": 0.8994, "step": 80 }, { "epoch": 0.15851272015655576, "grad_norm": 0.119140625, "learning_rate": 8.414872798434442e-06, "loss": 0.9294, "step": 81 }, { "epoch": 0.16046966731898238, "grad_norm": 0.123046875, "learning_rate": 8.395303326810177e-06, "loss": 0.9239, "step": 82 }, { "epoch": 0.162426614481409, "grad_norm": 0.234375, "learning_rate": 8.37573385518591e-06, "loss": 0.9128, "step": 83 }, { "epoch": 0.1643835616438356, "grad_norm": 0.1396484375, "learning_rate": 8.356164383561644e-06, "loss": 0.8694, "step": 84 }, { "epoch": 0.16634050880626222, "grad_norm": 0.1279296875, "learning_rate": 8.336594911937378e-06, "loss": 0.8914, "step": 85 }, { "epoch": 0.16829745596868884, "grad_norm": 0.107421875, "learning_rate": 8.317025440313112e-06, "loss": 0.9794, "step": 86 }, { "epoch": 0.17025440313111545, "grad_norm": 0.119140625, "learning_rate": 8.297455968688845e-06, "loss": 0.8922, "step": 87 }, { "epoch": 0.17221135029354206, "grad_norm": 0.12451171875, "learning_rate": 8.27788649706458e-06, "loss": 0.9167, "step": 88 }, { "epoch": 0.17416829745596868, "grad_norm": 0.1279296875, "learning_rate": 8.258317025440313e-06, "loss": 0.8793, "step": 89 }, { "epoch": 0.1761252446183953, "grad_norm": 0.11669921875, "learning_rate": 8.238747553816048e-06, "loss": 0.9054, "step": 90 }, { "epoch": 0.1780821917808219, "grad_norm": 0.1220703125, "learning_rate": 8.219178082191782e-06, "loss": 0.9021, "step": 91 }, { "epoch": 0.18003913894324852, "grad_norm": 0.12255859375, "learning_rate": 8.199608610567515e-06, "loss": 0.858, "step": 92 }, { "epoch": 0.18199608610567514, "grad_norm": 0.1220703125, "learning_rate": 8.180039138943249e-06, "loss": 0.9246, "step": 93 }, { "epoch": 0.18395303326810175, "grad_norm": 0.11865234375, "learning_rate": 8.160469667318983e-06, "loss": 0.8801, "step": 94 }, { "epoch": 0.18590998043052837, "grad_norm": 0.162109375, "learning_rate": 8.140900195694716e-06, "loss": 0.8954, "step": 95 }, { "epoch": 0.18786692759295498, "grad_norm": 0.1171875, "learning_rate": 8.121330724070452e-06, "loss": 0.9247, "step": 96 }, { "epoch": 0.1898238747553816, "grad_norm": 0.1396484375, "learning_rate": 8.101761252446184e-06, "loss": 0.8993, "step": 97 }, { "epoch": 0.1917808219178082, "grad_norm": 0.1611328125, "learning_rate": 8.082191780821919e-06, "loss": 0.8513, "step": 98 }, { "epoch": 0.19373776908023482, "grad_norm": 0.1201171875, "learning_rate": 8.062622309197653e-06, "loss": 0.8881, "step": 99 }, { "epoch": 0.19569471624266144, "grad_norm": 0.125, "learning_rate": 8.043052837573386e-06, "loss": 0.8603, "step": 100 }, { "epoch": 0.19765166340508805, "grad_norm": 0.11572265625, "learning_rate": 8.02348336594912e-06, "loss": 0.9218, "step": 101 }, { "epoch": 0.19960861056751467, "grad_norm": 0.13671875, "learning_rate": 8.003913894324854e-06, "loss": 0.8472, "step": 102 }, { "epoch": 0.20156555772994128, "grad_norm": 0.1494140625, "learning_rate": 7.984344422700587e-06, "loss": 0.8988, "step": 103 }, { "epoch": 0.2035225048923679, "grad_norm": 0.1220703125, "learning_rate": 7.964774951076321e-06, "loss": 0.9051, "step": 104 }, { "epoch": 0.2054794520547945, "grad_norm": 0.11865234375, "learning_rate": 7.945205479452055e-06, "loss": 0.8947, "step": 105 }, { "epoch": 0.20743639921722112, "grad_norm": 0.12109375, "learning_rate": 7.92563600782779e-06, "loss": 0.8639, "step": 106 }, { "epoch": 0.20939334637964774, "grad_norm": 0.12109375, "learning_rate": 7.906066536203524e-06, "loss": 0.8954, "step": 107 }, { "epoch": 0.21135029354207435, "grad_norm": 0.125, "learning_rate": 7.886497064579257e-06, "loss": 0.8478, "step": 108 }, { "epoch": 0.21330724070450097, "grad_norm": 0.13671875, "learning_rate": 7.86692759295499e-06, "loss": 0.9215, "step": 109 }, { "epoch": 0.21526418786692758, "grad_norm": 0.12060546875, "learning_rate": 7.847358121330724e-06, "loss": 0.8932, "step": 110 }, { "epoch": 0.2172211350293542, "grad_norm": 0.140625, "learning_rate": 7.827788649706458e-06, "loss": 0.8828, "step": 111 }, { "epoch": 0.2191780821917808, "grad_norm": 0.1416015625, "learning_rate": 7.808219178082192e-06, "loss": 0.9027, "step": 112 }, { "epoch": 0.22113502935420742, "grad_norm": 0.1240234375, "learning_rate": 7.788649706457925e-06, "loss": 0.8689, "step": 113 }, { "epoch": 0.22309197651663404, "grad_norm": 0.1357421875, "learning_rate": 7.76908023483366e-06, "loss": 0.8872, "step": 114 }, { "epoch": 0.22504892367906065, "grad_norm": 0.1328125, "learning_rate": 7.749510763209393e-06, "loss": 0.7915, "step": 115 }, { "epoch": 0.22700587084148727, "grad_norm": 0.125, "learning_rate": 7.729941291585128e-06, "loss": 0.8166, "step": 116 }, { "epoch": 0.22896281800391388, "grad_norm": 0.1787109375, "learning_rate": 7.710371819960862e-06, "loss": 0.8751, "step": 117 }, { "epoch": 0.2309197651663405, "grad_norm": 0.1484375, "learning_rate": 7.690802348336595e-06, "loss": 0.9084, "step": 118 }, { "epoch": 0.2328767123287671, "grad_norm": 0.1435546875, "learning_rate": 7.671232876712329e-06, "loss": 0.8125, "step": 119 }, { "epoch": 0.23483365949119372, "grad_norm": 0.1416015625, "learning_rate": 7.651663405088063e-06, "loss": 0.7863, "step": 120 }, { "epoch": 0.23679060665362034, "grad_norm": 0.154296875, "learning_rate": 7.632093933463796e-06, "loss": 0.8586, "step": 121 }, { "epoch": 0.23874755381604695, "grad_norm": 0.1337890625, "learning_rate": 7.612524461839531e-06, "loss": 0.8224, "step": 122 }, { "epoch": 0.24070450097847357, "grad_norm": 0.1337890625, "learning_rate": 7.5929549902152645e-06, "loss": 0.8387, "step": 123 }, { "epoch": 0.24266144814090018, "grad_norm": 0.1181640625, "learning_rate": 7.573385518590999e-06, "loss": 0.9003, "step": 124 }, { "epoch": 0.2446183953033268, "grad_norm": 0.130859375, "learning_rate": 7.553816046966732e-06, "loss": 0.9102, "step": 125 }, { "epoch": 0.2465753424657534, "grad_norm": 0.130859375, "learning_rate": 7.534246575342466e-06, "loss": 0.8381, "step": 126 }, { "epoch": 0.24853228962818003, "grad_norm": 0.11328125, "learning_rate": 7.5146771037182e-06, "loss": 0.8954, "step": 127 }, { "epoch": 0.25048923679060664, "grad_norm": 0.13671875, "learning_rate": 7.4951076320939344e-06, "loss": 0.8644, "step": 128 }, { "epoch": 0.25244618395303325, "grad_norm": 0.125, "learning_rate": 7.475538160469667e-06, "loss": 0.847, "step": 129 }, { "epoch": 0.25440313111545987, "grad_norm": 0.1298828125, "learning_rate": 7.455968688845402e-06, "loss": 0.8124, "step": 130 }, { "epoch": 0.2563600782778865, "grad_norm": 0.1396484375, "learning_rate": 7.436399217221135e-06, "loss": 0.8837, "step": 131 }, { "epoch": 0.2583170254403131, "grad_norm": 0.1259765625, "learning_rate": 7.41682974559687e-06, "loss": 0.849, "step": 132 }, { "epoch": 0.2602739726027397, "grad_norm": 0.123046875, "learning_rate": 7.397260273972603e-06, "loss": 0.8032, "step": 133 }, { "epoch": 0.2622309197651663, "grad_norm": 0.1279296875, "learning_rate": 7.377690802348337e-06, "loss": 0.908, "step": 134 }, { "epoch": 0.26418786692759294, "grad_norm": 0.1572265625, "learning_rate": 7.358121330724071e-06, "loss": 0.8655, "step": 135 }, { "epoch": 0.26614481409001955, "grad_norm": 0.140625, "learning_rate": 7.338551859099805e-06, "loss": 0.8951, "step": 136 }, { "epoch": 0.26810176125244617, "grad_norm": 0.1376953125, "learning_rate": 7.318982387475538e-06, "loss": 0.8982, "step": 137 }, { "epoch": 0.2700587084148728, "grad_norm": 0.126953125, "learning_rate": 7.299412915851273e-06, "loss": 0.9128, "step": 138 }, { "epoch": 0.2720156555772994, "grad_norm": 0.138671875, "learning_rate": 7.279843444227006e-06, "loss": 0.8041, "step": 139 }, { "epoch": 0.273972602739726, "grad_norm": 0.12109375, "learning_rate": 7.260273972602741e-06, "loss": 0.9096, "step": 140 }, { "epoch": 0.2759295499021526, "grad_norm": 0.140625, "learning_rate": 7.240704500978474e-06, "loss": 0.8573, "step": 141 }, { "epoch": 0.27788649706457924, "grad_norm": 0.1318359375, "learning_rate": 7.221135029354208e-06, "loss": 0.8514, "step": 142 }, { "epoch": 0.27984344422700586, "grad_norm": 0.130859375, "learning_rate": 7.201565557729942e-06, "loss": 0.7995, "step": 143 }, { "epoch": 0.28180039138943247, "grad_norm": 0.12353515625, "learning_rate": 7.181996086105676e-06, "loss": 0.8932, "step": 144 }, { "epoch": 0.2837573385518591, "grad_norm": 0.1845703125, "learning_rate": 7.162426614481409e-06, "loss": 0.9044, "step": 145 }, { "epoch": 0.2857142857142857, "grad_norm": 0.1923828125, "learning_rate": 7.1428571428571436e-06, "loss": 0.9009, "step": 146 }, { "epoch": 0.2876712328767123, "grad_norm": 0.1318359375, "learning_rate": 7.123287671232877e-06, "loss": 0.8576, "step": 147 }, { "epoch": 0.2896281800391389, "grad_norm": 0.12451171875, "learning_rate": 7.103718199608612e-06, "loss": 0.8488, "step": 148 }, { "epoch": 0.29158512720156554, "grad_norm": 0.1376953125, "learning_rate": 7.0841487279843445e-06, "loss": 0.8388, "step": 149 }, { "epoch": 0.29354207436399216, "grad_norm": 0.1337890625, "learning_rate": 7.064579256360079e-06, "loss": 0.9016, "step": 150 }, { "epoch": 0.29549902152641877, "grad_norm": 0.126953125, "learning_rate": 7.045009784735813e-06, "loss": 0.8215, "step": 151 }, { "epoch": 0.2974559686888454, "grad_norm": 0.130859375, "learning_rate": 7.025440313111546e-06, "loss": 0.8762, "step": 152 }, { "epoch": 0.299412915851272, "grad_norm": 0.1279296875, "learning_rate": 7.00587084148728e-06, "loss": 0.8513, "step": 153 }, { "epoch": 0.3013698630136986, "grad_norm": 0.123046875, "learning_rate": 6.9863013698630145e-06, "loss": 0.8267, "step": 154 }, { "epoch": 0.30332681017612523, "grad_norm": 0.115234375, "learning_rate": 6.966731898238748e-06, "loss": 0.8657, "step": 155 }, { "epoch": 0.30528375733855184, "grad_norm": 0.13671875, "learning_rate": 6.947162426614482e-06, "loss": 0.8961, "step": 156 }, { "epoch": 0.30724070450097846, "grad_norm": 0.126953125, "learning_rate": 6.927592954990215e-06, "loss": 0.8809, "step": 157 }, { "epoch": 0.30919765166340507, "grad_norm": 0.1435546875, "learning_rate": 6.90802348336595e-06, "loss": 0.8914, "step": 158 }, { "epoch": 0.3111545988258317, "grad_norm": 0.12890625, "learning_rate": 6.8884540117416836e-06, "loss": 0.8824, "step": 159 }, { "epoch": 0.3131115459882583, "grad_norm": 0.1416015625, "learning_rate": 6.868884540117417e-06, "loss": 0.7989, "step": 160 }, { "epoch": 0.3150684931506849, "grad_norm": 0.12353515625, "learning_rate": 6.849315068493151e-06, "loss": 0.8252, "step": 161 }, { "epoch": 0.31702544031311153, "grad_norm": 0.138671875, "learning_rate": 6.829745596868885e-06, "loss": 0.9499, "step": 162 }, { "epoch": 0.31898238747553814, "grad_norm": 0.12451171875, "learning_rate": 6.810176125244618e-06, "loss": 0.8282, "step": 163 }, { "epoch": 0.32093933463796476, "grad_norm": 0.21484375, "learning_rate": 6.790606653620353e-06, "loss": 0.8443, "step": 164 }, { "epoch": 0.32289628180039137, "grad_norm": 0.125, "learning_rate": 6.771037181996086e-06, "loss": 0.862, "step": 165 }, { "epoch": 0.324853228962818, "grad_norm": 0.12890625, "learning_rate": 6.751467710371821e-06, "loss": 0.873, "step": 166 }, { "epoch": 0.3268101761252446, "grad_norm": 0.1689453125, "learning_rate": 6.731898238747554e-06, "loss": 0.7797, "step": 167 }, { "epoch": 0.3287671232876712, "grad_norm": 0.1318359375, "learning_rate": 6.712328767123288e-06, "loss": 0.8868, "step": 168 }, { "epoch": 0.33072407045009783, "grad_norm": 0.1337890625, "learning_rate": 6.692759295499022e-06, "loss": 0.8322, "step": 169 }, { "epoch": 0.33268101761252444, "grad_norm": 0.18359375, "learning_rate": 6.673189823874756e-06, "loss": 0.8408, "step": 170 }, { "epoch": 0.33463796477495106, "grad_norm": 0.1357421875, "learning_rate": 6.653620352250489e-06, "loss": 0.7677, "step": 171 }, { "epoch": 0.33659491193737767, "grad_norm": 0.150390625, "learning_rate": 6.634050880626224e-06, "loss": 0.9103, "step": 172 }, { "epoch": 0.3385518590998043, "grad_norm": 0.140625, "learning_rate": 6.614481409001957e-06, "loss": 0.8327, "step": 173 }, { "epoch": 0.3405088062622309, "grad_norm": 0.1328125, "learning_rate": 6.594911937377692e-06, "loss": 0.8531, "step": 174 }, { "epoch": 0.3424657534246575, "grad_norm": 0.138671875, "learning_rate": 6.5753424657534245e-06, "loss": 0.8673, "step": 175 }, { "epoch": 0.34442270058708413, "grad_norm": 0.134765625, "learning_rate": 6.555772994129159e-06, "loss": 0.828, "step": 176 }, { "epoch": 0.34637964774951074, "grad_norm": 0.140625, "learning_rate": 6.536203522504893e-06, "loss": 0.8719, "step": 177 }, { "epoch": 0.34833659491193736, "grad_norm": 0.1279296875, "learning_rate": 6.516634050880627e-06, "loss": 0.864, "step": 178 }, { "epoch": 0.350293542074364, "grad_norm": 0.12890625, "learning_rate": 6.49706457925636e-06, "loss": 0.8971, "step": 179 }, { "epoch": 0.3522504892367906, "grad_norm": 0.1220703125, "learning_rate": 6.4774951076320945e-06, "loss": 0.8787, "step": 180 }, { "epoch": 0.3542074363992172, "grad_norm": 0.1416015625, "learning_rate": 6.457925636007828e-06, "loss": 0.8259, "step": 181 }, { "epoch": 0.3561643835616438, "grad_norm": 0.212890625, "learning_rate": 6.438356164383563e-06, "loss": 0.8709, "step": 182 }, { "epoch": 0.35812133072407043, "grad_norm": 0.12109375, "learning_rate": 6.4187866927592954e-06, "loss": 0.8821, "step": 183 }, { "epoch": 0.36007827788649704, "grad_norm": 0.138671875, "learning_rate": 6.39921722113503e-06, "loss": 0.8212, "step": 184 }, { "epoch": 0.36203522504892366, "grad_norm": 1.5703125, "learning_rate": 6.379647749510764e-06, "loss": 0.874, "step": 185 }, { "epoch": 0.3639921722113503, "grad_norm": 0.23828125, "learning_rate": 6.360078277886498e-06, "loss": 0.8168, "step": 186 }, { "epoch": 0.3659491193737769, "grad_norm": 0.1455078125, "learning_rate": 6.340508806262231e-06, "loss": 0.8382, "step": 187 }, { "epoch": 0.3679060665362035, "grad_norm": 0.1669921875, "learning_rate": 6.320939334637965e-06, "loss": 0.8943, "step": 188 }, { "epoch": 0.3698630136986301, "grad_norm": 0.1298828125, "learning_rate": 6.301369863013699e-06, "loss": 0.8834, "step": 189 }, { "epoch": 0.37181996086105673, "grad_norm": 0.1484375, "learning_rate": 6.2818003913894335e-06, "loss": 0.8424, "step": 190 }, { "epoch": 0.37377690802348335, "grad_norm": 0.138671875, "learning_rate": 6.262230919765166e-06, "loss": 0.8496, "step": 191 }, { "epoch": 0.37573385518590996, "grad_norm": 0.1220703125, "learning_rate": 6.242661448140901e-06, "loss": 0.8941, "step": 192 }, { "epoch": 0.3776908023483366, "grad_norm": 0.1728515625, "learning_rate": 6.2230919765166345e-06, "loss": 0.8154, "step": 193 }, { "epoch": 0.3796477495107632, "grad_norm": 0.1484375, "learning_rate": 6.203522504892369e-06, "loss": 0.8654, "step": 194 }, { "epoch": 0.3816046966731898, "grad_norm": 0.2158203125, "learning_rate": 6.183953033268102e-06, "loss": 0.848, "step": 195 }, { "epoch": 0.3835616438356164, "grad_norm": 0.1416015625, "learning_rate": 6.164383561643836e-06, "loss": 0.8558, "step": 196 }, { "epoch": 0.38551859099804303, "grad_norm": 0.27734375, "learning_rate": 6.14481409001957e-06, "loss": 0.8601, "step": 197 }, { "epoch": 0.38747553816046965, "grad_norm": 0.1201171875, "learning_rate": 6.1252446183953044e-06, "loss": 0.8833, "step": 198 }, { "epoch": 0.38943248532289626, "grad_norm": 0.126953125, "learning_rate": 6.105675146771037e-06, "loss": 0.852, "step": 199 }, { "epoch": 0.3913894324853229, "grad_norm": 0.1298828125, "learning_rate": 6.086105675146772e-06, "loss": 0.8546, "step": 200 }, { "epoch": 0.3933463796477495, "grad_norm": 0.1494140625, "learning_rate": 6.066536203522505e-06, "loss": 0.9004, "step": 201 }, { "epoch": 0.3953033268101761, "grad_norm": 0.1318359375, "learning_rate": 6.046966731898239e-06, "loss": 0.8638, "step": 202 }, { "epoch": 0.3972602739726027, "grad_norm": 0.1328125, "learning_rate": 6.027397260273973e-06, "loss": 0.8235, "step": 203 }, { "epoch": 0.39921722113502933, "grad_norm": 0.14453125, "learning_rate": 6.007827788649707e-06, "loss": 0.7872, "step": 204 }, { "epoch": 0.40117416829745595, "grad_norm": 0.1484375, "learning_rate": 5.988258317025441e-06, "loss": 0.8676, "step": 205 }, { "epoch": 0.40313111545988256, "grad_norm": 0.171875, "learning_rate": 5.9686888454011745e-06, "loss": 0.8139, "step": 206 }, { "epoch": 0.4050880626223092, "grad_norm": 0.126953125, "learning_rate": 5.949119373776908e-06, "loss": 0.8064, "step": 207 }, { "epoch": 0.4070450097847358, "grad_norm": 0.1318359375, "learning_rate": 5.929549902152643e-06, "loss": 0.8975, "step": 208 }, { "epoch": 0.4090019569471624, "grad_norm": 0.1455078125, "learning_rate": 5.909980430528376e-06, "loss": 0.8096, "step": 209 }, { "epoch": 0.410958904109589, "grad_norm": 0.1416015625, "learning_rate": 5.89041095890411e-06, "loss": 0.7626, "step": 210 }, { "epoch": 0.41291585127201563, "grad_norm": 0.138671875, "learning_rate": 5.870841487279844e-06, "loss": 0.8408, "step": 211 }, { "epoch": 0.41487279843444225, "grad_norm": 0.12890625, "learning_rate": 5.851272015655578e-06, "loss": 0.8423, "step": 212 }, { "epoch": 0.41682974559686886, "grad_norm": 0.1513671875, "learning_rate": 5.831702544031311e-06, "loss": 0.8388, "step": 213 }, { "epoch": 0.4187866927592955, "grad_norm": 0.140625, "learning_rate": 5.812133072407045e-06, "loss": 0.8358, "step": 214 }, { "epoch": 0.4207436399217221, "grad_norm": 0.138671875, "learning_rate": 5.792563600782779e-06, "loss": 0.8598, "step": 215 }, { "epoch": 0.4227005870841487, "grad_norm": 0.16796875, "learning_rate": 5.7729941291585136e-06, "loss": 0.8662, "step": 216 }, { "epoch": 0.4246575342465753, "grad_norm": 0.1337890625, "learning_rate": 5.753424657534246e-06, "loss": 0.8337, "step": 217 }, { "epoch": 0.42661448140900193, "grad_norm": 0.140625, "learning_rate": 5.733855185909981e-06, "loss": 0.7651, "step": 218 }, { "epoch": 0.42857142857142855, "grad_norm": 0.1416015625, "learning_rate": 5.7142857142857145e-06, "loss": 0.8396, "step": 219 }, { "epoch": 0.43052837573385516, "grad_norm": 0.140625, "learning_rate": 5.694716242661449e-06, "loss": 0.8224, "step": 220 }, { "epoch": 0.4324853228962818, "grad_norm": 0.1416015625, "learning_rate": 5.675146771037182e-06, "loss": 0.8409, "step": 221 }, { "epoch": 0.4344422700587084, "grad_norm": 0.1376953125, "learning_rate": 5.655577299412916e-06, "loss": 0.8605, "step": 222 }, { "epoch": 0.436399217221135, "grad_norm": 0.1318359375, "learning_rate": 5.63600782778865e-06, "loss": 0.908, "step": 223 }, { "epoch": 0.4383561643835616, "grad_norm": 0.1474609375, "learning_rate": 5.6164383561643845e-06, "loss": 0.865, "step": 224 }, { "epoch": 0.44031311154598823, "grad_norm": 0.1337890625, "learning_rate": 5.596868884540117e-06, "loss": 0.7979, "step": 225 }, { "epoch": 0.44227005870841485, "grad_norm": 0.2021484375, "learning_rate": 5.577299412915852e-06, "loss": 0.7827, "step": 226 }, { "epoch": 0.44422700587084146, "grad_norm": 0.16015625, "learning_rate": 5.557729941291585e-06, "loss": 0.8459, "step": 227 }, { "epoch": 0.4461839530332681, "grad_norm": 0.1962890625, "learning_rate": 5.53816046966732e-06, "loss": 0.71, "step": 228 }, { "epoch": 0.4481409001956947, "grad_norm": 0.142578125, "learning_rate": 5.518590998043053e-06, "loss": 0.8219, "step": 229 }, { "epoch": 0.4500978473581213, "grad_norm": 0.1279296875, "learning_rate": 5.499021526418787e-06, "loss": 0.8574, "step": 230 }, { "epoch": 0.4520547945205479, "grad_norm": 0.17578125, "learning_rate": 5.479452054794521e-06, "loss": 0.8048, "step": 231 }, { "epoch": 0.45401174168297453, "grad_norm": 0.1455078125, "learning_rate": 5.459882583170255e-06, "loss": 0.8098, "step": 232 }, { "epoch": 0.45596868884540115, "grad_norm": 0.1484375, "learning_rate": 5.440313111545988e-06, "loss": 0.7238, "step": 233 }, { "epoch": 0.45792563600782776, "grad_norm": 0.1328125, "learning_rate": 5.420743639921723e-06, "loss": 0.815, "step": 234 }, { "epoch": 0.4598825831702544, "grad_norm": 0.1455078125, "learning_rate": 5.401174168297456e-06, "loss": 0.8157, "step": 235 }, { "epoch": 0.461839530332681, "grad_norm": 0.1552734375, "learning_rate": 5.381604696673191e-06, "loss": 0.8267, "step": 236 }, { "epoch": 0.4637964774951076, "grad_norm": 0.173828125, "learning_rate": 5.362035225048924e-06, "loss": 0.8532, "step": 237 }, { "epoch": 0.4657534246575342, "grad_norm": 0.1435546875, "learning_rate": 5.342465753424658e-06, "loss": 0.8053, "step": 238 }, { "epoch": 0.46771037181996084, "grad_norm": 0.1328125, "learning_rate": 5.322896281800392e-06, "loss": 0.7989, "step": 239 }, { "epoch": 0.46966731898238745, "grad_norm": 0.1357421875, "learning_rate": 5.303326810176126e-06, "loss": 0.8279, "step": 240 }, { "epoch": 0.47162426614481406, "grad_norm": 0.142578125, "learning_rate": 5.283757338551859e-06, "loss": 0.8052, "step": 241 }, { "epoch": 0.4735812133072407, "grad_norm": 0.1455078125, "learning_rate": 5.2641878669275936e-06, "loss": 0.8135, "step": 242 }, { "epoch": 0.4755381604696673, "grad_norm": 0.1591796875, "learning_rate": 5.244618395303327e-06, "loss": 0.837, "step": 243 }, { "epoch": 0.4774951076320939, "grad_norm": 0.1982421875, "learning_rate": 5.225048923679062e-06, "loss": 0.8033, "step": 244 }, { "epoch": 0.4794520547945205, "grad_norm": 0.140625, "learning_rate": 5.2054794520547945e-06, "loss": 0.8504, "step": 245 }, { "epoch": 0.48140900195694714, "grad_norm": 0.205078125, "learning_rate": 5.185909980430529e-06, "loss": 0.8732, "step": 246 }, { "epoch": 0.48336594911937375, "grad_norm": 0.1337890625, "learning_rate": 5.166340508806263e-06, "loss": 0.8369, "step": 247 }, { "epoch": 0.48532289628180036, "grad_norm": 0.1513671875, "learning_rate": 5.146771037181997e-06, "loss": 0.815, "step": 248 }, { "epoch": 0.487279843444227, "grad_norm": 0.138671875, "learning_rate": 5.12720156555773e-06, "loss": 0.8367, "step": 249 }, { "epoch": 0.4892367906066536, "grad_norm": 0.259765625, "learning_rate": 5.1076320939334645e-06, "loss": 0.8433, "step": 250 }, { "epoch": 0.4911937377690802, "grad_norm": 0.15625, "learning_rate": 5.088062622309198e-06, "loss": 0.8641, "step": 251 }, { "epoch": 0.4931506849315068, "grad_norm": 0.1357421875, "learning_rate": 5.068493150684932e-06, "loss": 0.8243, "step": 252 }, { "epoch": 0.49510763209393344, "grad_norm": 0.134765625, "learning_rate": 5.0489236790606654e-06, "loss": 0.8296, "step": 253 }, { "epoch": 0.49706457925636005, "grad_norm": 0.1533203125, "learning_rate": 5.0293542074364e-06, "loss": 0.8127, "step": 254 }, { "epoch": 0.49902152641878667, "grad_norm": 0.1640625, "learning_rate": 5.009784735812134e-06, "loss": 0.8214, "step": 255 }, { "epoch": 0.5009784735812133, "grad_norm": 0.1376953125, "learning_rate": 4.990215264187867e-06, "loss": 0.7784, "step": 256 }, { "epoch": 0.50293542074364, "grad_norm": 0.123046875, "learning_rate": 4.970645792563601e-06, "loss": 0.8956, "step": 257 }, { "epoch": 0.5048923679060665, "grad_norm": 0.142578125, "learning_rate": 4.9510763209393345e-06, "loss": 0.7886, "step": 258 }, { "epoch": 0.5068493150684932, "grad_norm": 0.1474609375, "learning_rate": 4.931506849315069e-06, "loss": 0.8058, "step": 259 }, { "epoch": 0.5088062622309197, "grad_norm": 0.14453125, "learning_rate": 4.911937377690803e-06, "loss": 0.8415, "step": 260 }, { "epoch": 0.5107632093933464, "grad_norm": 0.15234375, "learning_rate": 4.892367906066536e-06, "loss": 0.8112, "step": 261 }, { "epoch": 0.512720156555773, "grad_norm": 0.1357421875, "learning_rate": 4.87279843444227e-06, "loss": 0.8293, "step": 262 }, { "epoch": 0.5146771037181996, "grad_norm": 0.150390625, "learning_rate": 4.853228962818004e-06, "loss": 0.8278, "step": 263 }, { "epoch": 0.5166340508806262, "grad_norm": 0.1669921875, "learning_rate": 4.833659491193738e-06, "loss": 0.8316, "step": 264 }, { "epoch": 0.5185909980430529, "grad_norm": 0.1396484375, "learning_rate": 4.814090019569472e-06, "loss": 0.8299, "step": 265 }, { "epoch": 0.5205479452054794, "grad_norm": 0.1787109375, "learning_rate": 4.7945205479452054e-06, "loss": 0.8322, "step": 266 }, { "epoch": 0.5225048923679061, "grad_norm": 0.1435546875, "learning_rate": 4.774951076320939e-06, "loss": 0.7967, "step": 267 }, { "epoch": 0.5244618395303327, "grad_norm": 0.1455078125, "learning_rate": 4.755381604696674e-06, "loss": 0.8079, "step": 268 }, { "epoch": 0.5264187866927593, "grad_norm": 0.154296875, "learning_rate": 4.735812133072407e-06, "loss": 0.8257, "step": 269 }, { "epoch": 0.5283757338551859, "grad_norm": 0.1513671875, "learning_rate": 4.716242661448141e-06, "loss": 0.8509, "step": 270 }, { "epoch": 0.5303326810176126, "grad_norm": 0.150390625, "learning_rate": 4.6966731898238745e-06, "loss": 0.8002, "step": 271 }, { "epoch": 0.5322896281800391, "grad_norm": 0.1337890625, "learning_rate": 4.677103718199609e-06, "loss": 0.8707, "step": 272 }, { "epoch": 0.5342465753424658, "grad_norm": 0.130859375, "learning_rate": 4.657534246575343e-06, "loss": 0.8337, "step": 273 }, { "epoch": 0.5362035225048923, "grad_norm": 0.150390625, "learning_rate": 4.637964774951076e-06, "loss": 0.7674, "step": 274 }, { "epoch": 0.538160469667319, "grad_norm": 0.13671875, "learning_rate": 4.61839530332681e-06, "loss": 0.8229, "step": 275 }, { "epoch": 0.5401174168297456, "grad_norm": 0.1494140625, "learning_rate": 4.5988258317025445e-06, "loss": 0.8068, "step": 276 }, { "epoch": 0.5420743639921722, "grad_norm": 0.13671875, "learning_rate": 4.579256360078278e-06, "loss": 0.817, "step": 277 }, { "epoch": 0.5440313111545988, "grad_norm": 0.1220703125, "learning_rate": 4.559686888454012e-06, "loss": 0.9284, "step": 278 }, { "epoch": 0.5459882583170255, "grad_norm": 0.25390625, "learning_rate": 4.5401174168297455e-06, "loss": 0.8302, "step": 279 }, { "epoch": 0.547945205479452, "grad_norm": 0.1533203125, "learning_rate": 4.52054794520548e-06, "loss": 0.7787, "step": 280 }, { "epoch": 0.5499021526418787, "grad_norm": 0.1376953125, "learning_rate": 4.500978473581214e-06, "loss": 0.8362, "step": 281 }, { "epoch": 0.5518590998043053, "grad_norm": 0.2255859375, "learning_rate": 4.481409001956947e-06, "loss": 0.8446, "step": 282 }, { "epoch": 0.5538160469667319, "grad_norm": 0.1484375, "learning_rate": 4.461839530332681e-06, "loss": 0.7899, "step": 283 }, { "epoch": 0.5557729941291585, "grad_norm": 0.1474609375, "learning_rate": 4.442270058708415e-06, "loss": 0.8018, "step": 284 }, { "epoch": 0.5577299412915852, "grad_norm": 0.150390625, "learning_rate": 4.422700587084149e-06, "loss": 0.8233, "step": 285 }, { "epoch": 0.5596868884540117, "grad_norm": 0.12890625, "learning_rate": 4.403131115459883e-06, "loss": 0.8734, "step": 286 }, { "epoch": 0.5616438356164384, "grad_norm": 0.1318359375, "learning_rate": 4.383561643835616e-06, "loss": 0.837, "step": 287 }, { "epoch": 0.5636007827788649, "grad_norm": 0.1416015625, "learning_rate": 4.36399217221135e-06, "loss": 0.8116, "step": 288 }, { "epoch": 0.5655577299412916, "grad_norm": 0.1455078125, "learning_rate": 4.3444227005870845e-06, "loss": 0.8029, "step": 289 }, { "epoch": 0.5675146771037182, "grad_norm": 0.345703125, "learning_rate": 4.324853228962818e-06, "loss": 0.7608, "step": 290 }, { "epoch": 0.5694716242661448, "grad_norm": 0.1416015625, "learning_rate": 4.305283757338552e-06, "loss": 0.7919, "step": 291 }, { "epoch": 0.5714285714285714, "grad_norm": 0.1533203125, "learning_rate": 4.2857142857142855e-06, "loss": 0.7834, "step": 292 }, { "epoch": 0.5733855185909981, "grad_norm": 0.1337890625, "learning_rate": 4.26614481409002e-06, "loss": 0.8632, "step": 293 }, { "epoch": 0.5753424657534246, "grad_norm": 0.142578125, "learning_rate": 4.246575342465754e-06, "loss": 0.8458, "step": 294 }, { "epoch": 0.5772994129158513, "grad_norm": 0.138671875, "learning_rate": 4.227005870841487e-06, "loss": 0.8095, "step": 295 }, { "epoch": 0.5792563600782779, "grad_norm": 0.1357421875, "learning_rate": 4.207436399217221e-06, "loss": 0.7865, "step": 296 }, { "epoch": 0.5812133072407045, "grad_norm": 0.14453125, "learning_rate": 4.187866927592955e-06, "loss": 0.7765, "step": 297 }, { "epoch": 0.5831702544031311, "grad_norm": 0.15234375, "learning_rate": 4.168297455968689e-06, "loss": 0.7958, "step": 298 }, { "epoch": 0.5851272015655578, "grad_norm": 0.140625, "learning_rate": 4.148727984344423e-06, "loss": 0.8319, "step": 299 }, { "epoch": 0.5870841487279843, "grad_norm": 0.138671875, "learning_rate": 4.129158512720156e-06, "loss": 0.7867, "step": 300 }, { "epoch": 0.589041095890411, "grad_norm": 0.14453125, "learning_rate": 4.109589041095891e-06, "loss": 0.89, "step": 301 }, { "epoch": 0.5909980430528375, "grad_norm": 0.15234375, "learning_rate": 4.0900195694716245e-06, "loss": 0.7982, "step": 302 }, { "epoch": 0.5929549902152642, "grad_norm": 0.138671875, "learning_rate": 4.070450097847358e-06, "loss": 0.7748, "step": 303 }, { "epoch": 0.5949119373776908, "grad_norm": 0.1484375, "learning_rate": 4.050880626223092e-06, "loss": 0.7973, "step": 304 }, { "epoch": 0.5968688845401174, "grad_norm": 0.193359375, "learning_rate": 4.031311154598826e-06, "loss": 0.7829, "step": 305 }, { "epoch": 0.598825831702544, "grad_norm": 0.1474609375, "learning_rate": 4.01174168297456e-06, "loss": 0.8327, "step": 306 }, { "epoch": 0.6007827788649707, "grad_norm": 0.1630859375, "learning_rate": 3.992172211350294e-06, "loss": 0.8311, "step": 307 }, { "epoch": 0.6027397260273972, "grad_norm": 0.140625, "learning_rate": 3.972602739726027e-06, "loss": 0.8463, "step": 308 }, { "epoch": 0.6046966731898239, "grad_norm": 0.15625, "learning_rate": 3.953033268101762e-06, "loss": 0.8147, "step": 309 }, { "epoch": 0.6066536203522505, "grad_norm": 0.134765625, "learning_rate": 3.933463796477495e-06, "loss": 0.809, "step": 310 }, { "epoch": 0.6086105675146771, "grad_norm": 0.21484375, "learning_rate": 3.913894324853229e-06, "loss": 0.7249, "step": 311 }, { "epoch": 0.6105675146771037, "grad_norm": 0.1640625, "learning_rate": 3.894324853228963e-06, "loss": 0.7849, "step": 312 }, { "epoch": 0.6125244618395304, "grad_norm": 0.1474609375, "learning_rate": 3.874755381604696e-06, "loss": 0.8829, "step": 313 }, { "epoch": 0.6144814090019569, "grad_norm": 0.14453125, "learning_rate": 3.855185909980431e-06, "loss": 0.816, "step": 314 }, { "epoch": 0.6164383561643836, "grad_norm": 0.1298828125, "learning_rate": 3.8356164383561645e-06, "loss": 0.8552, "step": 315 }, { "epoch": 0.6183953033268101, "grad_norm": 0.1357421875, "learning_rate": 3.816046966731898e-06, "loss": 0.8737, "step": 316 }, { "epoch": 0.6203522504892368, "grad_norm": 0.1337890625, "learning_rate": 3.7964774951076322e-06, "loss": 0.8471, "step": 317 }, { "epoch": 0.6223091976516634, "grad_norm": 0.173828125, "learning_rate": 3.776908023483366e-06, "loss": 0.7774, "step": 318 }, { "epoch": 0.62426614481409, "grad_norm": 0.138671875, "learning_rate": 3.7573385518591e-06, "loss": 0.8058, "step": 319 }, { "epoch": 0.6262230919765166, "grad_norm": 0.1689453125, "learning_rate": 3.7377690802348336e-06, "loss": 0.7918, "step": 320 }, { "epoch": 0.6281800391389433, "grad_norm": 0.1591796875, "learning_rate": 3.7181996086105677e-06, "loss": 0.824, "step": 321 }, { "epoch": 0.6301369863013698, "grad_norm": 0.1953125, "learning_rate": 3.6986301369863014e-06, "loss": 0.8411, "step": 322 }, { "epoch": 0.6320939334637965, "grad_norm": 0.1484375, "learning_rate": 3.6790606653620354e-06, "loss": 0.7759, "step": 323 }, { "epoch": 0.6340508806262231, "grad_norm": 0.138671875, "learning_rate": 3.659491193737769e-06, "loss": 0.8138, "step": 324 }, { "epoch": 0.6360078277886497, "grad_norm": 0.17578125, "learning_rate": 3.639921722113503e-06, "loss": 0.7988, "step": 325 }, { "epoch": 0.6379647749510763, "grad_norm": 0.171875, "learning_rate": 3.620352250489237e-06, "loss": 0.742, "step": 326 }, { "epoch": 0.639921722113503, "grad_norm": 0.1396484375, "learning_rate": 3.600782778864971e-06, "loss": 0.843, "step": 327 }, { "epoch": 0.6418786692759295, "grad_norm": 0.1435546875, "learning_rate": 3.5812133072407045e-06, "loss": 0.7595, "step": 328 }, { "epoch": 0.6438356164383562, "grad_norm": 0.142578125, "learning_rate": 3.5616438356164386e-06, "loss": 0.8492, "step": 329 }, { "epoch": 0.6457925636007827, "grad_norm": 0.1376953125, "learning_rate": 3.5420743639921723e-06, "loss": 0.7913, "step": 330 }, { "epoch": 0.6477495107632094, "grad_norm": 0.197265625, "learning_rate": 3.5225048923679063e-06, "loss": 0.7671, "step": 331 }, { "epoch": 0.649706457925636, "grad_norm": 0.1474609375, "learning_rate": 3.50293542074364e-06, "loss": 0.7292, "step": 332 }, { "epoch": 0.6516634050880626, "grad_norm": 0.158203125, "learning_rate": 3.483365949119374e-06, "loss": 0.8076, "step": 333 }, { "epoch": 0.6536203522504892, "grad_norm": 0.1669921875, "learning_rate": 3.4637964774951077e-06, "loss": 0.7857, "step": 334 }, { "epoch": 0.6555772994129159, "grad_norm": 0.1376953125, "learning_rate": 3.4442270058708418e-06, "loss": 0.7836, "step": 335 }, { "epoch": 0.6575342465753424, "grad_norm": 0.146484375, "learning_rate": 3.4246575342465754e-06, "loss": 0.8336, "step": 336 }, { "epoch": 0.6594911937377691, "grad_norm": 0.146484375, "learning_rate": 3.405088062622309e-06, "loss": 0.8058, "step": 337 }, { "epoch": 0.6614481409001957, "grad_norm": 0.1240234375, "learning_rate": 3.385518590998043e-06, "loss": 0.8918, "step": 338 }, { "epoch": 0.6634050880626223, "grad_norm": 0.16015625, "learning_rate": 3.365949119373777e-06, "loss": 0.7775, "step": 339 }, { "epoch": 0.6653620352250489, "grad_norm": 0.1669921875, "learning_rate": 3.346379647749511e-06, "loss": 0.7522, "step": 340 }, { "epoch": 0.6673189823874756, "grad_norm": 0.1650390625, "learning_rate": 3.3268101761252445e-06, "loss": 0.7652, "step": 341 }, { "epoch": 0.6692759295499021, "grad_norm": 0.1376953125, "learning_rate": 3.3072407045009786e-06, "loss": 0.8224, "step": 342 }, { "epoch": 0.6712328767123288, "grad_norm": 0.18359375, "learning_rate": 3.2876712328767123e-06, "loss": 0.7782, "step": 343 }, { "epoch": 0.6731898238747553, "grad_norm": 0.1259765625, "learning_rate": 3.2681017612524463e-06, "loss": 0.8399, "step": 344 }, { "epoch": 0.675146771037182, "grad_norm": 0.181640625, "learning_rate": 3.24853228962818e-06, "loss": 0.8376, "step": 345 }, { "epoch": 0.6771037181996086, "grad_norm": 0.1689453125, "learning_rate": 3.228962818003914e-06, "loss": 0.8122, "step": 346 }, { "epoch": 0.6790606653620352, "grad_norm": 0.1689453125, "learning_rate": 3.2093933463796477e-06, "loss": 0.8152, "step": 347 }, { "epoch": 0.6810176125244618, "grad_norm": 0.15234375, "learning_rate": 3.189823874755382e-06, "loss": 0.822, "step": 348 }, { "epoch": 0.6829745596868885, "grad_norm": 0.1474609375, "learning_rate": 3.1702544031311154e-06, "loss": 0.773, "step": 349 }, { "epoch": 0.684931506849315, "grad_norm": 0.197265625, "learning_rate": 3.1506849315068495e-06, "loss": 0.6956, "step": 350 }, { "epoch": 0.6868884540117417, "grad_norm": 0.14453125, "learning_rate": 3.131115459882583e-06, "loss": 0.8239, "step": 351 }, { "epoch": 0.6888454011741683, "grad_norm": 0.1474609375, "learning_rate": 3.1115459882583172e-06, "loss": 0.8126, "step": 352 }, { "epoch": 0.6908023483365949, "grad_norm": 0.1455078125, "learning_rate": 3.091976516634051e-06, "loss": 0.8178, "step": 353 }, { "epoch": 0.6927592954990215, "grad_norm": 0.1435546875, "learning_rate": 3.072407045009785e-06, "loss": 0.902, "step": 354 }, { "epoch": 0.6947162426614482, "grad_norm": 0.15625, "learning_rate": 3.0528375733855186e-06, "loss": 0.7964, "step": 355 }, { "epoch": 0.6966731898238747, "grad_norm": 0.1494140625, "learning_rate": 3.0332681017612527e-06, "loss": 0.7898, "step": 356 }, { "epoch": 0.6986301369863014, "grad_norm": 0.15234375, "learning_rate": 3.0136986301369864e-06, "loss": 0.7629, "step": 357 }, { "epoch": 0.700587084148728, "grad_norm": 0.142578125, "learning_rate": 2.9941291585127204e-06, "loss": 0.8343, "step": 358 }, { "epoch": 0.7025440313111546, "grad_norm": 0.12890625, "learning_rate": 2.974559686888454e-06, "loss": 0.872, "step": 359 }, { "epoch": 0.7045009784735812, "grad_norm": 0.138671875, "learning_rate": 2.954990215264188e-06, "loss": 0.8317, "step": 360 }, { "epoch": 0.7064579256360078, "grad_norm": 0.1455078125, "learning_rate": 2.935420743639922e-06, "loss": 0.8259, "step": 361 }, { "epoch": 0.7084148727984344, "grad_norm": 0.158203125, "learning_rate": 2.9158512720156555e-06, "loss": 0.7376, "step": 362 }, { "epoch": 0.7103718199608611, "grad_norm": 0.1455078125, "learning_rate": 2.8962818003913895e-06, "loss": 0.7962, "step": 363 }, { "epoch": 0.7123287671232876, "grad_norm": 0.138671875, "learning_rate": 2.876712328767123e-06, "loss": 0.8058, "step": 364 }, { "epoch": 0.7142857142857143, "grad_norm": 0.15625, "learning_rate": 2.8571428571428573e-06, "loss": 0.8273, "step": 365 }, { "epoch": 0.7162426614481409, "grad_norm": 0.142578125, "learning_rate": 2.837573385518591e-06, "loss": 0.7809, "step": 366 }, { "epoch": 0.7181996086105675, "grad_norm": 0.138671875, "learning_rate": 2.818003913894325e-06, "loss": 0.7872, "step": 367 }, { "epoch": 0.7201565557729941, "grad_norm": 0.1435546875, "learning_rate": 2.7984344422700586e-06, "loss": 0.8785, "step": 368 }, { "epoch": 0.7221135029354208, "grad_norm": 0.1484375, "learning_rate": 2.7788649706457927e-06, "loss": 0.8299, "step": 369 }, { "epoch": 0.7240704500978473, "grad_norm": 0.146484375, "learning_rate": 2.7592954990215264e-06, "loss": 0.8026, "step": 370 }, { "epoch": 0.726027397260274, "grad_norm": 0.154296875, "learning_rate": 2.7397260273972604e-06, "loss": 0.7787, "step": 371 }, { "epoch": 0.7279843444227005, "grad_norm": 0.173828125, "learning_rate": 2.720156555772994e-06, "loss": 0.7265, "step": 372 }, { "epoch": 0.7299412915851272, "grad_norm": 0.1513671875, "learning_rate": 2.700587084148728e-06, "loss": 0.7992, "step": 373 }, { "epoch": 0.7318982387475538, "grad_norm": 0.1875, "learning_rate": 2.681017612524462e-06, "loss": 0.7495, "step": 374 }, { "epoch": 0.7338551859099804, "grad_norm": 0.150390625, "learning_rate": 2.661448140900196e-06, "loss": 0.8236, "step": 375 }, { "epoch": 0.735812133072407, "grad_norm": 0.13671875, "learning_rate": 2.6418786692759295e-06, "loss": 0.8181, "step": 376 }, { "epoch": 0.7377690802348337, "grad_norm": 0.13671875, "learning_rate": 2.6223091976516636e-06, "loss": 0.7977, "step": 377 }, { "epoch": 0.7397260273972602, "grad_norm": 0.1435546875, "learning_rate": 2.6027397260273973e-06, "loss": 0.7887, "step": 378 }, { "epoch": 0.7416829745596869, "grad_norm": 0.1357421875, "learning_rate": 2.5831702544031313e-06, "loss": 0.8723, "step": 379 }, { "epoch": 0.7436399217221135, "grad_norm": 0.1396484375, "learning_rate": 2.563600782778865e-06, "loss": 0.8492, "step": 380 }, { "epoch": 0.7455968688845401, "grad_norm": 0.1435546875, "learning_rate": 2.544031311154599e-06, "loss": 0.7904, "step": 381 }, { "epoch": 0.7475538160469667, "grad_norm": 0.150390625, "learning_rate": 2.5244618395303327e-06, "loss": 0.7835, "step": 382 }, { "epoch": 0.7495107632093934, "grad_norm": 0.1650390625, "learning_rate": 2.504892367906067e-06, "loss": 0.86, "step": 383 }, { "epoch": 0.7514677103718199, "grad_norm": 0.142578125, "learning_rate": 2.4853228962818004e-06, "loss": 0.8163, "step": 384 }, { "epoch": 0.7534246575342466, "grad_norm": 0.1416015625, "learning_rate": 2.4657534246575345e-06, "loss": 0.7965, "step": 385 }, { "epoch": 0.7553816046966731, "grad_norm": 0.13671875, "learning_rate": 2.446183953033268e-06, "loss": 0.8441, "step": 386 }, { "epoch": 0.7573385518590998, "grad_norm": 0.13671875, "learning_rate": 2.426614481409002e-06, "loss": 0.784, "step": 387 }, { "epoch": 0.7592954990215264, "grad_norm": 0.1513671875, "learning_rate": 2.407045009784736e-06, "loss": 0.8536, "step": 388 }, { "epoch": 0.761252446183953, "grad_norm": 0.1328125, "learning_rate": 2.3874755381604695e-06, "loss": 0.9018, "step": 389 }, { "epoch": 0.7632093933463796, "grad_norm": 0.1298828125, "learning_rate": 2.3679060665362036e-06, "loss": 0.8552, "step": 390 }, { "epoch": 0.7651663405088063, "grad_norm": 0.1689453125, "learning_rate": 2.3483365949119373e-06, "loss": 0.7223, "step": 391 }, { "epoch": 0.7671232876712328, "grad_norm": 0.1494140625, "learning_rate": 2.3287671232876713e-06, "loss": 0.8057, "step": 392 }, { "epoch": 0.7690802348336595, "grad_norm": 0.16796875, "learning_rate": 2.309197651663405e-06, "loss": 0.8436, "step": 393 }, { "epoch": 0.7710371819960861, "grad_norm": 0.1513671875, "learning_rate": 2.289628180039139e-06, "loss": 0.7871, "step": 394 }, { "epoch": 0.7729941291585127, "grad_norm": 0.2001953125, "learning_rate": 2.2700587084148727e-06, "loss": 0.8207, "step": 395 }, { "epoch": 0.7749510763209393, "grad_norm": 0.1513671875, "learning_rate": 2.250489236790607e-06, "loss": 0.8627, "step": 396 }, { "epoch": 0.776908023483366, "grad_norm": 0.1337890625, "learning_rate": 2.2309197651663405e-06, "loss": 0.8278, "step": 397 }, { "epoch": 0.7788649706457925, "grad_norm": 0.1357421875, "learning_rate": 2.2113502935420745e-06, "loss": 0.8097, "step": 398 }, { "epoch": 0.7808219178082192, "grad_norm": 0.1474609375, "learning_rate": 2.191780821917808e-06, "loss": 0.7998, "step": 399 }, { "epoch": 0.7827788649706457, "grad_norm": 0.146484375, "learning_rate": 2.1722113502935423e-06, "loss": 0.8661, "step": 400 }, { "epoch": 0.7847358121330724, "grad_norm": 0.158203125, "learning_rate": 2.152641878669276e-06, "loss": 0.8593, "step": 401 }, { "epoch": 0.786692759295499, "grad_norm": 0.1513671875, "learning_rate": 2.13307240704501e-06, "loss": 0.7968, "step": 402 }, { "epoch": 0.7886497064579256, "grad_norm": 0.130859375, "learning_rate": 2.1135029354207436e-06, "loss": 0.8502, "step": 403 }, { "epoch": 0.7906066536203522, "grad_norm": 0.1455078125, "learning_rate": 2.0939334637964777e-06, "loss": 0.8185, "step": 404 }, { "epoch": 0.7925636007827789, "grad_norm": 0.181640625, "learning_rate": 2.0743639921722114e-06, "loss": 0.768, "step": 405 }, { "epoch": 0.7945205479452054, "grad_norm": 0.1484375, "learning_rate": 2.0547945205479454e-06, "loss": 0.857, "step": 406 }, { "epoch": 0.7964774951076321, "grad_norm": 0.1845703125, "learning_rate": 2.035225048923679e-06, "loss": 0.7867, "step": 407 }, { "epoch": 0.7984344422700587, "grad_norm": 0.146484375, "learning_rate": 2.015655577299413e-06, "loss": 0.8031, "step": 408 }, { "epoch": 0.8003913894324853, "grad_norm": 0.15625, "learning_rate": 1.996086105675147e-06, "loss": 0.8207, "step": 409 }, { "epoch": 0.8023483365949119, "grad_norm": 0.134765625, "learning_rate": 1.976516634050881e-06, "loss": 0.8736, "step": 410 }, { "epoch": 0.8043052837573386, "grad_norm": 0.1455078125, "learning_rate": 1.9569471624266145e-06, "loss": 0.7846, "step": 411 }, { "epoch": 0.8062622309197651, "grad_norm": 0.1455078125, "learning_rate": 1.937377690802348e-06, "loss": 0.7179, "step": 412 }, { "epoch": 0.8082191780821918, "grad_norm": 0.146484375, "learning_rate": 1.9178082191780823e-06, "loss": 0.8121, "step": 413 }, { "epoch": 0.8101761252446184, "grad_norm": 0.1416015625, "learning_rate": 1.8982387475538161e-06, "loss": 0.8256, "step": 414 }, { "epoch": 0.812133072407045, "grad_norm": 0.15234375, "learning_rate": 1.87866927592955e-06, "loss": 0.8072, "step": 415 }, { "epoch": 0.8140900195694716, "grad_norm": 0.1455078125, "learning_rate": 1.8590998043052839e-06, "loss": 0.7735, "step": 416 }, { "epoch": 0.8160469667318982, "grad_norm": 0.171875, "learning_rate": 1.8395303326810177e-06, "loss": 0.8447, "step": 417 }, { "epoch": 0.8180039138943248, "grad_norm": 0.30859375, "learning_rate": 1.8199608610567516e-06, "loss": 0.724, "step": 418 }, { "epoch": 0.8199608610567515, "grad_norm": 0.158203125, "learning_rate": 1.8003913894324854e-06, "loss": 0.8001, "step": 419 }, { "epoch": 0.821917808219178, "grad_norm": 0.150390625, "learning_rate": 1.7808219178082193e-06, "loss": 0.7854, "step": 420 }, { "epoch": 0.8238747553816047, "grad_norm": 0.1630859375, "learning_rate": 1.7612524461839532e-06, "loss": 0.8543, "step": 421 }, { "epoch": 0.8258317025440313, "grad_norm": 0.142578125, "learning_rate": 1.741682974559687e-06, "loss": 0.8381, "step": 422 }, { "epoch": 0.8277886497064579, "grad_norm": 0.150390625, "learning_rate": 1.7221135029354209e-06, "loss": 0.8048, "step": 423 }, { "epoch": 0.8297455968688845, "grad_norm": 0.142578125, "learning_rate": 1.7025440313111545e-06, "loss": 0.8001, "step": 424 }, { "epoch": 0.8317025440313112, "grad_norm": 0.1376953125, "learning_rate": 1.6829745596868884e-06, "loss": 0.8394, "step": 425 }, { "epoch": 0.8336594911937377, "grad_norm": 0.1513671875, "learning_rate": 1.6634050880626223e-06, "loss": 0.8285, "step": 426 }, { "epoch": 0.8356164383561644, "grad_norm": 0.1455078125, "learning_rate": 1.6438356164383561e-06, "loss": 0.7556, "step": 427 }, { "epoch": 0.837573385518591, "grad_norm": 0.18359375, "learning_rate": 1.62426614481409e-06, "loss": 0.8053, "step": 428 }, { "epoch": 0.8395303326810176, "grad_norm": 0.150390625, "learning_rate": 1.6046966731898239e-06, "loss": 0.8353, "step": 429 }, { "epoch": 0.8414872798434442, "grad_norm": 0.1376953125, "learning_rate": 1.5851272015655577e-06, "loss": 0.8326, "step": 430 }, { "epoch": 0.8434442270058709, "grad_norm": 0.1376953125, "learning_rate": 1.5655577299412916e-06, "loss": 0.7238, "step": 431 }, { "epoch": 0.8454011741682974, "grad_norm": 0.1416015625, "learning_rate": 1.5459882583170254e-06, "loss": 0.8071, "step": 432 }, { "epoch": 0.8473581213307241, "grad_norm": 0.1376953125, "learning_rate": 1.5264187866927593e-06, "loss": 0.7282, "step": 433 }, { "epoch": 0.8493150684931506, "grad_norm": 0.150390625, "learning_rate": 1.5068493150684932e-06, "loss": 0.7484, "step": 434 }, { "epoch": 0.8512720156555773, "grad_norm": 0.1416015625, "learning_rate": 1.487279843444227e-06, "loss": 0.8348, "step": 435 }, { "epoch": 0.8532289628180039, "grad_norm": 0.15625, "learning_rate": 1.467710371819961e-06, "loss": 0.8239, "step": 436 }, { "epoch": 0.8551859099804305, "grad_norm": 0.1455078125, "learning_rate": 1.4481409001956948e-06, "loss": 0.7993, "step": 437 }, { "epoch": 0.8571428571428571, "grad_norm": 0.1474609375, "learning_rate": 1.4285714285714286e-06, "loss": 0.8169, "step": 438 }, { "epoch": 0.8590998043052838, "grad_norm": 0.15625, "learning_rate": 1.4090019569471625e-06, "loss": 0.7944, "step": 439 }, { "epoch": 0.8610567514677103, "grad_norm": 0.150390625, "learning_rate": 1.3894324853228964e-06, "loss": 0.8219, "step": 440 }, { "epoch": 0.863013698630137, "grad_norm": 0.1611328125, "learning_rate": 1.3698630136986302e-06, "loss": 0.8306, "step": 441 }, { "epoch": 0.8649706457925636, "grad_norm": 0.14453125, "learning_rate": 1.350293542074364e-06, "loss": 0.8336, "step": 442 }, { "epoch": 0.8669275929549902, "grad_norm": 0.16015625, "learning_rate": 1.330724070450098e-06, "loss": 0.8226, "step": 443 }, { "epoch": 0.8688845401174168, "grad_norm": 0.146484375, "learning_rate": 1.3111545988258318e-06, "loss": 0.8407, "step": 444 }, { "epoch": 0.8708414872798435, "grad_norm": 0.138671875, "learning_rate": 1.2915851272015657e-06, "loss": 0.8366, "step": 445 }, { "epoch": 0.87279843444227, "grad_norm": 0.1865234375, "learning_rate": 1.2720156555772995e-06, "loss": 0.6976, "step": 446 }, { "epoch": 0.8747553816046967, "grad_norm": 0.130859375, "learning_rate": 1.2524461839530334e-06, "loss": 0.8289, "step": 447 }, { "epoch": 0.8767123287671232, "grad_norm": 0.13671875, "learning_rate": 1.2328767123287673e-06, "loss": 0.8101, "step": 448 }, { "epoch": 0.8786692759295499, "grad_norm": 0.138671875, "learning_rate": 1.213307240704501e-06, "loss": 0.8111, "step": 449 }, { "epoch": 0.8806262230919765, "grad_norm": 0.146484375, "learning_rate": 1.1937377690802348e-06, "loss": 0.8277, "step": 450 }, { "epoch": 0.8825831702544031, "grad_norm": 0.1337890625, "learning_rate": 1.1741682974559686e-06, "loss": 0.8005, "step": 451 }, { "epoch": 0.8845401174168297, "grad_norm": 0.1376953125, "learning_rate": 1.1545988258317025e-06, "loss": 0.8484, "step": 452 }, { "epoch": 0.8864970645792564, "grad_norm": 0.1455078125, "learning_rate": 1.1350293542074364e-06, "loss": 0.818, "step": 453 }, { "epoch": 0.8884540117416829, "grad_norm": 0.1796875, "learning_rate": 1.1154598825831702e-06, "loss": 0.8746, "step": 454 }, { "epoch": 0.8904109589041096, "grad_norm": 0.1416015625, "learning_rate": 1.095890410958904e-06, "loss": 0.8444, "step": 455 }, { "epoch": 0.8923679060665362, "grad_norm": 0.142578125, "learning_rate": 1.076320939334638e-06, "loss": 0.7859, "step": 456 }, { "epoch": 0.8943248532289628, "grad_norm": 0.1552734375, "learning_rate": 1.0567514677103718e-06, "loss": 0.7849, "step": 457 }, { "epoch": 0.8962818003913894, "grad_norm": 0.2158203125, "learning_rate": 1.0371819960861057e-06, "loss": 0.7859, "step": 458 }, { "epoch": 0.898238747553816, "grad_norm": 0.146484375, "learning_rate": 1.0176125244618395e-06, "loss": 0.8224, "step": 459 }, { "epoch": 0.9001956947162426, "grad_norm": 0.1611328125, "learning_rate": 9.980430528375734e-07, "loss": 0.7779, "step": 460 }, { "epoch": 0.9021526418786693, "grad_norm": 0.1396484375, "learning_rate": 9.784735812133073e-07, "loss": 0.8562, "step": 461 }, { "epoch": 0.9041095890410958, "grad_norm": 0.1318359375, "learning_rate": 9.589041095890411e-07, "loss": 0.829, "step": 462 }, { "epoch": 0.9060665362035225, "grad_norm": 0.189453125, "learning_rate": 9.39334637964775e-07, "loss": 0.7345, "step": 463 }, { "epoch": 0.9080234833659491, "grad_norm": 0.1533203125, "learning_rate": 9.197651663405089e-07, "loss": 0.8129, "step": 464 }, { "epoch": 0.9099804305283757, "grad_norm": 0.1611328125, "learning_rate": 9.001956947162427e-07, "loss": 0.7889, "step": 465 }, { "epoch": 0.9119373776908023, "grad_norm": 0.1396484375, "learning_rate": 8.806262230919766e-07, "loss": 0.8081, "step": 466 }, { "epoch": 0.913894324853229, "grad_norm": 0.1552734375, "learning_rate": 8.610567514677104e-07, "loss": 0.8084, "step": 467 }, { "epoch": 0.9158512720156555, "grad_norm": 0.1533203125, "learning_rate": 8.414872798434442e-07, "loss": 0.7586, "step": 468 }, { "epoch": 0.9178082191780822, "grad_norm": 0.15234375, "learning_rate": 8.219178082191781e-07, "loss": 0.8032, "step": 469 }, { "epoch": 0.9197651663405088, "grad_norm": 0.1474609375, "learning_rate": 8.023483365949119e-07, "loss": 0.8043, "step": 470 }, { "epoch": 0.9217221135029354, "grad_norm": 0.1474609375, "learning_rate": 7.827788649706458e-07, "loss": 0.8214, "step": 471 }, { "epoch": 0.923679060665362, "grad_norm": 0.1337890625, "learning_rate": 7.632093933463797e-07, "loss": 0.8134, "step": 472 }, { "epoch": 0.9256360078277887, "grad_norm": 0.14453125, "learning_rate": 7.436399217221135e-07, "loss": 0.829, "step": 473 }, { "epoch": 0.9275929549902152, "grad_norm": 0.15625, "learning_rate": 7.240704500978474e-07, "loss": 0.8308, "step": 474 }, { "epoch": 0.9295499021526419, "grad_norm": 0.1484375, "learning_rate": 7.045009784735812e-07, "loss": 0.8163, "step": 475 }, { "epoch": 0.9315068493150684, "grad_norm": 0.2080078125, "learning_rate": 6.849315068493151e-07, "loss": 0.7667, "step": 476 }, { "epoch": 0.9334637964774951, "grad_norm": 0.1337890625, "learning_rate": 6.65362035225049e-07, "loss": 0.8218, "step": 477 }, { "epoch": 0.9354207436399217, "grad_norm": 0.14453125, "learning_rate": 6.457925636007828e-07, "loss": 0.7775, "step": 478 }, { "epoch": 0.9373776908023483, "grad_norm": 0.1416015625, "learning_rate": 6.262230919765167e-07, "loss": 0.8613, "step": 479 }, { "epoch": 0.9393346379647749, "grad_norm": 0.13671875, "learning_rate": 6.066536203522505e-07, "loss": 0.8279, "step": 480 }, { "epoch": 0.9412915851272016, "grad_norm": 0.13671875, "learning_rate": 5.870841487279843e-07, "loss": 0.8247, "step": 481 }, { "epoch": 0.9432485322896281, "grad_norm": 0.142578125, "learning_rate": 5.675146771037182e-07, "loss": 0.8399, "step": 482 }, { "epoch": 0.9452054794520548, "grad_norm": 0.142578125, "learning_rate": 5.47945205479452e-07, "loss": 0.7841, "step": 483 }, { "epoch": 0.9471624266144814, "grad_norm": 0.1494140625, "learning_rate": 5.283757338551859e-07, "loss": 0.7923, "step": 484 }, { "epoch": 0.949119373776908, "grad_norm": 0.14453125, "learning_rate": 5.088062622309198e-07, "loss": 0.8464, "step": 485 }, { "epoch": 0.9510763209393346, "grad_norm": 0.138671875, "learning_rate": 4.892367906066536e-07, "loss": 0.801, "step": 486 }, { "epoch": 0.9530332681017613, "grad_norm": 0.2470703125, "learning_rate": 4.696673189823875e-07, "loss": 0.7939, "step": 487 }, { "epoch": 0.9549902152641878, "grad_norm": 0.1376953125, "learning_rate": 4.5009784735812136e-07, "loss": 0.8744, "step": 488 }, { "epoch": 0.9569471624266145, "grad_norm": 0.15234375, "learning_rate": 4.305283757338552e-07, "loss": 0.8419, "step": 489 }, { "epoch": 0.958904109589041, "grad_norm": 0.208984375, "learning_rate": 4.1095890410958903e-07, "loss": 0.8459, "step": 490 }, { "epoch": 0.9608610567514677, "grad_norm": 0.1416015625, "learning_rate": 3.913894324853229e-07, "loss": 0.8289, "step": 491 }, { "epoch": 0.9628180039138943, "grad_norm": 0.140625, "learning_rate": 3.7181996086105676e-07, "loss": 0.8121, "step": 492 }, { "epoch": 0.9647749510763209, "grad_norm": 0.1494140625, "learning_rate": 3.522504892367906e-07, "loss": 0.8218, "step": 493 }, { "epoch": 0.9667318982387475, "grad_norm": 0.1875, "learning_rate": 3.326810176125245e-07, "loss": 0.7736, "step": 494 }, { "epoch": 0.9686888454011742, "grad_norm": 0.142578125, "learning_rate": 3.1311154598825835e-07, "loss": 0.8321, "step": 495 }, { "epoch": 0.9706457925636007, "grad_norm": 0.154296875, "learning_rate": 2.9354207436399216e-07, "loss": 0.8535, "step": 496 }, { "epoch": 0.9726027397260274, "grad_norm": 0.13671875, "learning_rate": 2.73972602739726e-07, "loss": 0.8368, "step": 497 }, { "epoch": 0.974559686888454, "grad_norm": 0.1650390625, "learning_rate": 2.544031311154599e-07, "loss": 0.8223, "step": 498 }, { "epoch": 0.9765166340508806, "grad_norm": 0.150390625, "learning_rate": 2.3483365949119375e-07, "loss": 0.8236, "step": 499 }, { "epoch": 0.9784735812133072, "grad_norm": 0.1640625, "learning_rate": 2.152641878669276e-07, "loss": 0.8086, "step": 500 }, { "epoch": 0.9804305283757339, "grad_norm": 0.1611328125, "learning_rate": 1.9569471624266145e-07, "loss": 0.7586, "step": 501 }, { "epoch": 0.9823874755381604, "grad_norm": 0.177734375, "learning_rate": 1.761252446183953e-07, "loss": 0.8024, "step": 502 }, { "epoch": 0.9843444227005871, "grad_norm": 0.1337890625, "learning_rate": 1.5655577299412917e-07, "loss": 0.8554, "step": 503 }, { "epoch": 0.9863013698630136, "grad_norm": 0.1953125, "learning_rate": 1.36986301369863e-07, "loss": 0.7178, "step": 504 }, { "epoch": 0.9882583170254403, "grad_norm": 0.1533203125, "learning_rate": 1.1741682974559687e-07, "loss": 0.8381, "step": 505 }, { "epoch": 0.9902152641878669, "grad_norm": 0.142578125, "learning_rate": 9.784735812133072e-08, "loss": 0.8041, "step": 506 }, { "epoch": 0.9921722113502935, "grad_norm": 0.1455078125, "learning_rate": 7.827788649706459e-08, "loss": 0.8308, "step": 507 }, { "epoch": 0.9941291585127201, "grad_norm": 0.1630859375, "learning_rate": 5.870841487279844e-08, "loss": 0.8443, "step": 508 }, { "epoch": 0.9960861056751468, "grad_norm": 0.1376953125, "learning_rate": 3.9138943248532294e-08, "loss": 0.8119, "step": 509 }, { "epoch": 0.9980430528375733, "grad_norm": 0.1513671875, "learning_rate": 1.9569471624266147e-08, "loss": 0.7697, "step": 510 }, { "epoch": 1.0, "grad_norm": 0.1669921875, "learning_rate": 0.0, "loss": 0.8326, "step": 511 } ], "logging_steps": 1.0, "max_steps": 511, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6513747864266998e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }