{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.11582422095049567, "eval_steps": 500, "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.1058949263726486e-05, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.7518, "step": 1 }, { "epoch": 4.211789852745297e-05, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.732, "step": 2 }, { "epoch": 6.317684779117946e-05, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.7501, "step": 3 }, { "epoch": 8.423579705490595e-05, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.7667, "step": 4 }, { "epoch": 0.00010529474631863244, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.7663, "step": 5 }, { "epoch": 0.0001263536955823589, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.7459, "step": 6 }, { "epoch": 0.00014741264484608541, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.7606, "step": 7 }, { "epoch": 0.0001684715941098119, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.7556, "step": 8 }, { "epoch": 0.00018953054337353837, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.7616, "step": 9 }, { "epoch": 0.00021058949263726487, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.7136, "step": 10 }, { "epoch": 0.00023164844190099135, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.741, "step": 11 }, { "epoch": 0.0002527073911647178, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.7446, "step": 12 }, { "epoch": 0.0002737663404284443, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.7817, "step": 13 }, { "epoch": 0.00029482528969217083, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.752, "step": 14 }, { "epoch": 0.0003158842389558973, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.7788, "step": 15 }, { "epoch": 0.0003369431882196238, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.7478, "step": 16 }, { "epoch": 0.0003580021374833503, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.744, "step": 17 }, { "epoch": 0.00037906108674707673, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.7614, "step": 18 }, { "epoch": 0.00040012003601080324, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.7266, "step": 19 }, { "epoch": 0.00042117898527452974, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.756, "step": 20 }, { "epoch": 0.0004422379345382562, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.7207, "step": 21 }, { "epoch": 0.0004632968838019827, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.7491, "step": 22 }, { "epoch": 0.0004843558330657092, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.7283, "step": 23 }, { "epoch": 0.0005054147823294356, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.7468, "step": 24 }, { "epoch": 0.0005264737315931622, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.7473, "step": 25 }, { "epoch": 0.0005475326808568887, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.7557, "step": 26 }, { "epoch": 0.0005685916301206151, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.726, "step": 27 }, { "epoch": 0.0005896505793843417, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.7517, "step": 28 }, { "epoch": 0.0006107095286480681, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.7738, "step": 29 }, { "epoch": 0.0006317684779117946, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.7384, "step": 30 }, { "epoch": 0.0006528274271755211, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.7413, "step": 31 }, { "epoch": 0.0006738863764392476, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.7713, "step": 32 }, { "epoch": 0.000694945325702974, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.7332, "step": 33 }, { "epoch": 0.0007160042749667006, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.7555, "step": 34 }, { "epoch": 0.000737063224230427, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.7619, "step": 35 }, { "epoch": 0.0007581221734941535, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.7695, "step": 36 }, { "epoch": 0.00077918112275788, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.7644, "step": 37 }, { "epoch": 0.0008002400720216065, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.7493, "step": 38 }, { "epoch": 0.0008212990212853329, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 1.7452, "step": 39 }, { "epoch": 0.0008423579705490595, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.6903, "step": 40 }, { "epoch": 0.0008634169198127859, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.754, "step": 41 }, { "epoch": 0.0008844758690765124, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.7764, "step": 42 }, { "epoch": 0.0009055348183402389, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.7231, "step": 43 }, { "epoch": 0.0009265937676039654, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.7465, "step": 44 }, { "epoch": 0.0009476527168676918, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.7356, "step": 45 }, { "epoch": 0.0009687116661314184, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.7407, "step": 46 }, { "epoch": 0.0009897706153951448, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.7647, "step": 47 }, { "epoch": 0.0010108295646588713, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.7212, "step": 48 }, { "epoch": 0.0010318885139225977, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.76, "step": 49 }, { "epoch": 0.0010529474631863244, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.7608, "step": 50 }, { "epoch": 0.0010740064124500509, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.7617, "step": 51 }, { "epoch": 0.0010950653617137773, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.772, "step": 52 }, { "epoch": 0.0011161243109775038, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.7483, "step": 53 }, { "epoch": 0.0011371832602412302, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.7029, "step": 54 }, { "epoch": 0.0011582422095049567, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.7188, "step": 55 }, { "epoch": 0.0011793011587686833, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 1.7435, "step": 56 }, { "epoch": 0.0012003601080324098, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.7238, "step": 57 }, { "epoch": 0.0012214190572961362, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.7492, "step": 58 }, { "epoch": 0.0012424780065598627, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.7301, "step": 59 }, { "epoch": 0.0012635369558235891, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.7522, "step": 60 }, { "epoch": 0.0012845959050873156, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.7609, "step": 61 }, { "epoch": 0.0013056548543510422, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.7026, "step": 62 }, { "epoch": 0.0013267138036147687, "grad_norm": 0.2109375, "learning_rate": 0.0002, "loss": 1.697, "step": 63 }, { "epoch": 0.0013477727528784951, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.7366, "step": 64 }, { "epoch": 0.0013688317021422216, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.6928, "step": 65 }, { "epoch": 0.001389890651405948, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.7447, "step": 66 }, { "epoch": 0.0014109496006696747, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.7125, "step": 67 }, { "epoch": 0.0014320085499334011, "grad_norm": 0.220703125, "learning_rate": 0.0002, "loss": 1.7203, "step": 68 }, { "epoch": 0.0014530674991971276, "grad_norm": 0.2099609375, "learning_rate": 0.0002, "loss": 1.7666, "step": 69 }, { "epoch": 0.001474126448460854, "grad_norm": 0.208984375, "learning_rate": 0.0002, "loss": 1.7205, "step": 70 }, { "epoch": 0.0014951853977245805, "grad_norm": 0.203125, "learning_rate": 0.0002, "loss": 1.726, "step": 71 }, { "epoch": 0.001516244346988307, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.7006, "step": 72 }, { "epoch": 0.0015373032962520336, "grad_norm": 0.2041015625, "learning_rate": 0.0002, "loss": 1.7336, "step": 73 }, { "epoch": 0.00155836224551576, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.7516, "step": 74 }, { "epoch": 0.0015794211947794865, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.7275, "step": 75 }, { "epoch": 0.001600480144043213, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.7405, "step": 76 }, { "epoch": 0.0016215390933069394, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.727, "step": 77 }, { "epoch": 0.0016425980425706658, "grad_norm": 0.20703125, "learning_rate": 0.0002, "loss": 1.7659, "step": 78 }, { "epoch": 0.0016636569918343925, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.7078, "step": 79 }, { "epoch": 0.001684715941098119, "grad_norm": 0.201171875, "learning_rate": 0.0002, "loss": 1.7204, "step": 80 }, { "epoch": 0.0017057748903618454, "grad_norm": 0.2216796875, "learning_rate": 0.0002, "loss": 1.7338, "step": 81 }, { "epoch": 0.0017268338396255719, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.7066, "step": 82 }, { "epoch": 0.0017478927888892983, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.722, "step": 83 }, { "epoch": 0.0017689517381530248, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.7529, "step": 84 }, { "epoch": 0.0017900106874167514, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.7395, "step": 85 }, { "epoch": 0.0018110696366804779, "grad_norm": 0.21484375, "learning_rate": 0.0002, "loss": 1.7222, "step": 86 }, { "epoch": 0.0018321285859442043, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.7122, "step": 87 }, { "epoch": 0.0018531875352079308, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.7129, "step": 88 }, { "epoch": 0.0018742464844716572, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.7576, "step": 89 }, { "epoch": 0.0018953054337353837, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.7223, "step": 90 }, { "epoch": 0.0019163643829991103, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.7014, "step": 91 }, { "epoch": 0.0019374233322628368, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.7119, "step": 92 }, { "epoch": 0.001958482281526563, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.7084, "step": 93 }, { "epoch": 0.0019795412307902897, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.709, "step": 94 }, { "epoch": 0.0020006001800540163, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.7377, "step": 95 }, { "epoch": 0.0020216591293177426, "grad_norm": 0.197265625, "learning_rate": 0.0002, "loss": 1.7108, "step": 96 }, { "epoch": 0.0020427180785814692, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.7172, "step": 97 }, { "epoch": 0.0020637770278451955, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.694, "step": 98 }, { "epoch": 0.002084835977108922, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.7164, "step": 99 }, { "epoch": 0.002105894926372649, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.705, "step": 100 }, { "epoch": 0.002126953875636375, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.7086, "step": 101 }, { "epoch": 0.0021480128249001017, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.7554, "step": 102 }, { "epoch": 0.002169071774163828, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.7301, "step": 103 }, { "epoch": 0.0021901307234275546, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.6998, "step": 104 }, { "epoch": 0.0022111896726912813, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.7334, "step": 105 }, { "epoch": 0.0022322486219550075, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.7066, "step": 106 }, { "epoch": 0.002253307571218734, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.7335, "step": 107 }, { "epoch": 0.0022743665204824604, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.7545, "step": 108 }, { "epoch": 0.002295425469746187, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.7595, "step": 109 }, { "epoch": 0.0023164844190099133, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 1.7129, "step": 110 }, { "epoch": 0.00233754336827364, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.7182, "step": 111 }, { "epoch": 0.0023586023175373666, "grad_norm": 0.1884765625, "learning_rate": 0.0002, "loss": 1.705, "step": 112 }, { "epoch": 0.002379661266801093, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.7472, "step": 113 }, { "epoch": 0.0024007202160648195, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.7069, "step": 114 }, { "epoch": 0.0024217791653285458, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.7329, "step": 115 }, { "epoch": 0.0024428381145922724, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.7351, "step": 116 }, { "epoch": 0.002463897063855999, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.7262, "step": 117 }, { "epoch": 0.0024849560131197253, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.7034, "step": 118 }, { "epoch": 0.002506014962383452, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.722, "step": 119 }, { "epoch": 0.0025270739116471782, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.7172, "step": 120 }, { "epoch": 0.002548132860910905, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.7305, "step": 121 }, { "epoch": 0.002569191810174631, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.7188, "step": 122 }, { "epoch": 0.002590250759438358, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.7248, "step": 123 }, { "epoch": 0.0026113097087020845, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.6953, "step": 124 }, { "epoch": 0.0026323686579658107, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.7362, "step": 125 }, { "epoch": 0.0026534276072295374, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.7352, "step": 126 }, { "epoch": 0.0026744865564932636, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.7389, "step": 127 }, { "epoch": 0.0026955455057569903, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.7134, "step": 128 }, { "epoch": 0.002716604455020717, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6862, "step": 129 }, { "epoch": 0.002737663404284443, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.7311, "step": 130 }, { "epoch": 0.00275872235354817, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.7399, "step": 131 }, { "epoch": 0.002779781302811896, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.7003, "step": 132 }, { "epoch": 0.0028008402520756227, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.7573, "step": 133 }, { "epoch": 0.0028218992013393494, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6674, "step": 134 }, { "epoch": 0.0028429581506030756, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.7218, "step": 135 }, { "epoch": 0.0028640170998668023, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.7181, "step": 136 }, { "epoch": 0.0028850760491305285, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.7341, "step": 137 }, { "epoch": 0.002906134998394255, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.7303, "step": 138 }, { "epoch": 0.0029271939476579814, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.6965, "step": 139 }, { "epoch": 0.002948252896921708, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.7085, "step": 140 }, { "epoch": 0.0029693118461854347, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.7004, "step": 141 }, { "epoch": 0.002990370795449161, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.7088, "step": 142 }, { "epoch": 0.0030114297447128876, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.7003, "step": 143 }, { "epoch": 0.003032488693976614, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.7173, "step": 144 }, { "epoch": 0.0030535476432403405, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.7199, "step": 145 }, { "epoch": 0.003074606592504067, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.7208, "step": 146 }, { "epoch": 0.0030956655417677934, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.7088, "step": 147 }, { "epoch": 0.00311672449103152, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.7232, "step": 148 }, { "epoch": 0.0031377834402952463, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.698, "step": 149 }, { "epoch": 0.003158842389558973, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.7043, "step": 150 }, { "epoch": 0.0031799013388226992, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.7165, "step": 151 }, { "epoch": 0.003200960288086426, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.7049, "step": 152 }, { "epoch": 0.0032220192373501526, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.7164, "step": 153 }, { "epoch": 0.003243078186613879, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.7111, "step": 154 }, { "epoch": 0.0032641371358776055, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.7018, "step": 155 }, { "epoch": 0.0032851960851413317, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.714, "step": 156 }, { "epoch": 0.0033062550344050584, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.7111, "step": 157 }, { "epoch": 0.003327313983668785, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6922, "step": 158 }, { "epoch": 0.0033483729329325113, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.7051, "step": 159 }, { "epoch": 0.003369431882196238, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6804, "step": 160 }, { "epoch": 0.003390490831459964, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.699, "step": 161 }, { "epoch": 0.003411549780723691, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.7237, "step": 162 }, { "epoch": 0.0034326087299874175, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6972, "step": 163 }, { "epoch": 0.0034536676792511437, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.7049, "step": 164 }, { "epoch": 0.0034747266285148704, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.731, "step": 165 }, { "epoch": 0.0034957855777785966, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6945, "step": 166 }, { "epoch": 0.0035168445270423233, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.7267, "step": 167 }, { "epoch": 0.0035379034763060495, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.7396, "step": 168 }, { "epoch": 0.003558962425569776, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.7107, "step": 169 }, { "epoch": 0.003580021374833503, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.7223, "step": 170 }, { "epoch": 0.003601080324097229, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.7131, "step": 171 }, { "epoch": 0.0036221392733609557, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6982, "step": 172 }, { "epoch": 0.003643198222624682, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.7305, "step": 173 }, { "epoch": 0.0036642571718884086, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.7435, "step": 174 }, { "epoch": 0.0036853161211521353, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6927, "step": 175 }, { "epoch": 0.0037063750704158615, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6945, "step": 176 }, { "epoch": 0.003727434019679588, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.7027, "step": 177 }, { "epoch": 0.0037484929689433144, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.7237, "step": 178 }, { "epoch": 0.003769551918207041, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.7273, "step": 179 }, { "epoch": 0.0037906108674707673, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6934, "step": 180 }, { "epoch": 0.003811669816734494, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6898, "step": 181 }, { "epoch": 0.0038327287659982207, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6878, "step": 182 }, { "epoch": 0.003853787715261947, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6875, "step": 183 }, { "epoch": 0.0038748466645256736, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6636, "step": 184 }, { "epoch": 0.0038959056137894, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.7399, "step": 185 }, { "epoch": 0.003916964563053126, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.7184, "step": 186 }, { "epoch": 0.003938023512316853, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6886, "step": 187 }, { "epoch": 0.003959082461580579, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.7033, "step": 188 }, { "epoch": 0.003980141410844306, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.7147, "step": 189 }, { "epoch": 0.004001200360108033, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6534, "step": 190 }, { "epoch": 0.0040222593093717585, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.7233, "step": 191 }, { "epoch": 0.004043318258635485, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.7084, "step": 192 }, { "epoch": 0.004064377207899212, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.7158, "step": 193 }, { "epoch": 0.0040854361571629385, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.7134, "step": 194 }, { "epoch": 0.004106495106426665, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.7054, "step": 195 }, { "epoch": 0.004127554055690391, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6769, "step": 196 }, { "epoch": 0.004148613004954118, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.7065, "step": 197 }, { "epoch": 0.004169671954217844, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.7115, "step": 198 }, { "epoch": 0.004190730903481571, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6873, "step": 199 }, { "epoch": 0.004211789852745298, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.7159, "step": 200 }, { "epoch": 0.004232848802009023, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.714, "step": 201 }, { "epoch": 0.00425390775127275, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.7203, "step": 202 }, { "epoch": 0.004274966700536477, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6976, "step": 203 }, { "epoch": 0.004296025649800203, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6981, "step": 204 }, { "epoch": 0.00431708459906393, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.7151, "step": 205 }, { "epoch": 0.004338143548327656, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.7174, "step": 206 }, { "epoch": 0.0043592024975913825, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.7091, "step": 207 }, { "epoch": 0.004380261446855109, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6981, "step": 208 }, { "epoch": 0.004401320396118836, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6876, "step": 209 }, { "epoch": 0.0044223793453825625, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.7105, "step": 210 }, { "epoch": 0.004443438294646288, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.7091, "step": 211 }, { "epoch": 0.004464497243910015, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.7066, "step": 212 }, { "epoch": 0.004485556193173742, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.7054, "step": 213 }, { "epoch": 0.004506615142437468, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.761, "step": 214 }, { "epoch": 0.004527674091701194, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.7352, "step": 215 }, { "epoch": 0.004548733040964921, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6842, "step": 216 }, { "epoch": 0.0045697919902286475, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6548, "step": 217 }, { "epoch": 0.004590850939492374, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.7116, "step": 218 }, { "epoch": 0.004611909888756101, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6706, "step": 219 }, { "epoch": 0.004632968838019827, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.697, "step": 220 }, { "epoch": 0.004654027787283553, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.7024, "step": 221 }, { "epoch": 0.00467508673654728, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.7163, "step": 222 }, { "epoch": 0.004696145685811007, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.7081, "step": 223 }, { "epoch": 0.004717204635074733, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6975, "step": 224 }, { "epoch": 0.004738263584338459, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6649, "step": 225 }, { "epoch": 0.004759322533602186, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.7132, "step": 226 }, { "epoch": 0.004780381482865912, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.7132, "step": 227 }, { "epoch": 0.004801440432129639, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.7111, "step": 228 }, { "epoch": 0.004822499381393366, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6466, "step": 229 }, { "epoch": 0.0048435583306570915, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6868, "step": 230 }, { "epoch": 0.004864617279920818, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6956, "step": 231 }, { "epoch": 0.004885676229184545, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6505, "step": 232 }, { "epoch": 0.0049067351784482715, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.667, "step": 233 }, { "epoch": 0.004927794127711998, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6902, "step": 234 }, { "epoch": 0.004948853076975724, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.7194, "step": 235 }, { "epoch": 0.004969912026239451, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6822, "step": 236 }, { "epoch": 0.004990970975503177, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.7176, "step": 237 }, { "epoch": 0.005012029924766904, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6858, "step": 238 }, { "epoch": 0.005033088874030631, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6671, "step": 239 }, { "epoch": 0.0050541478232943565, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6725, "step": 240 }, { "epoch": 0.005075206772558083, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.703, "step": 241 }, { "epoch": 0.00509626572182181, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6822, "step": 242 }, { "epoch": 0.0051173246710855365, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.693, "step": 243 }, { "epoch": 0.005138383620349262, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.7021, "step": 244 }, { "epoch": 0.005159442569612989, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6968, "step": 245 }, { "epoch": 0.005180501518876716, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6963, "step": 246 }, { "epoch": 0.005201560468140442, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.7204, "step": 247 }, { "epoch": 0.005222619417404169, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6709, "step": 248 }, { "epoch": 0.005243678366667895, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6899, "step": 249 }, { "epoch": 0.005264737315931621, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6948, "step": 250 }, { "epoch": 0.005285796265195348, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6926, "step": 251 }, { "epoch": 0.005306855214459075, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6856, "step": 252 }, { "epoch": 0.005327914163722801, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.7411, "step": 253 }, { "epoch": 0.005348973112986527, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6778, "step": 254 }, { "epoch": 0.005370032062250254, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6848, "step": 255 }, { "epoch": 0.0053910910115139805, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.6836, "step": 256 }, { "epoch": 0.005412149960777707, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6927, "step": 257 }, { "epoch": 0.005433208910041434, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.7086, "step": 258 }, { "epoch": 0.00545426785930516, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.684, "step": 259 }, { "epoch": 0.005475326808568886, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6969, "step": 260 }, { "epoch": 0.005496385757832613, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.7123, "step": 261 }, { "epoch": 0.00551744470709634, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.7236, "step": 262 }, { "epoch": 0.005538503656360066, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6874, "step": 263 }, { "epoch": 0.005559562605623792, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6924, "step": 264 }, { "epoch": 0.005580621554887519, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6545, "step": 265 }, { "epoch": 0.005601680504151245, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.7146, "step": 266 }, { "epoch": 0.005622739453414972, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.7107, "step": 267 }, { "epoch": 0.005643798402678699, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6644, "step": 268 }, { "epoch": 0.0056648573519424246, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.7131, "step": 269 }, { "epoch": 0.005685916301206151, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6551, "step": 270 }, { "epoch": 0.005706975250469878, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6973, "step": 271 }, { "epoch": 0.0057280341997336046, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.7111, "step": 272 }, { "epoch": 0.00574909314899733, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6686, "step": 273 }, { "epoch": 0.005770152098261057, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6846, "step": 274 }, { "epoch": 0.005791211047524784, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.7044, "step": 275 }, { "epoch": 0.00581226999678851, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6833, "step": 276 }, { "epoch": 0.005833328946052237, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.7105, "step": 277 }, { "epoch": 0.005854387895315963, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6638, "step": 278 }, { "epoch": 0.0058754468445796895, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6967, "step": 279 }, { "epoch": 0.005896505793843416, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6958, "step": 280 }, { "epoch": 0.005917564743107143, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.7417, "step": 281 }, { "epoch": 0.0059386236923708695, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6871, "step": 282 }, { "epoch": 0.005959682641634595, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.7185, "step": 283 }, { "epoch": 0.005980741590898322, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.6819, "step": 284 }, { "epoch": 0.006001800540162049, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6993, "step": 285 }, { "epoch": 0.006022859489425775, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6664, "step": 286 }, { "epoch": 0.006043918438689502, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.7015, "step": 287 }, { "epoch": 0.006064977387953228, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6846, "step": 288 }, { "epoch": 0.006086036337216954, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.7087, "step": 289 }, { "epoch": 0.006107095286480681, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6947, "step": 290 }, { "epoch": 0.006128154235744408, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6634, "step": 291 }, { "epoch": 0.006149213185008134, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6947, "step": 292 }, { "epoch": 0.00617027213427186, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6972, "step": 293 }, { "epoch": 0.006191331083535587, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.693, "step": 294 }, { "epoch": 0.0062123900327993135, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.717, "step": 295 }, { "epoch": 0.00623344898206304, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6691, "step": 296 }, { "epoch": 0.006254507931326767, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.7448, "step": 297 }, { "epoch": 0.006275566880590493, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.7048, "step": 298 }, { "epoch": 0.006296625829854219, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.7012, "step": 299 }, { "epoch": 0.006317684779117946, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.7019, "step": 300 }, { "epoch": 0.006338743728381673, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6798, "step": 301 }, { "epoch": 0.0063598026776453985, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6943, "step": 302 }, { "epoch": 0.006380861626909125, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6669, "step": 303 }, { "epoch": 0.006401920576172852, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6784, "step": 304 }, { "epoch": 0.0064229795254365785, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6824, "step": 305 }, { "epoch": 0.006444038474700305, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6717, "step": 306 }, { "epoch": 0.006465097423964031, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.7211, "step": 307 }, { "epoch": 0.006486156373227758, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.7237, "step": 308 }, { "epoch": 0.006507215322491484, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.7176, "step": 309 }, { "epoch": 0.006528274271755211, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6662, "step": 310 }, { "epoch": 0.006549333221018938, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6955, "step": 311 }, { "epoch": 0.006570392170282663, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6601, "step": 312 }, { "epoch": 0.00659145111954639, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6981, "step": 313 }, { "epoch": 0.006612510068810117, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.7239, "step": 314 }, { "epoch": 0.006633569018073843, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6826, "step": 315 }, { "epoch": 0.00665462796733757, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6939, "step": 316 }, { "epoch": 0.006675686916601296, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.7061, "step": 317 }, { "epoch": 0.0066967458658650225, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.7104, "step": 318 }, { "epoch": 0.006717804815128749, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6612, "step": 319 }, { "epoch": 0.006738863764392476, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.7063, "step": 320 }, { "epoch": 0.0067599227136562025, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6798, "step": 321 }, { "epoch": 0.006780981662919928, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6625, "step": 322 }, { "epoch": 0.006802040612183655, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.7182, "step": 323 }, { "epoch": 0.006823099561447382, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6498, "step": 324 }, { "epoch": 0.006844158510711108, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6928, "step": 325 }, { "epoch": 0.006865217459974835, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.7006, "step": 326 }, { "epoch": 0.006886276409238561, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6825, "step": 327 }, { "epoch": 0.0069073353585022874, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.686, "step": 328 }, { "epoch": 0.006928394307766014, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.6604, "step": 329 }, { "epoch": 0.006949453257029741, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.695, "step": 330 }, { "epoch": 0.006970512206293467, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6883, "step": 331 }, { "epoch": 0.006991571155557193, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.6402, "step": 332 }, { "epoch": 0.00701263010482092, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.701, "step": 333 }, { "epoch": 0.007033689054084647, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6617, "step": 334 }, { "epoch": 0.007054748003348373, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.7031, "step": 335 }, { "epoch": 0.007075806952612099, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6707, "step": 336 }, { "epoch": 0.007096865901875826, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6682, "step": 337 }, { "epoch": 0.007117924851139552, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6762, "step": 338 }, { "epoch": 0.007138983800403279, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.7078, "step": 339 }, { "epoch": 0.007160042749667006, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6818, "step": 340 }, { "epoch": 0.0071811016989307315, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.635, "step": 341 }, { "epoch": 0.007202160648194458, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.6842, "step": 342 }, { "epoch": 0.007223219597458185, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6659, "step": 343 }, { "epoch": 0.0072442785467219115, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.7083, "step": 344 }, { "epoch": 0.007265337495985638, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6705, "step": 345 }, { "epoch": 0.007286396445249364, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.649, "step": 346 }, { "epoch": 0.007307455394513091, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6753, "step": 347 }, { "epoch": 0.007328514343776817, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.6722, "step": 348 }, { "epoch": 0.007349573293040544, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.7328, "step": 349 }, { "epoch": 0.007370632242304271, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6997, "step": 350 }, { "epoch": 0.007391691191567996, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6963, "step": 351 }, { "epoch": 0.007412750140831723, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6563, "step": 352 }, { "epoch": 0.00743380909009545, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.7243, "step": 353 }, { "epoch": 0.007454868039359176, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6871, "step": 354 }, { "epoch": 0.007475926988622903, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6619, "step": 355 }, { "epoch": 0.007496985937886629, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6938, "step": 356 }, { "epoch": 0.0075180448871503555, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6559, "step": 357 }, { "epoch": 0.007539103836414082, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.7006, "step": 358 }, { "epoch": 0.007560162785677809, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6549, "step": 359 }, { "epoch": 0.007581221734941535, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6891, "step": 360 }, { "epoch": 0.007602280684205261, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.687, "step": 361 }, { "epoch": 0.007623339633468988, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6972, "step": 362 }, { "epoch": 0.007644398582732715, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6662, "step": 363 }, { "epoch": 0.007665457531996441, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6789, "step": 364 }, { "epoch": 0.007686516481260167, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6769, "step": 365 }, { "epoch": 0.007707575430523894, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.7075, "step": 366 }, { "epoch": 0.0077286343797876205, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6437, "step": 367 }, { "epoch": 0.007749693329051347, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6559, "step": 368 }, { "epoch": 0.007770752278315074, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.702, "step": 369 }, { "epoch": 0.0077918112275788, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.7217, "step": 370 }, { "epoch": 0.007812870176842526, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6892, "step": 371 }, { "epoch": 0.007833929126106252, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6711, "step": 372 }, { "epoch": 0.00785498807536998, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6981, "step": 373 }, { "epoch": 0.007876047024633705, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.7112, "step": 374 }, { "epoch": 0.007897105973897433, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6832, "step": 375 }, { "epoch": 0.007918164923161159, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6634, "step": 376 }, { "epoch": 0.007939223872424885, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6878, "step": 377 }, { "epoch": 0.007960282821688612, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6416, "step": 378 }, { "epoch": 0.007981341770952338, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6833, "step": 379 }, { "epoch": 0.008002400720216065, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6796, "step": 380 }, { "epoch": 0.008023459669479791, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6958, "step": 381 }, { "epoch": 0.008044518618743517, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.7005, "step": 382 }, { "epoch": 0.008065577568007245, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6801, "step": 383 }, { "epoch": 0.00808663651727097, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.694, "step": 384 }, { "epoch": 0.008107695466534698, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.659, "step": 385 }, { "epoch": 0.008128754415798424, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6733, "step": 386 }, { "epoch": 0.00814981336506215, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.7008, "step": 387 }, { "epoch": 0.008170872314325877, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6701, "step": 388 }, { "epoch": 0.008191931263589603, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.6823, "step": 389 }, { "epoch": 0.00821299021285333, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.6684, "step": 390 }, { "epoch": 0.008234049162117056, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6669, "step": 391 }, { "epoch": 0.008255108111380782, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6896, "step": 392 }, { "epoch": 0.00827616706064451, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.7016, "step": 393 }, { "epoch": 0.008297226009908235, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6645, "step": 394 }, { "epoch": 0.008318284959171963, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6648, "step": 395 }, { "epoch": 0.008339343908435689, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.689, "step": 396 }, { "epoch": 0.008360402857699414, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6276, "step": 397 }, { "epoch": 0.008381461806963142, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6383, "step": 398 }, { "epoch": 0.008402520756226868, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.6336, "step": 399 }, { "epoch": 0.008423579705490595, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6381, "step": 400 }, { "epoch": 0.008444638654754321, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6355, "step": 401 }, { "epoch": 0.008465697604018047, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6611, "step": 402 }, { "epoch": 0.008486756553281774, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6604, "step": 403 }, { "epoch": 0.0085078155025455, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6407, "step": 404 }, { "epoch": 0.008528874451809228, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6809, "step": 405 }, { "epoch": 0.008549933401072954, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6693, "step": 406 }, { "epoch": 0.00857099235033668, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6621, "step": 407 }, { "epoch": 0.008592051299600407, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.679, "step": 408 }, { "epoch": 0.008613110248864133, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6417, "step": 409 }, { "epoch": 0.00863416919812786, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6702, "step": 410 }, { "epoch": 0.008655228147391586, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6776, "step": 411 }, { "epoch": 0.008676287096655312, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.668, "step": 412 }, { "epoch": 0.00869734604591904, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.6659, "step": 413 }, { "epoch": 0.008718404995182765, "grad_norm": 0.130859375, "learning_rate": 0.0002, "loss": 1.6605, "step": 414 }, { "epoch": 0.008739463944446493, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6779, "step": 415 }, { "epoch": 0.008760522893710218, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6728, "step": 416 }, { "epoch": 0.008781581842973944, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6723, "step": 417 }, { "epoch": 0.008802640792237672, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.686, "step": 418 }, { "epoch": 0.008823699741501398, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6508, "step": 419 }, { "epoch": 0.008844758690765125, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6437, "step": 420 }, { "epoch": 0.008865817640028851, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6623, "step": 421 }, { "epoch": 0.008886876589292577, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6476, "step": 422 }, { "epoch": 0.008907935538556304, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6803, "step": 423 }, { "epoch": 0.00892899448782003, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6654, "step": 424 }, { "epoch": 0.008950053437083758, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.7083, "step": 425 }, { "epoch": 0.008971112386347483, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6945, "step": 426 }, { "epoch": 0.00899217133561121, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6757, "step": 427 }, { "epoch": 0.009013230284874937, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6935, "step": 428 }, { "epoch": 0.009034289234138662, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6196, "step": 429 }, { "epoch": 0.009055348183402388, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6461, "step": 430 }, { "epoch": 0.009076407132666116, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6708, "step": 431 }, { "epoch": 0.009097466081929842, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.664, "step": 432 }, { "epoch": 0.00911852503119357, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6939, "step": 433 }, { "epoch": 0.009139583980457295, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6849, "step": 434 }, { "epoch": 0.00916064292972102, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6325, "step": 435 }, { "epoch": 0.009181701878984748, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6732, "step": 436 }, { "epoch": 0.009202760828248474, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6798, "step": 437 }, { "epoch": 0.009223819777512202, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6865, "step": 438 }, { "epoch": 0.009244878726775927, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6791, "step": 439 }, { "epoch": 0.009265937676039653, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.661, "step": 440 }, { "epoch": 0.00928699662530338, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6631, "step": 441 }, { "epoch": 0.009308055574567107, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.668, "step": 442 }, { "epoch": 0.009329114523830834, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6636, "step": 443 }, { "epoch": 0.00935017347309456, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6786, "step": 444 }, { "epoch": 0.009371232422358286, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6953, "step": 445 }, { "epoch": 0.009392291371622013, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6817, "step": 446 }, { "epoch": 0.009413350320885739, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6796, "step": 447 }, { "epoch": 0.009434409270149467, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6598, "step": 448 }, { "epoch": 0.009455468219413192, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6756, "step": 449 }, { "epoch": 0.009476527168676918, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6549, "step": 450 }, { "epoch": 0.009497586117940646, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6718, "step": 451 }, { "epoch": 0.009518645067204371, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6319, "step": 452 }, { "epoch": 0.009539704016468099, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6852, "step": 453 }, { "epoch": 0.009560762965731825, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6703, "step": 454 }, { "epoch": 0.00958182191499555, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6476, "step": 455 }, { "epoch": 0.009602880864259278, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6494, "step": 456 }, { "epoch": 0.009623939813523004, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.704, "step": 457 }, { "epoch": 0.009644998762786731, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6249, "step": 458 }, { "epoch": 0.009666057712050457, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.647, "step": 459 }, { "epoch": 0.009687116661314183, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.679, "step": 460 }, { "epoch": 0.00970817561057791, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6862, "step": 461 }, { "epoch": 0.009729234559841636, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6769, "step": 462 }, { "epoch": 0.009750293509105364, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6558, "step": 463 }, { "epoch": 0.00977135245836909, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6806, "step": 464 }, { "epoch": 0.009792411407632816, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.656, "step": 465 }, { "epoch": 0.009813470356896543, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6627, "step": 466 }, { "epoch": 0.009834529306160269, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6666, "step": 467 }, { "epoch": 0.009855588255423996, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6836, "step": 468 }, { "epoch": 0.009876647204687722, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6361, "step": 469 }, { "epoch": 0.009897706153951448, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.644, "step": 470 }, { "epoch": 0.009918765103215176, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.647, "step": 471 }, { "epoch": 0.009939824052478901, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6547, "step": 472 }, { "epoch": 0.009960883001742629, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6954, "step": 473 }, { "epoch": 0.009981941951006355, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6745, "step": 474 }, { "epoch": 0.01000300090027008, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6825, "step": 475 }, { "epoch": 0.010024059849533808, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6629, "step": 476 }, { "epoch": 0.010045118798797534, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6731, "step": 477 }, { "epoch": 0.010066177748061261, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6636, "step": 478 }, { "epoch": 0.010087236697324987, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6697, "step": 479 }, { "epoch": 0.010108295646588713, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6577, "step": 480 }, { "epoch": 0.01012935459585244, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.6549, "step": 481 }, { "epoch": 0.010150413545116166, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6758, "step": 482 }, { "epoch": 0.010171472494379894, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6576, "step": 483 }, { "epoch": 0.01019253144364362, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.654, "step": 484 }, { "epoch": 0.010213590392907345, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6547, "step": 485 }, { "epoch": 0.010234649342171073, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6847, "step": 486 }, { "epoch": 0.010255708291434799, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6548, "step": 487 }, { "epoch": 0.010276767240698524, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.638, "step": 488 }, { "epoch": 0.010297826189962252, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.7198, "step": 489 }, { "epoch": 0.010318885139225978, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6823, "step": 490 }, { "epoch": 0.010339944088489705, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6795, "step": 491 }, { "epoch": 0.010361003037753431, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6807, "step": 492 }, { "epoch": 0.010382061987017157, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6428, "step": 493 }, { "epoch": 0.010403120936280884, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6855, "step": 494 }, { "epoch": 0.01042417988554461, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.658, "step": 495 }, { "epoch": 0.010445238834808338, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.69, "step": 496 }, { "epoch": 0.010466297784072064, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6481, "step": 497 }, { "epoch": 0.01048735673333579, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6509, "step": 498 }, { "epoch": 0.010508415682599517, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6285, "step": 499 }, { "epoch": 0.010529474631863243, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6663, "step": 500 }, { "epoch": 0.010529474631863243, "eval_loss": 1.6845688819885254, "eval_runtime": 901.4075, "eval_samples_per_second": 68.559, "eval_steps_per_second": 2.143, "step": 500 }, { "epoch": 0.01055053358112697, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6882, "step": 501 }, { "epoch": 0.010571592530390696, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6395, "step": 502 }, { "epoch": 0.010592651479654422, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6245, "step": 503 }, { "epoch": 0.01061371042891815, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6856, "step": 504 }, { "epoch": 0.010634769378181875, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6788, "step": 505 }, { "epoch": 0.010655828327445603, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6188, "step": 506 }, { "epoch": 0.010676887276709329, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6678, "step": 507 }, { "epoch": 0.010697946225973054, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6698, "step": 508 }, { "epoch": 0.010719005175236782, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6639, "step": 509 }, { "epoch": 0.010740064124500508, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6506, "step": 510 }, { "epoch": 0.010761123073764235, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6731, "step": 511 }, { "epoch": 0.010782182023027961, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6258, "step": 512 }, { "epoch": 0.010803240972291687, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6701, "step": 513 }, { "epoch": 0.010824299921555414, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6559, "step": 514 }, { "epoch": 0.01084535887081914, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6539, "step": 515 }, { "epoch": 0.010866417820082868, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.7107, "step": 516 }, { "epoch": 0.010887476769346593, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6344, "step": 517 }, { "epoch": 0.01090853571861032, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6496, "step": 518 }, { "epoch": 0.010929594667874047, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6438, "step": 519 }, { "epoch": 0.010950653617137773, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.7041, "step": 520 }, { "epoch": 0.0109717125664015, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6439, "step": 521 }, { "epoch": 0.010992771515665226, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6754, "step": 522 }, { "epoch": 0.011013830464928952, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6861, "step": 523 }, { "epoch": 0.01103488941419268, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6511, "step": 524 }, { "epoch": 0.011055948363456405, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6635, "step": 525 }, { "epoch": 0.011077007312720133, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6327, "step": 526 }, { "epoch": 0.011098066261983858, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6757, "step": 527 }, { "epoch": 0.011119125211247584, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6672, "step": 528 }, { "epoch": 0.011140184160511312, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6608, "step": 529 }, { "epoch": 0.011161243109775038, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6848, "step": 530 }, { "epoch": 0.011182302059038765, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6913, "step": 531 }, { "epoch": 0.01120336100830249, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6567, "step": 532 }, { "epoch": 0.011224419957566217, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6535, "step": 533 }, { "epoch": 0.011245478906829944, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.6928, "step": 534 }, { "epoch": 0.01126653785609367, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6937, "step": 535 }, { "epoch": 0.011287596805357398, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6642, "step": 536 }, { "epoch": 0.011308655754621123, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6671, "step": 537 }, { "epoch": 0.011329714703884849, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6381, "step": 538 }, { "epoch": 0.011350773653148577, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6785, "step": 539 }, { "epoch": 0.011371832602412302, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6847, "step": 540 }, { "epoch": 0.01139289155167603, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6246, "step": 541 }, { "epoch": 0.011413950500939756, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6677, "step": 542 }, { "epoch": 0.011435009450203482, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6887, "step": 543 }, { "epoch": 0.011456068399467209, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6619, "step": 544 }, { "epoch": 0.011477127348730935, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6387, "step": 545 }, { "epoch": 0.01149818629799466, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6443, "step": 546 }, { "epoch": 0.011519245247258388, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6393, "step": 547 }, { "epoch": 0.011540304196522114, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6595, "step": 548 }, { "epoch": 0.011561363145785842, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6662, "step": 549 }, { "epoch": 0.011582422095049567, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6484, "step": 550 }, { "epoch": 0.011603481044313293, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6609, "step": 551 }, { "epoch": 0.01162453999357702, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6948, "step": 552 }, { "epoch": 0.011645598942840747, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6618, "step": 553 }, { "epoch": 0.011666657892104474, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6587, "step": 554 }, { "epoch": 0.0116877168413682, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6424, "step": 555 }, { "epoch": 0.011708775790631926, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6329, "step": 556 }, { "epoch": 0.011729834739895653, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6556, "step": 557 }, { "epoch": 0.011750893689159379, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6084, "step": 558 }, { "epoch": 0.011771952638423107, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6447, "step": 559 }, { "epoch": 0.011793011587686832, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6314, "step": 560 }, { "epoch": 0.011814070536950558, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6335, "step": 561 }, { "epoch": 0.011835129486214286, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6567, "step": 562 }, { "epoch": 0.011856188435478011, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6627, "step": 563 }, { "epoch": 0.011877247384741739, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6662, "step": 564 }, { "epoch": 0.011898306334005465, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6493, "step": 565 }, { "epoch": 0.01191936528326919, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6504, "step": 566 }, { "epoch": 0.011940424232532918, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6583, "step": 567 }, { "epoch": 0.011961483181796644, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6457, "step": 568 }, { "epoch": 0.011982542131060371, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6701, "step": 569 }, { "epoch": 0.012003601080324097, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6665, "step": 570 }, { "epoch": 0.012024660029587823, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6699, "step": 571 }, { "epoch": 0.01204571897885155, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6524, "step": 572 }, { "epoch": 0.012066777928115276, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6298, "step": 573 }, { "epoch": 0.012087836877379004, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6595, "step": 574 }, { "epoch": 0.01210889582664273, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6377, "step": 575 }, { "epoch": 0.012129954775906455, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6637, "step": 576 }, { "epoch": 0.012151013725170183, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.7005, "step": 577 }, { "epoch": 0.012172072674433909, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6646, "step": 578 }, { "epoch": 0.012193131623697636, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6449, "step": 579 }, { "epoch": 0.012214190572961362, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6782, "step": 580 }, { "epoch": 0.012235249522225088, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6523, "step": 581 }, { "epoch": 0.012256308471488815, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6457, "step": 582 }, { "epoch": 0.012277367420752541, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6428, "step": 583 }, { "epoch": 0.012298426370016269, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6509, "step": 584 }, { "epoch": 0.012319485319279995, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6971, "step": 585 }, { "epoch": 0.01234054426854372, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.635, "step": 586 }, { "epoch": 0.012361603217807448, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6193, "step": 587 }, { "epoch": 0.012382662167071174, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6599, "step": 588 }, { "epoch": 0.012403721116334901, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6789, "step": 589 }, { "epoch": 0.012424780065598627, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6879, "step": 590 }, { "epoch": 0.012445839014862353, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6231, "step": 591 }, { "epoch": 0.01246689796412608, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6422, "step": 592 }, { "epoch": 0.012487956913389806, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.654, "step": 593 }, { "epoch": 0.012509015862653534, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.621, "step": 594 }, { "epoch": 0.01253007481191726, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6217, "step": 595 }, { "epoch": 0.012551133761180985, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6868, "step": 596 }, { "epoch": 0.012572192710444713, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6763, "step": 597 }, { "epoch": 0.012593251659708439, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6488, "step": 598 }, { "epoch": 0.012614310608972164, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6702, "step": 599 }, { "epoch": 0.012635369558235892, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6282, "step": 600 }, { "epoch": 0.012656428507499618, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6202, "step": 601 }, { "epoch": 0.012677487456763345, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6627, "step": 602 }, { "epoch": 0.012698546406027071, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6607, "step": 603 }, { "epoch": 0.012719605355290797, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6873, "step": 604 }, { "epoch": 0.012740664304554524, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6425, "step": 605 }, { "epoch": 0.01276172325381825, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6564, "step": 606 }, { "epoch": 0.012782782203081978, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6118, "step": 607 }, { "epoch": 0.012803841152345704, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6689, "step": 608 }, { "epoch": 0.01282490010160943, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6296, "step": 609 }, { "epoch": 0.012845959050873157, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6626, "step": 610 }, { "epoch": 0.012867018000136883, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6505, "step": 611 }, { "epoch": 0.01288807694940061, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6708, "step": 612 }, { "epoch": 0.012909135898664336, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6498, "step": 613 }, { "epoch": 0.012930194847928062, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6399, "step": 614 }, { "epoch": 0.01295125379719179, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6354, "step": 615 }, { "epoch": 0.012972312746455515, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6614, "step": 616 }, { "epoch": 0.012993371695719243, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6468, "step": 617 }, { "epoch": 0.013014430644982969, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5986, "step": 618 }, { "epoch": 0.013035489594246694, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.662, "step": 619 }, { "epoch": 0.013056548543510422, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6803, "step": 620 }, { "epoch": 0.013077607492774148, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6713, "step": 621 }, { "epoch": 0.013098666442037875, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6621, "step": 622 }, { "epoch": 0.013119725391301601, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6544, "step": 623 }, { "epoch": 0.013140784340565327, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6452, "step": 624 }, { "epoch": 0.013161843289829054, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6553, "step": 625 }, { "epoch": 0.01318290223909278, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6345, "step": 626 }, { "epoch": 0.013203961188356508, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6224, "step": 627 }, { "epoch": 0.013225020137620233, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6556, "step": 628 }, { "epoch": 0.01324607908688396, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6781, "step": 629 }, { "epoch": 0.013267138036147687, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6771, "step": 630 }, { "epoch": 0.013288196985411413, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6476, "step": 631 }, { "epoch": 0.01330925593467514, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6193, "step": 632 }, { "epoch": 0.013330314883938866, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6583, "step": 633 }, { "epoch": 0.013351373833202592, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6644, "step": 634 }, { "epoch": 0.01337243278246632, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6457, "step": 635 }, { "epoch": 0.013393491731730045, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6568, "step": 636 }, { "epoch": 0.013414550680993773, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6341, "step": 637 }, { "epoch": 0.013435609630257498, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6321, "step": 638 }, { "epoch": 0.013456668579521224, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6557, "step": 639 }, { "epoch": 0.013477727528784952, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6666, "step": 640 }, { "epoch": 0.013498786478048677, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6195, "step": 641 }, { "epoch": 0.013519845427312405, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6456, "step": 642 }, { "epoch": 0.01354090437657613, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6543, "step": 643 }, { "epoch": 0.013561963325839857, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6581, "step": 644 }, { "epoch": 0.013583022275103584, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6332, "step": 645 }, { "epoch": 0.01360408122436731, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6455, "step": 646 }, { "epoch": 0.013625140173631037, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6543, "step": 647 }, { "epoch": 0.013646199122894763, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.644, "step": 648 }, { "epoch": 0.013667258072158489, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6882, "step": 649 }, { "epoch": 0.013688317021422217, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6404, "step": 650 }, { "epoch": 0.013709375970685942, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6521, "step": 651 }, { "epoch": 0.01373043491994967, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6522, "step": 652 }, { "epoch": 0.013751493869213396, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6337, "step": 653 }, { "epoch": 0.013772552818477122, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6688, "step": 654 }, { "epoch": 0.013793611767740849, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.655, "step": 655 }, { "epoch": 0.013814670717004575, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6473, "step": 656 }, { "epoch": 0.0138357296662683, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6272, "step": 657 }, { "epoch": 0.013856788615532028, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6573, "step": 658 }, { "epoch": 0.013877847564795754, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6141, "step": 659 }, { "epoch": 0.013898906514059482, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6223, "step": 660 }, { "epoch": 0.013919965463323207, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6754, "step": 661 }, { "epoch": 0.013941024412586933, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6506, "step": 662 }, { "epoch": 0.01396208336185066, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6826, "step": 663 }, { "epoch": 0.013983142311114386, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6228, "step": 664 }, { "epoch": 0.014004201260378114, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6452, "step": 665 }, { "epoch": 0.01402526020964184, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6496, "step": 666 }, { "epoch": 0.014046319158905566, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6473, "step": 667 }, { "epoch": 0.014067378108169293, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6585, "step": 668 }, { "epoch": 0.014088437057433019, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6528, "step": 669 }, { "epoch": 0.014109496006696746, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6477, "step": 670 }, { "epoch": 0.014130554955960472, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6507, "step": 671 }, { "epoch": 0.014151613905224198, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6641, "step": 672 }, { "epoch": 0.014172672854487926, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.67, "step": 673 }, { "epoch": 0.014193731803751651, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6261, "step": 674 }, { "epoch": 0.014214790753015379, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6615, "step": 675 }, { "epoch": 0.014235849702279105, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.6664, "step": 676 }, { "epoch": 0.01425690865154283, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6183, "step": 677 }, { "epoch": 0.014277967600806558, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6347, "step": 678 }, { "epoch": 0.014299026550070284, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6616, "step": 679 }, { "epoch": 0.014320085499334011, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6433, "step": 680 }, { "epoch": 0.014341144448597737, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6371, "step": 681 }, { "epoch": 0.014362203397861463, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6353, "step": 682 }, { "epoch": 0.01438326234712519, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6687, "step": 683 }, { "epoch": 0.014404321296388916, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6445, "step": 684 }, { "epoch": 0.014425380245652644, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6346, "step": 685 }, { "epoch": 0.01444643919491637, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6891, "step": 686 }, { "epoch": 0.014467498144180095, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6539, "step": 687 }, { "epoch": 0.014488557093443823, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6643, "step": 688 }, { "epoch": 0.014509616042707549, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6504, "step": 689 }, { "epoch": 0.014530674991971276, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6665, "step": 690 }, { "epoch": 0.014551733941235002, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6852, "step": 691 }, { "epoch": 0.014572792890498728, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6548, "step": 692 }, { "epoch": 0.014593851839762455, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6446, "step": 693 }, { "epoch": 0.014614910789026181, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6912, "step": 694 }, { "epoch": 0.014635969738289909, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6635, "step": 695 }, { "epoch": 0.014657028687553635, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6431, "step": 696 }, { "epoch": 0.01467808763681736, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6485, "step": 697 }, { "epoch": 0.014699146586081088, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6323, "step": 698 }, { "epoch": 0.014720205535344814, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5934, "step": 699 }, { "epoch": 0.014741264484608541, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6785, "step": 700 }, { "epoch": 0.014762323433872267, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6674, "step": 701 }, { "epoch": 0.014783382383135993, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6426, "step": 702 }, { "epoch": 0.01480444133239972, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6188, "step": 703 }, { "epoch": 0.014825500281663446, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6374, "step": 704 }, { "epoch": 0.014846559230927174, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.648, "step": 705 }, { "epoch": 0.0148676181801909, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6413, "step": 706 }, { "epoch": 0.014888677129454625, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6218, "step": 707 }, { "epoch": 0.014909736078718353, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6394, "step": 708 }, { "epoch": 0.014930795027982079, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6801, "step": 709 }, { "epoch": 0.014951853977245806, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6215, "step": 710 }, { "epoch": 0.014972912926509532, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6423, "step": 711 }, { "epoch": 0.014993971875773258, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6765, "step": 712 }, { "epoch": 0.015015030825036985, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6326, "step": 713 }, { "epoch": 0.015036089774300711, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.66, "step": 714 }, { "epoch": 0.015057148723564437, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6558, "step": 715 }, { "epoch": 0.015078207672828164, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6495, "step": 716 }, { "epoch": 0.01509926662209189, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6677, "step": 717 }, { "epoch": 0.015120325571355618, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6524, "step": 718 }, { "epoch": 0.015141384520619344, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6446, "step": 719 }, { "epoch": 0.01516244346988307, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6619, "step": 720 }, { "epoch": 0.015183502419146797, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.626, "step": 721 }, { "epoch": 0.015204561368410523, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6303, "step": 722 }, { "epoch": 0.01522562031767425, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6285, "step": 723 }, { "epoch": 0.015246679266937976, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6365, "step": 724 }, { "epoch": 0.015267738216201702, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6346, "step": 725 }, { "epoch": 0.01528879716546543, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6301, "step": 726 }, { "epoch": 0.015309856114729155, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6371, "step": 727 }, { "epoch": 0.015330915063992883, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6369, "step": 728 }, { "epoch": 0.015351974013256608, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6331, "step": 729 }, { "epoch": 0.015373032962520334, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6505, "step": 730 }, { "epoch": 0.015394091911784062, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6174, "step": 731 }, { "epoch": 0.015415150861047788, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6263, "step": 732 }, { "epoch": 0.015436209810311515, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6613, "step": 733 }, { "epoch": 0.015457268759575241, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6725, "step": 734 }, { "epoch": 0.015478327708838967, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.634, "step": 735 }, { "epoch": 0.015499386658102694, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6206, "step": 736 }, { "epoch": 0.01552044560736642, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6131, "step": 737 }, { "epoch": 0.015541504556630148, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6379, "step": 738 }, { "epoch": 0.015562563505893873, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.6567, "step": 739 }, { "epoch": 0.0155836224551576, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6372, "step": 740 }, { "epoch": 0.015604681404421327, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6601, "step": 741 }, { "epoch": 0.015625740353685053, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.633, "step": 742 }, { "epoch": 0.01564679930294878, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6288, "step": 743 }, { "epoch": 0.015667858252212504, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6437, "step": 744 }, { "epoch": 0.01568891720147623, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6546, "step": 745 }, { "epoch": 0.01570997615073996, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6488, "step": 746 }, { "epoch": 0.015731035100003687, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6262, "step": 747 }, { "epoch": 0.01575209404926741, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6411, "step": 748 }, { "epoch": 0.01577315299853114, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6126, "step": 749 }, { "epoch": 0.015794211947794866, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6531, "step": 750 }, { "epoch": 0.01581527089705859, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6314, "step": 751 }, { "epoch": 0.015836329846322317, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6277, "step": 752 }, { "epoch": 0.015857388795586045, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6466, "step": 753 }, { "epoch": 0.01587844774484977, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.652, "step": 754 }, { "epoch": 0.015899506694113497, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6087, "step": 755 }, { "epoch": 0.015920565643377224, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6437, "step": 756 }, { "epoch": 0.01594162459264095, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6394, "step": 757 }, { "epoch": 0.015962683541904676, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6047, "step": 758 }, { "epoch": 0.015983742491168403, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6286, "step": 759 }, { "epoch": 0.01600480144043213, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6467, "step": 760 }, { "epoch": 0.016025860389695855, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6179, "step": 761 }, { "epoch": 0.016046919338959582, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6751, "step": 762 }, { "epoch": 0.01606797828822331, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6604, "step": 763 }, { "epoch": 0.016089037237487034, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6213, "step": 764 }, { "epoch": 0.01611009618675076, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.654, "step": 765 }, { "epoch": 0.01613115513601449, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6117, "step": 766 }, { "epoch": 0.016152214085278217, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6573, "step": 767 }, { "epoch": 0.01617327303454194, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6697, "step": 768 }, { "epoch": 0.016194331983805668, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6277, "step": 769 }, { "epoch": 0.016215390933069396, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6315, "step": 770 }, { "epoch": 0.01623644988233312, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6593, "step": 771 }, { "epoch": 0.016257508831596847, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.648, "step": 772 }, { "epoch": 0.016278567780860575, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6388, "step": 773 }, { "epoch": 0.0162996267301243, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6772, "step": 774 }, { "epoch": 0.016320685679388026, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6438, "step": 775 }, { "epoch": 0.016341744628651754, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6444, "step": 776 }, { "epoch": 0.01636280357791548, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6093, "step": 777 }, { "epoch": 0.016383862527179206, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6548, "step": 778 }, { "epoch": 0.016404921476442933, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6248, "step": 779 }, { "epoch": 0.01642598042570666, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6233, "step": 780 }, { "epoch": 0.016447039374970385, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6477, "step": 781 }, { "epoch": 0.016468098324234112, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6413, "step": 782 }, { "epoch": 0.01648915727349784, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6376, "step": 783 }, { "epoch": 0.016510216222761564, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.6424, "step": 784 }, { "epoch": 0.01653127517202529, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.6407, "step": 785 }, { "epoch": 0.01655233412128902, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6373, "step": 786 }, { "epoch": 0.016573393070552746, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6551, "step": 787 }, { "epoch": 0.01659445201981647, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6339, "step": 788 }, { "epoch": 0.016615510969080198, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6303, "step": 789 }, { "epoch": 0.016636569918343926, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6764, "step": 790 }, { "epoch": 0.01665762886760765, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6175, "step": 791 }, { "epoch": 0.016678687816871377, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.64, "step": 792 }, { "epoch": 0.016699746766135105, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.635, "step": 793 }, { "epoch": 0.01672080571539883, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6122, "step": 794 }, { "epoch": 0.016741864664662556, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6562, "step": 795 }, { "epoch": 0.016762923613926284, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6398, "step": 796 }, { "epoch": 0.01678398256319001, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6267, "step": 797 }, { "epoch": 0.016805041512453735, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6705, "step": 798 }, { "epoch": 0.016826100461717463, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.6333, "step": 799 }, { "epoch": 0.01684715941098119, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6618, "step": 800 }, { "epoch": 0.016868218360244915, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6206, "step": 801 }, { "epoch": 0.016889277309508642, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6114, "step": 802 }, { "epoch": 0.01691033625877237, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6424, "step": 803 }, { "epoch": 0.016931395208036094, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6494, "step": 804 }, { "epoch": 0.01695245415729982, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6269, "step": 805 }, { "epoch": 0.01697351310656355, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.609, "step": 806 }, { "epoch": 0.016994572055827273, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.645, "step": 807 }, { "epoch": 0.017015631005091, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.668, "step": 808 }, { "epoch": 0.017036689954354728, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6493, "step": 809 }, { "epoch": 0.017057748903618455, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6446, "step": 810 }, { "epoch": 0.01707880785288218, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.663, "step": 811 }, { "epoch": 0.017099866802145907, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6333, "step": 812 }, { "epoch": 0.017120925751409635, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6601, "step": 813 }, { "epoch": 0.01714198470067336, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6445, "step": 814 }, { "epoch": 0.017163043649937086, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6298, "step": 815 }, { "epoch": 0.017184102599200814, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6612, "step": 816 }, { "epoch": 0.017205161548464538, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6217, "step": 817 }, { "epoch": 0.017226220497728265, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6361, "step": 818 }, { "epoch": 0.017247279446991993, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6586, "step": 819 }, { "epoch": 0.01726833839625572, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6218, "step": 820 }, { "epoch": 0.017289397345519444, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6224, "step": 821 }, { "epoch": 0.017310456294783172, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.625, "step": 822 }, { "epoch": 0.0173315152440469, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6813, "step": 823 }, { "epoch": 0.017352574193310624, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6438, "step": 824 }, { "epoch": 0.01737363314257435, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6271, "step": 825 }, { "epoch": 0.01739469209183808, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6339, "step": 826 }, { "epoch": 0.017415751041101803, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6257, "step": 827 }, { "epoch": 0.01743680999036553, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6781, "step": 828 }, { "epoch": 0.017457868939629258, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6028, "step": 829 }, { "epoch": 0.017478927888892985, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6335, "step": 830 }, { "epoch": 0.01749998683815671, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6194, "step": 831 }, { "epoch": 0.017521045787420437, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6902, "step": 832 }, { "epoch": 0.017542104736684164, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6452, "step": 833 }, { "epoch": 0.01756316368594789, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6163, "step": 834 }, { "epoch": 0.017584222635211616, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6649, "step": 835 }, { "epoch": 0.017605281584475344, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6409, "step": 836 }, { "epoch": 0.017626340533739068, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6236, "step": 837 }, { "epoch": 0.017647399483002795, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6355, "step": 838 }, { "epoch": 0.017668458432266523, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6378, "step": 839 }, { "epoch": 0.01768951738153025, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6293, "step": 840 }, { "epoch": 0.017710576330793974, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6538, "step": 841 }, { "epoch": 0.017731635280057702, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6452, "step": 842 }, { "epoch": 0.01775269422932143, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6397, "step": 843 }, { "epoch": 0.017773753178585153, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5787, "step": 844 }, { "epoch": 0.01779481212784888, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6356, "step": 845 }, { "epoch": 0.01781587107711261, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6498, "step": 846 }, { "epoch": 0.017836930026376333, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6298, "step": 847 }, { "epoch": 0.01785798897564006, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.623, "step": 848 }, { "epoch": 0.017879047924903788, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.649, "step": 849 }, { "epoch": 0.017900106874167515, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6529, "step": 850 }, { "epoch": 0.01792116582343124, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6536, "step": 851 }, { "epoch": 0.017942224772694967, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6239, "step": 852 }, { "epoch": 0.017963283721958694, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6501, "step": 853 }, { "epoch": 0.01798434267122242, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6017, "step": 854 }, { "epoch": 0.018005401620486146, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6279, "step": 855 }, { "epoch": 0.018026460569749873, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6455, "step": 856 }, { "epoch": 0.018047519519013597, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6619, "step": 857 }, { "epoch": 0.018068578468277325, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6174, "step": 858 }, { "epoch": 0.018089637417541053, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6331, "step": 859 }, { "epoch": 0.018110696366804777, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6318, "step": 860 }, { "epoch": 0.018131755316068504, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6585, "step": 861 }, { "epoch": 0.01815281426533223, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6266, "step": 862 }, { "epoch": 0.01817387321459596, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6365, "step": 863 }, { "epoch": 0.018194932163859683, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6406, "step": 864 }, { "epoch": 0.01821599111312341, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6525, "step": 865 }, { "epoch": 0.01823705006238714, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6476, "step": 866 }, { "epoch": 0.018258109011650862, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6244, "step": 867 }, { "epoch": 0.01827916796091459, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5978, "step": 868 }, { "epoch": 0.018300226910178317, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6055, "step": 869 }, { "epoch": 0.01832128585944204, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6666, "step": 870 }, { "epoch": 0.01834234480870577, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5938, "step": 871 }, { "epoch": 0.018363403757969497, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6593, "step": 872 }, { "epoch": 0.018384462707233224, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.648, "step": 873 }, { "epoch": 0.018405521656496948, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6886, "step": 874 }, { "epoch": 0.018426580605760676, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6267, "step": 875 }, { "epoch": 0.018447639555024403, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6445, "step": 876 }, { "epoch": 0.018468698504288127, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6372, "step": 877 }, { "epoch": 0.018489757453551855, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6797, "step": 878 }, { "epoch": 0.018510816402815582, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6379, "step": 879 }, { "epoch": 0.018531875352079306, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6374, "step": 880 }, { "epoch": 0.018552934301343034, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6227, "step": 881 }, { "epoch": 0.01857399325060676, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6266, "step": 882 }, { "epoch": 0.01859505219987049, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.63, "step": 883 }, { "epoch": 0.018616111149134213, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6311, "step": 884 }, { "epoch": 0.01863717009839794, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6426, "step": 885 }, { "epoch": 0.018658229047661668, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6624, "step": 886 }, { "epoch": 0.018679287996925392, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6369, "step": 887 }, { "epoch": 0.01870034694618912, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6102, "step": 888 }, { "epoch": 0.018721405895452847, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6295, "step": 889 }, { "epoch": 0.01874246484471657, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.6711, "step": 890 }, { "epoch": 0.0187635237939803, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.628, "step": 891 }, { "epoch": 0.018784582743244026, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6459, "step": 892 }, { "epoch": 0.018805641692507754, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6375, "step": 893 }, { "epoch": 0.018826700641771478, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6038, "step": 894 }, { "epoch": 0.018847759591035206, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6387, "step": 895 }, { "epoch": 0.018868818540298933, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6274, "step": 896 }, { "epoch": 0.018889877489562657, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6084, "step": 897 }, { "epoch": 0.018910936438826385, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.607, "step": 898 }, { "epoch": 0.018931995388090112, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6388, "step": 899 }, { "epoch": 0.018953054337353836, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.6062, "step": 900 }, { "epoch": 0.018974113286617564, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6045, "step": 901 }, { "epoch": 0.01899517223588129, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5999, "step": 902 }, { "epoch": 0.01901623118514502, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6442, "step": 903 }, { "epoch": 0.019037290134408743, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.648, "step": 904 }, { "epoch": 0.01905834908367247, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6321, "step": 905 }, { "epoch": 0.019079408032936198, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.621, "step": 906 }, { "epoch": 0.019100466982199922, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6163, "step": 907 }, { "epoch": 0.01912152593146365, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6196, "step": 908 }, { "epoch": 0.019142584880727377, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.629, "step": 909 }, { "epoch": 0.0191636438299911, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.665, "step": 910 }, { "epoch": 0.01918470277925483, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6315, "step": 911 }, { "epoch": 0.019205761728518556, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6545, "step": 912 }, { "epoch": 0.01922682067778228, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6372, "step": 913 }, { "epoch": 0.019247879627046008, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6359, "step": 914 }, { "epoch": 0.019268938576309735, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6585, "step": 915 }, { "epoch": 0.019289997525573463, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.624, "step": 916 }, { "epoch": 0.019311056474837187, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6265, "step": 917 }, { "epoch": 0.019332115424100915, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6249, "step": 918 }, { "epoch": 0.019353174373364642, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6165, "step": 919 }, { "epoch": 0.019374233322628366, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.611, "step": 920 }, { "epoch": 0.019395292271892094, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6338, "step": 921 }, { "epoch": 0.01941635122115582, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6137, "step": 922 }, { "epoch": 0.019437410170419545, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6039, "step": 923 }, { "epoch": 0.019458469119683273, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6348, "step": 924 }, { "epoch": 0.019479528068947, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6254, "step": 925 }, { "epoch": 0.019500587018210728, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6149, "step": 926 }, { "epoch": 0.019521645967474452, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6296, "step": 927 }, { "epoch": 0.01954270491673818, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.6434, "step": 928 }, { "epoch": 0.019563763866001907, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.6116, "step": 929 }, { "epoch": 0.01958482281526563, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6404, "step": 930 }, { "epoch": 0.01960588176452936, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.659, "step": 931 }, { "epoch": 0.019626940713793086, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6163, "step": 932 }, { "epoch": 0.01964799966305681, "grad_norm": 0.1318359375, "learning_rate": 0.0002, "loss": 1.6564, "step": 933 }, { "epoch": 0.019669058612320538, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6478, "step": 934 }, { "epoch": 0.019690117561584265, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.618, "step": 935 }, { "epoch": 0.019711176510847993, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6503, "step": 936 }, { "epoch": 0.019732235460111717, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6146, "step": 937 }, { "epoch": 0.019753294409375444, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6315, "step": 938 }, { "epoch": 0.019774353358639172, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6167, "step": 939 }, { "epoch": 0.019795412307902896, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5994, "step": 940 }, { "epoch": 0.019816471257166623, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6217, "step": 941 }, { "epoch": 0.01983753020643035, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6347, "step": 942 }, { "epoch": 0.019858589155694075, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6399, "step": 943 }, { "epoch": 0.019879648104957803, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6524, "step": 944 }, { "epoch": 0.01990070705422153, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6498, "step": 945 }, { "epoch": 0.019921766003485258, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6563, "step": 946 }, { "epoch": 0.019942824952748982, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6275, "step": 947 }, { "epoch": 0.01996388390201271, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6546, "step": 948 }, { "epoch": 0.019984942851276437, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6079, "step": 949 }, { "epoch": 0.02000600180054016, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6183, "step": 950 }, { "epoch": 0.02002706074980389, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.642, "step": 951 }, { "epoch": 0.020048119699067616, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6106, "step": 952 }, { "epoch": 0.02006917864833134, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6926, "step": 953 }, { "epoch": 0.020090237597595068, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.644, "step": 954 }, { "epoch": 0.020111296546858795, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6402, "step": 955 }, { "epoch": 0.020132355496122523, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5973, "step": 956 }, { "epoch": 0.020153414445386247, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6506, "step": 957 }, { "epoch": 0.020174473394649974, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6483, "step": 958 }, { "epoch": 0.020195532343913702, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6261, "step": 959 }, { "epoch": 0.020216591293177426, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6237, "step": 960 }, { "epoch": 0.020237650242441153, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6382, "step": 961 }, { "epoch": 0.02025870919170488, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6411, "step": 962 }, { "epoch": 0.020279768140968605, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6045, "step": 963 }, { "epoch": 0.020300827090232332, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6084, "step": 964 }, { "epoch": 0.02032188603949606, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6348, "step": 965 }, { "epoch": 0.020342944988759788, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6363, "step": 966 }, { "epoch": 0.02036400393802351, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6232, "step": 967 }, { "epoch": 0.02038506288728724, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6178, "step": 968 }, { "epoch": 0.020406121836550967, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6237, "step": 969 }, { "epoch": 0.02042718078581469, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6666, "step": 970 }, { "epoch": 0.02044823973507842, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.614, "step": 971 }, { "epoch": 0.020469298684342146, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6207, "step": 972 }, { "epoch": 0.02049035763360587, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6401, "step": 973 }, { "epoch": 0.020511416582869597, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6113, "step": 974 }, { "epoch": 0.020532475532133325, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6171, "step": 975 }, { "epoch": 0.02055353448139705, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6284, "step": 976 }, { "epoch": 0.020574593430660777, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.627, "step": 977 }, { "epoch": 0.020595652379924504, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.634, "step": 978 }, { "epoch": 0.02061671132918823, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6277, "step": 979 }, { "epoch": 0.020637770278451956, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6225, "step": 980 }, { "epoch": 0.020658829227715683, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5944, "step": 981 }, { "epoch": 0.02067988817697941, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6136, "step": 982 }, { "epoch": 0.020700947126243135, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6257, "step": 983 }, { "epoch": 0.020722006075506862, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6533, "step": 984 }, { "epoch": 0.02074306502477059, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6294, "step": 985 }, { "epoch": 0.020764123974034314, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6164, "step": 986 }, { "epoch": 0.02078518292329804, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6284, "step": 987 }, { "epoch": 0.02080624187256177, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6287, "step": 988 }, { "epoch": 0.020827300821825497, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6626, "step": 989 }, { "epoch": 0.02084835977108922, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6201, "step": 990 }, { "epoch": 0.020869418720352948, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6195, "step": 991 }, { "epoch": 0.020890477669616676, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6268, "step": 992 }, { "epoch": 0.0209115366188804, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6077, "step": 993 }, { "epoch": 0.020932595568144127, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6278, "step": 994 }, { "epoch": 0.020953654517407855, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6191, "step": 995 }, { "epoch": 0.02097471346667158, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.612, "step": 996 }, { "epoch": 0.020995772415935306, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6431, "step": 997 }, { "epoch": 0.021016831365199034, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6593, "step": 998 }, { "epoch": 0.02103789031446276, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5955, "step": 999 }, { "epoch": 0.021058949263726486, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6325, "step": 1000 }, { "epoch": 0.021058949263726486, "eval_loss": 1.6914067268371582, "eval_runtime": 898.0675, "eval_samples_per_second": 68.814, "eval_steps_per_second": 2.151, "step": 1000 }, { "epoch": 0.021080008212990213, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6336, "step": 1001 }, { "epoch": 0.02110106716225394, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6411, "step": 1002 }, { "epoch": 0.021122126111517665, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6576, "step": 1003 }, { "epoch": 0.021143185060781392, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6168, "step": 1004 }, { "epoch": 0.02116424401004512, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6233, "step": 1005 }, { "epoch": 0.021185302959308844, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6156, "step": 1006 }, { "epoch": 0.02120636190857257, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6707, "step": 1007 }, { "epoch": 0.0212274208578363, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6217, "step": 1008 }, { "epoch": 0.021248479807100026, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6352, "step": 1009 }, { "epoch": 0.02126953875636375, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6402, "step": 1010 }, { "epoch": 0.021290597705627478, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.6176, "step": 1011 }, { "epoch": 0.021311656654891206, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.622, "step": 1012 }, { "epoch": 0.02133271560415493, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6245, "step": 1013 }, { "epoch": 0.021353774553418657, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6714, "step": 1014 }, { "epoch": 0.021374833502682385, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6911, "step": 1015 }, { "epoch": 0.02139589245194611, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6083, "step": 1016 }, { "epoch": 0.021416951401209836, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6309, "step": 1017 }, { "epoch": 0.021438010350473564, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6159, "step": 1018 }, { "epoch": 0.02145906929973729, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6106, "step": 1019 }, { "epoch": 0.021480128249001015, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6292, "step": 1020 }, { "epoch": 0.021501187198264743, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6053, "step": 1021 }, { "epoch": 0.02152224614752847, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6189, "step": 1022 }, { "epoch": 0.021543305096792194, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6064, "step": 1023 }, { "epoch": 0.021564364046055922, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6111, "step": 1024 }, { "epoch": 0.02158542299531965, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6346, "step": 1025 }, { "epoch": 0.021606481944583374, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6228, "step": 1026 }, { "epoch": 0.0216275408938471, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6298, "step": 1027 }, { "epoch": 0.02164859984311083, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6337, "step": 1028 }, { "epoch": 0.021669658792374553, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.6006, "step": 1029 }, { "epoch": 0.02169071774163828, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6476, "step": 1030 }, { "epoch": 0.021711776690902008, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5851, "step": 1031 }, { "epoch": 0.021732835640165735, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6288, "step": 1032 }, { "epoch": 0.02175389458942946, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5907, "step": 1033 }, { "epoch": 0.021774953538693187, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5862, "step": 1034 }, { "epoch": 0.021796012487956914, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6431, "step": 1035 }, { "epoch": 0.02181707143722064, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6375, "step": 1036 }, { "epoch": 0.021838130386484366, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6364, "step": 1037 }, { "epoch": 0.021859189335748094, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6236, "step": 1038 }, { "epoch": 0.021880248285011818, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6356, "step": 1039 }, { "epoch": 0.021901307234275545, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5813, "step": 1040 }, { "epoch": 0.021922366183539273, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6238, "step": 1041 }, { "epoch": 0.021943425132803, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6377, "step": 1042 }, { "epoch": 0.021964484082066724, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.632, "step": 1043 }, { "epoch": 0.021985543031330452, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6066, "step": 1044 }, { "epoch": 0.02200660198059418, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6219, "step": 1045 }, { "epoch": 0.022027660929857903, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6233, "step": 1046 }, { "epoch": 0.02204871987912163, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6028, "step": 1047 }, { "epoch": 0.02206977882838536, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.62, "step": 1048 }, { "epoch": 0.022090837777649083, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6118, "step": 1049 }, { "epoch": 0.02211189672691281, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6278, "step": 1050 }, { "epoch": 0.022132955676176538, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.637, "step": 1051 }, { "epoch": 0.022154014625440265, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6312, "step": 1052 }, { "epoch": 0.02217507357470399, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.61, "step": 1053 }, { "epoch": 0.022196132523967717, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6094, "step": 1054 }, { "epoch": 0.022217191473231444, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6337, "step": 1055 }, { "epoch": 0.02223825042249517, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6495, "step": 1056 }, { "epoch": 0.022259309371758896, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6151, "step": 1057 }, { "epoch": 0.022280368321022623, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6204, "step": 1058 }, { "epoch": 0.022301427270286348, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5997, "step": 1059 }, { "epoch": 0.022322486219550075, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6116, "step": 1060 }, { "epoch": 0.022343545168813803, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6374, "step": 1061 }, { "epoch": 0.02236460411807753, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6257, "step": 1062 }, { "epoch": 0.022385663067341254, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6663, "step": 1063 }, { "epoch": 0.02240672201660498, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6162, "step": 1064 }, { "epoch": 0.02242778096586871, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6099, "step": 1065 }, { "epoch": 0.022448839915132433, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6203, "step": 1066 }, { "epoch": 0.02246989886439616, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.662, "step": 1067 }, { "epoch": 0.02249095781365989, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.619, "step": 1068 }, { "epoch": 0.022512016762923612, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6197, "step": 1069 }, { "epoch": 0.02253307571218734, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6199, "step": 1070 }, { "epoch": 0.022554134661451068, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6183, "step": 1071 }, { "epoch": 0.022575193610714795, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6073, "step": 1072 }, { "epoch": 0.02259625255997852, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6481, "step": 1073 }, { "epoch": 0.022617311509242247, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6485, "step": 1074 }, { "epoch": 0.022638370458505974, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5991, "step": 1075 }, { "epoch": 0.022659429407769698, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6267, "step": 1076 }, { "epoch": 0.022680488357033426, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6284, "step": 1077 }, { "epoch": 0.022701547306297153, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6177, "step": 1078 }, { "epoch": 0.022722606255560877, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6306, "step": 1079 }, { "epoch": 0.022743665204824605, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5567, "step": 1080 }, { "epoch": 0.022764724154088332, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6132, "step": 1081 }, { "epoch": 0.02278578310335206, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6195, "step": 1082 }, { "epoch": 0.022806842052615784, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6194, "step": 1083 }, { "epoch": 0.02282790100187951, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6262, "step": 1084 }, { "epoch": 0.02284895995114324, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6283, "step": 1085 }, { "epoch": 0.022870018900406963, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6024, "step": 1086 }, { "epoch": 0.02289107784967069, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6494, "step": 1087 }, { "epoch": 0.022912136798934418, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.619, "step": 1088 }, { "epoch": 0.022933195748198142, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.625, "step": 1089 }, { "epoch": 0.02295425469746187, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5949, "step": 1090 }, { "epoch": 0.022975313646725597, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6353, "step": 1091 }, { "epoch": 0.02299637259598932, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6138, "step": 1092 }, { "epoch": 0.02301743154525305, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6221, "step": 1093 }, { "epoch": 0.023038490494516776, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6223, "step": 1094 }, { "epoch": 0.023059549443780504, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6076, "step": 1095 }, { "epoch": 0.023080608393044228, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6176, "step": 1096 }, { "epoch": 0.023101667342307956, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6344, "step": 1097 }, { "epoch": 0.023122726291571683, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6199, "step": 1098 }, { "epoch": 0.023143785240835407, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5975, "step": 1099 }, { "epoch": 0.023164844190099135, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5918, "step": 1100 }, { "epoch": 0.023185903139362862, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6169, "step": 1101 }, { "epoch": 0.023206962088626586, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6261, "step": 1102 }, { "epoch": 0.023228021037890314, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.623, "step": 1103 }, { "epoch": 0.02324907998715404, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6211, "step": 1104 }, { "epoch": 0.02327013893641777, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6389, "step": 1105 }, { "epoch": 0.023291197885681493, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.5897, "step": 1106 }, { "epoch": 0.02331225683494522, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6222, "step": 1107 }, { "epoch": 0.023333315784208948, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6309, "step": 1108 }, { "epoch": 0.023354374733472672, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.617, "step": 1109 }, { "epoch": 0.0233754336827364, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6224, "step": 1110 }, { "epoch": 0.023396492632000127, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6505, "step": 1111 }, { "epoch": 0.02341755158126385, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6487, "step": 1112 }, { "epoch": 0.02343861053052758, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.6249, "step": 1113 }, { "epoch": 0.023459669479791306, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6512, "step": 1114 }, { "epoch": 0.023480728429055034, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6008, "step": 1115 }, { "epoch": 0.023501787378318758, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6267, "step": 1116 }, { "epoch": 0.023522846327582485, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6223, "step": 1117 }, { "epoch": 0.023543905276846213, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6225, "step": 1118 }, { "epoch": 0.023564964226109937, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6139, "step": 1119 }, { "epoch": 0.023586023175373665, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6082, "step": 1120 }, { "epoch": 0.023607082124637392, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6414, "step": 1121 }, { "epoch": 0.023628141073901116, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6348, "step": 1122 }, { "epoch": 0.023649200023164844, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6217, "step": 1123 }, { "epoch": 0.02367025897242857, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5911, "step": 1124 }, { "epoch": 0.0236913179216923, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6044, "step": 1125 }, { "epoch": 0.023712376870956023, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6305, "step": 1126 }, { "epoch": 0.02373343582021975, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6247, "step": 1127 }, { "epoch": 0.023754494769483478, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5967, "step": 1128 }, { "epoch": 0.023775553718747202, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6199, "step": 1129 }, { "epoch": 0.02379661266801093, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6135, "step": 1130 }, { "epoch": 0.023817671617274657, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6403, "step": 1131 }, { "epoch": 0.02383873056653838, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.624, "step": 1132 }, { "epoch": 0.02385978951580211, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6105, "step": 1133 }, { "epoch": 0.023880848465065836, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6437, "step": 1134 }, { "epoch": 0.023901907414329564, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5916, "step": 1135 }, { "epoch": 0.023922966363593288, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6119, "step": 1136 }, { "epoch": 0.023944025312857015, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6003, "step": 1137 }, { "epoch": 0.023965084262120743, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6303, "step": 1138 }, { "epoch": 0.023986143211384467, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6294, "step": 1139 }, { "epoch": 0.024007202160648194, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6217, "step": 1140 }, { "epoch": 0.024028261109911922, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6383, "step": 1141 }, { "epoch": 0.024049320059175646, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6308, "step": 1142 }, { "epoch": 0.024070379008439374, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6204, "step": 1143 }, { "epoch": 0.0240914379577031, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6054, "step": 1144 }, { "epoch": 0.024112496906966825, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6421, "step": 1145 }, { "epoch": 0.024133555856230553, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6038, "step": 1146 }, { "epoch": 0.02415461480549428, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6592, "step": 1147 }, { "epoch": 0.024175673754758008, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6276, "step": 1148 }, { "epoch": 0.024196732704021732, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5886, "step": 1149 }, { "epoch": 0.02421779165328546, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6242, "step": 1150 }, { "epoch": 0.024238850602549187, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.638, "step": 1151 }, { "epoch": 0.02425990955181291, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.599, "step": 1152 }, { "epoch": 0.02428096850107664, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.605, "step": 1153 }, { "epoch": 0.024302027450340366, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5756, "step": 1154 }, { "epoch": 0.02432308639960409, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.633, "step": 1155 }, { "epoch": 0.024344145348867818, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6105, "step": 1156 }, { "epoch": 0.024365204298131545, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6181, "step": 1157 }, { "epoch": 0.024386263247395273, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.587, "step": 1158 }, { "epoch": 0.024407322196658997, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5906, "step": 1159 }, { "epoch": 0.024428381145922724, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6298, "step": 1160 }, { "epoch": 0.024449440095186452, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5965, "step": 1161 }, { "epoch": 0.024470499044450176, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.608, "step": 1162 }, { "epoch": 0.024491557993713903, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6367, "step": 1163 }, { "epoch": 0.02451261694297763, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6158, "step": 1164 }, { "epoch": 0.024533675892241355, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6029, "step": 1165 }, { "epoch": 0.024554734841505083, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6462, "step": 1166 }, { "epoch": 0.02457579379076881, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6016, "step": 1167 }, { "epoch": 0.024596852740032538, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6033, "step": 1168 }, { "epoch": 0.02461791168929626, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6193, "step": 1169 }, { "epoch": 0.02463897063855999, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6367, "step": 1170 }, { "epoch": 0.024660029587823717, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6041, "step": 1171 }, { "epoch": 0.02468108853708744, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6419, "step": 1172 }, { "epoch": 0.02470214748635117, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5985, "step": 1173 }, { "epoch": 0.024723206435614896, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6452, "step": 1174 }, { "epoch": 0.02474426538487862, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6152, "step": 1175 }, { "epoch": 0.024765324334142347, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5864, "step": 1176 }, { "epoch": 0.024786383283406075, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6626, "step": 1177 }, { "epoch": 0.024807442232669803, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6221, "step": 1178 }, { "epoch": 0.024828501181933527, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.636, "step": 1179 }, { "epoch": 0.024849560131197254, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6143, "step": 1180 }, { "epoch": 0.02487061908046098, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.5912, "step": 1181 }, { "epoch": 0.024891678029724706, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6055, "step": 1182 }, { "epoch": 0.024912736978988433, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6034, "step": 1183 }, { "epoch": 0.02493379592825216, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6055, "step": 1184 }, { "epoch": 0.024954854877515885, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.624, "step": 1185 }, { "epoch": 0.024975913826779612, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6475, "step": 1186 }, { "epoch": 0.02499697277604334, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6437, "step": 1187 }, { "epoch": 0.025018031725307067, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6337, "step": 1188 }, { "epoch": 0.02503909067457079, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6257, "step": 1189 }, { "epoch": 0.02506014962383452, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6502, "step": 1190 }, { "epoch": 0.025081208573098247, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.597, "step": 1191 }, { "epoch": 0.02510226752236197, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6389, "step": 1192 }, { "epoch": 0.025123326471625698, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6071, "step": 1193 }, { "epoch": 0.025144385420889426, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6186, "step": 1194 }, { "epoch": 0.02516544437015315, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.568, "step": 1195 }, { "epoch": 0.025186503319416877, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6266, "step": 1196 }, { "epoch": 0.025207562268680605, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.635, "step": 1197 }, { "epoch": 0.02522862121794433, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6003, "step": 1198 }, { "epoch": 0.025249680167208056, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6364, "step": 1199 }, { "epoch": 0.025270739116471784, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5939, "step": 1200 }, { "epoch": 0.02529179806573551, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6229, "step": 1201 }, { "epoch": 0.025312857014999236, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.583, "step": 1202 }, { "epoch": 0.025333915964262963, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.642, "step": 1203 }, { "epoch": 0.02535497491352669, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.581, "step": 1204 }, { "epoch": 0.025376033862790415, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.594, "step": 1205 }, { "epoch": 0.025397092812054142, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6383, "step": 1206 }, { "epoch": 0.02541815176131787, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5929, "step": 1207 }, { "epoch": 0.025439210710581594, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.636, "step": 1208 }, { "epoch": 0.02546026965984532, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6064, "step": 1209 }, { "epoch": 0.02548132860910905, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.619, "step": 1210 }, { "epoch": 0.025502387558372776, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6399, "step": 1211 }, { "epoch": 0.0255234465076365, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6246, "step": 1212 }, { "epoch": 0.025544505456900228, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6223, "step": 1213 }, { "epoch": 0.025565564406163956, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6077, "step": 1214 }, { "epoch": 0.02558662335542768, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6345, "step": 1215 }, { "epoch": 0.025607682304691407, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6388, "step": 1216 }, { "epoch": 0.025628741253955135, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5735, "step": 1217 }, { "epoch": 0.02564980020321886, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6032, "step": 1218 }, { "epoch": 0.025670859152482586, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6221, "step": 1219 }, { "epoch": 0.025691918101746314, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6351, "step": 1220 }, { "epoch": 0.02571297705101004, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6198, "step": 1221 }, { "epoch": 0.025734036000273765, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6674, "step": 1222 }, { "epoch": 0.025755094949537493, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6306, "step": 1223 }, { "epoch": 0.02577615389880122, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5934, "step": 1224 }, { "epoch": 0.025797212848064945, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6132, "step": 1225 }, { "epoch": 0.025818271797328672, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5858, "step": 1226 }, { "epoch": 0.0258393307465924, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.597, "step": 1227 }, { "epoch": 0.025860389695856124, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5878, "step": 1228 }, { "epoch": 0.02588144864511985, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6001, "step": 1229 }, { "epoch": 0.02590250759438358, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6302, "step": 1230 }, { "epoch": 0.025923566543647306, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6049, "step": 1231 }, { "epoch": 0.02594462549291103, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.6234, "step": 1232 }, { "epoch": 0.025965684442174758, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5932, "step": 1233 }, { "epoch": 0.025986743391438485, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6165, "step": 1234 }, { "epoch": 0.02600780234070221, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5812, "step": 1235 }, { "epoch": 0.026028861289965937, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5991, "step": 1236 }, { "epoch": 0.026049920239229665, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5866, "step": 1237 }, { "epoch": 0.02607097918849339, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6271, "step": 1238 }, { "epoch": 0.026092038137757116, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6186, "step": 1239 }, { "epoch": 0.026113097087020844, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.5955, "step": 1240 }, { "epoch": 0.02613415603628457, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6197, "step": 1241 }, { "epoch": 0.026155214985548295, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6009, "step": 1242 }, { "epoch": 0.026176273934812023, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6218, "step": 1243 }, { "epoch": 0.02619733288407575, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5983, "step": 1244 }, { "epoch": 0.026218391833339474, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5949, "step": 1245 }, { "epoch": 0.026239450782603202, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.592, "step": 1246 }, { "epoch": 0.02626050973186693, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.642, "step": 1247 }, { "epoch": 0.026281568681130654, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5977, "step": 1248 }, { "epoch": 0.02630262763039438, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6097, "step": 1249 }, { "epoch": 0.02632368657965811, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5939, "step": 1250 }, { "epoch": 0.026344745528921836, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6219, "step": 1251 }, { "epoch": 0.02636580447818556, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6273, "step": 1252 }, { "epoch": 0.026386863427449288, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6068, "step": 1253 }, { "epoch": 0.026407922376713015, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5923, "step": 1254 }, { "epoch": 0.02642898132597674, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.635, "step": 1255 }, { "epoch": 0.026450040275240467, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.62, "step": 1256 }, { "epoch": 0.026471099224504194, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5957, "step": 1257 }, { "epoch": 0.02649215817376792, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6102, "step": 1258 }, { "epoch": 0.026513217123031646, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5821, "step": 1259 }, { "epoch": 0.026534276072295374, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.605, "step": 1260 }, { "epoch": 0.026555335021559098, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6108, "step": 1261 }, { "epoch": 0.026576393970822825, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6175, "step": 1262 }, { "epoch": 0.026597452920086553, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6737, "step": 1263 }, { "epoch": 0.02661851186935028, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6139, "step": 1264 }, { "epoch": 0.026639570818614004, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6389, "step": 1265 }, { "epoch": 0.026660629767877732, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6079, "step": 1266 }, { "epoch": 0.02668168871714146, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5814, "step": 1267 }, { "epoch": 0.026702747666405183, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.622, "step": 1268 }, { "epoch": 0.02672380661566891, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6231, "step": 1269 }, { "epoch": 0.02674486556493264, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6189, "step": 1270 }, { "epoch": 0.026765924514196363, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6342, "step": 1271 }, { "epoch": 0.02678698346346009, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6329, "step": 1272 }, { "epoch": 0.026808042412723818, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6079, "step": 1273 }, { "epoch": 0.026829101361987545, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6079, "step": 1274 }, { "epoch": 0.02685016031125127, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6479, "step": 1275 }, { "epoch": 0.026871219260514997, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6052, "step": 1276 }, { "epoch": 0.026892278209778724, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6283, "step": 1277 }, { "epoch": 0.02691333715904245, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5903, "step": 1278 }, { "epoch": 0.026934396108306176, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6379, "step": 1279 }, { "epoch": 0.026955455057569903, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6033, "step": 1280 }, { "epoch": 0.026976514006833627, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5896, "step": 1281 }, { "epoch": 0.026997572956097355, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5876, "step": 1282 }, { "epoch": 0.027018631905361083, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.6024, "step": 1283 }, { "epoch": 0.02703969085462481, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6562, "step": 1284 }, { "epoch": 0.027060749803888534, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5927, "step": 1285 }, { "epoch": 0.02708180875315226, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6031, "step": 1286 }, { "epoch": 0.02710286770241599, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5868, "step": 1287 }, { "epoch": 0.027123926651679713, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6199, "step": 1288 }, { "epoch": 0.02714498560094344, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5953, "step": 1289 }, { "epoch": 0.02716604455020717, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5842, "step": 1290 }, { "epoch": 0.027187103499470892, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6007, "step": 1291 }, { "epoch": 0.02720816244873462, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.624, "step": 1292 }, { "epoch": 0.027229221397998347, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6411, "step": 1293 }, { "epoch": 0.027250280347262075, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.634, "step": 1294 }, { "epoch": 0.0272713392965258, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.6284, "step": 1295 }, { "epoch": 0.027292398245789527, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5982, "step": 1296 }, { "epoch": 0.027313457195053254, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6197, "step": 1297 }, { "epoch": 0.027334516144316978, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6282, "step": 1298 }, { "epoch": 0.027355575093580706, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5901, "step": 1299 }, { "epoch": 0.027376634042844433, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6165, "step": 1300 }, { "epoch": 0.027397692992108157, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.6153, "step": 1301 }, { "epoch": 0.027418751941371885, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5959, "step": 1302 }, { "epoch": 0.027439810890635612, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6139, "step": 1303 }, { "epoch": 0.02746086983989934, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6059, "step": 1304 }, { "epoch": 0.027481928789163064, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.5903, "step": 1305 }, { "epoch": 0.02750298773842679, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.623, "step": 1306 }, { "epoch": 0.02752404668769052, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6088, "step": 1307 }, { "epoch": 0.027545105636954243, "grad_norm": 0.1328125, "learning_rate": 0.0002, "loss": 1.6319, "step": 1308 }, { "epoch": 0.02756616458621797, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6179, "step": 1309 }, { "epoch": 0.027587223535481698, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.618, "step": 1310 }, { "epoch": 0.027608282484745422, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6211, "step": 1311 }, { "epoch": 0.02762934143400915, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6086, "step": 1312 }, { "epoch": 0.027650400383272877, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6581, "step": 1313 }, { "epoch": 0.0276714593325366, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.578, "step": 1314 }, { "epoch": 0.02769251828180033, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6041, "step": 1315 }, { "epoch": 0.027713577231064056, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5705, "step": 1316 }, { "epoch": 0.027734636180327784, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6264, "step": 1317 }, { "epoch": 0.027755695129591508, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6015, "step": 1318 }, { "epoch": 0.027776754078855236, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.642, "step": 1319 }, { "epoch": 0.027797813028118963, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6443, "step": 1320 }, { "epoch": 0.027818871977382687, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6177, "step": 1321 }, { "epoch": 0.027839930926646415, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6113, "step": 1322 }, { "epoch": 0.027860989875910142, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5953, "step": 1323 }, { "epoch": 0.027882048825173866, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.613, "step": 1324 }, { "epoch": 0.027903107774437594, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6342, "step": 1325 }, { "epoch": 0.02792416672370132, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6219, "step": 1326 }, { "epoch": 0.02794522567296505, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6043, "step": 1327 }, { "epoch": 0.027966284622228773, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6305, "step": 1328 }, { "epoch": 0.0279873435714925, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6052, "step": 1329 }, { "epoch": 0.028008402520756228, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6224, "step": 1330 }, { "epoch": 0.028029461470019952, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6226, "step": 1331 }, { "epoch": 0.02805052041928368, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6226, "step": 1332 }, { "epoch": 0.028071579368547407, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6256, "step": 1333 }, { "epoch": 0.02809263831781113, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.6391, "step": 1334 }, { "epoch": 0.02811369726707486, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6069, "step": 1335 }, { "epoch": 0.028134756216338586, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6228, "step": 1336 }, { "epoch": 0.028155815165602314, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5944, "step": 1337 }, { "epoch": 0.028176874114866038, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6236, "step": 1338 }, { "epoch": 0.028197933064129765, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5985, "step": 1339 }, { "epoch": 0.028218992013393493, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6352, "step": 1340 }, { "epoch": 0.028240050962657217, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6386, "step": 1341 }, { "epoch": 0.028261109911920945, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6032, "step": 1342 }, { "epoch": 0.028282168861184672, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6072, "step": 1343 }, { "epoch": 0.028303227810448396, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.632, "step": 1344 }, { "epoch": 0.028324286759712124, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6416, "step": 1345 }, { "epoch": 0.02834534570897585, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.623, "step": 1346 }, { "epoch": 0.02836640465823958, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.6257, "step": 1347 }, { "epoch": 0.028387463607503303, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5952, "step": 1348 }, { "epoch": 0.02840852255676703, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6117, "step": 1349 }, { "epoch": 0.028429581506030758, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6349, "step": 1350 }, { "epoch": 0.028450640455294482, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6076, "step": 1351 }, { "epoch": 0.02847169940455821, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.627, "step": 1352 }, { "epoch": 0.028492758353821937, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6404, "step": 1353 }, { "epoch": 0.02851381730308566, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5884, "step": 1354 }, { "epoch": 0.02853487625234939, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5958, "step": 1355 }, { "epoch": 0.028555935201613116, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6203, "step": 1356 }, { "epoch": 0.028576994150876844, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5905, "step": 1357 }, { "epoch": 0.028598053100140568, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6162, "step": 1358 }, { "epoch": 0.028619112049404295, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6322, "step": 1359 }, { "epoch": 0.028640170998668023, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.626, "step": 1360 }, { "epoch": 0.028661229947931747, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.594, "step": 1361 }, { "epoch": 0.028682288897195474, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.565, "step": 1362 }, { "epoch": 0.028703347846459202, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6193, "step": 1363 }, { "epoch": 0.028724406795722926, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.622, "step": 1364 }, { "epoch": 0.028745465744986654, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5932, "step": 1365 }, { "epoch": 0.02876652469425038, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6126, "step": 1366 }, { "epoch": 0.02878758364351411, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6159, "step": 1367 }, { "epoch": 0.028808642592777833, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6521, "step": 1368 }, { "epoch": 0.02882970154204156, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6255, "step": 1369 }, { "epoch": 0.028850760491305288, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5906, "step": 1370 }, { "epoch": 0.028871819440569012, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6229, "step": 1371 }, { "epoch": 0.02889287838983274, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6187, "step": 1372 }, { "epoch": 0.028913937339096467, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5872, "step": 1373 }, { "epoch": 0.02893499628836019, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6163, "step": 1374 }, { "epoch": 0.02895605523762392, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6029, "step": 1375 }, { "epoch": 0.028977114186887646, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6159, "step": 1376 }, { "epoch": 0.02899817313615137, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6296, "step": 1377 }, { "epoch": 0.029019232085415098, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6194, "step": 1378 }, { "epoch": 0.029040291034678825, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6062, "step": 1379 }, { "epoch": 0.029061349983942553, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6184, "step": 1380 }, { "epoch": 0.029082408933206277, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6045, "step": 1381 }, { "epoch": 0.029103467882470004, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6155, "step": 1382 }, { "epoch": 0.029124526831733732, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.615, "step": 1383 }, { "epoch": 0.029145585780997456, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.618, "step": 1384 }, { "epoch": 0.029166644730261183, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5576, "step": 1385 }, { "epoch": 0.02918770367952491, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6689, "step": 1386 }, { "epoch": 0.029208762628788635, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6087, "step": 1387 }, { "epoch": 0.029229821578052363, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6104, "step": 1388 }, { "epoch": 0.02925088052731609, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6187, "step": 1389 }, { "epoch": 0.029271939476579818, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5843, "step": 1390 }, { "epoch": 0.02929299842584354, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6471, "step": 1391 }, { "epoch": 0.02931405737510727, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.616, "step": 1392 }, { "epoch": 0.029335116324370997, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6148, "step": 1393 }, { "epoch": 0.02935617527363472, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6599, "step": 1394 }, { "epoch": 0.02937723422289845, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.6011, "step": 1395 }, { "epoch": 0.029398293172162176, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6202, "step": 1396 }, { "epoch": 0.0294193521214259, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6336, "step": 1397 }, { "epoch": 0.029440411070689627, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6078, "step": 1398 }, { "epoch": 0.029461470019953355, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6393, "step": 1399 }, { "epoch": 0.029482528969217082, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6385, "step": 1400 }, { "epoch": 0.029503587918480807, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6185, "step": 1401 }, { "epoch": 0.029524646867744534, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.636, "step": 1402 }, { "epoch": 0.02954570581700826, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5728, "step": 1403 }, { "epoch": 0.029566764766271986, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6191, "step": 1404 }, { "epoch": 0.029587823715535713, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6167, "step": 1405 }, { "epoch": 0.02960888266479944, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.619, "step": 1406 }, { "epoch": 0.029629941614063165, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6011, "step": 1407 }, { "epoch": 0.029651000563326892, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5973, "step": 1408 }, { "epoch": 0.02967205951259062, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6077, "step": 1409 }, { "epoch": 0.029693118461854347, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6238, "step": 1410 }, { "epoch": 0.02971417741111807, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6273, "step": 1411 }, { "epoch": 0.0297352363603818, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6019, "step": 1412 }, { "epoch": 0.029756295309645527, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5945, "step": 1413 }, { "epoch": 0.02977735425890925, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.637, "step": 1414 }, { "epoch": 0.029798413208172978, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5983, "step": 1415 }, { "epoch": 0.029819472157436706, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6058, "step": 1416 }, { "epoch": 0.02984053110670043, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6207, "step": 1417 }, { "epoch": 0.029861590055964157, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.614, "step": 1418 }, { "epoch": 0.029882649005227885, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6096, "step": 1419 }, { "epoch": 0.029903707954491612, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5942, "step": 1420 }, { "epoch": 0.029924766903755336, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6093, "step": 1421 }, { "epoch": 0.029945825853019064, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.6512, "step": 1422 }, { "epoch": 0.02996688480228279, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6429, "step": 1423 }, { "epoch": 0.029987943751546516, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5726, "step": 1424 }, { "epoch": 0.030009002700810243, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5582, "step": 1425 }, { "epoch": 0.03003006165007397, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6285, "step": 1426 }, { "epoch": 0.030051120599337695, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6457, "step": 1427 }, { "epoch": 0.030072179548601422, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6293, "step": 1428 }, { "epoch": 0.03009323849786515, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6092, "step": 1429 }, { "epoch": 0.030114297447128874, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6335, "step": 1430 }, { "epoch": 0.0301353563963926, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5601, "step": 1431 }, { "epoch": 0.03015641534565633, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.634, "step": 1432 }, { "epoch": 0.030177474294920056, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5892, "step": 1433 }, { "epoch": 0.03019853324418378, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6294, "step": 1434 }, { "epoch": 0.030219592193447508, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6103, "step": 1435 }, { "epoch": 0.030240651142711236, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6137, "step": 1436 }, { "epoch": 0.03026171009197496, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6093, "step": 1437 }, { "epoch": 0.030282769041238687, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5959, "step": 1438 }, { "epoch": 0.030303827990502415, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6133, "step": 1439 }, { "epoch": 0.03032488693976614, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5965, "step": 1440 }, { "epoch": 0.030345945889029866, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.651, "step": 1441 }, { "epoch": 0.030367004838293594, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6389, "step": 1442 }, { "epoch": 0.03038806378755732, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6395, "step": 1443 }, { "epoch": 0.030409122736821045, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5959, "step": 1444 }, { "epoch": 0.030430181686084773, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5967, "step": 1445 }, { "epoch": 0.0304512406353485, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6281, "step": 1446 }, { "epoch": 0.030472299584612225, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.624, "step": 1447 }, { "epoch": 0.030493358533875952, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6063, "step": 1448 }, { "epoch": 0.03051441748313968, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6102, "step": 1449 }, { "epoch": 0.030535476432403404, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6239, "step": 1450 }, { "epoch": 0.03055653538166713, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6007, "step": 1451 }, { "epoch": 0.03057759433093086, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6162, "step": 1452 }, { "epoch": 0.030598653280194586, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6271, "step": 1453 }, { "epoch": 0.03061971222945831, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6057, "step": 1454 }, { "epoch": 0.030640771178722038, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6431, "step": 1455 }, { "epoch": 0.030661830127985765, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6073, "step": 1456 }, { "epoch": 0.03068288907724949, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6251, "step": 1457 }, { "epoch": 0.030703948026513217, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6312, "step": 1458 }, { "epoch": 0.030725006975776945, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6331, "step": 1459 }, { "epoch": 0.03074606592504067, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6186, "step": 1460 }, { "epoch": 0.030767124874304396, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5831, "step": 1461 }, { "epoch": 0.030788183823568124, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6044, "step": 1462 }, { "epoch": 0.03080924277283185, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5956, "step": 1463 }, { "epoch": 0.030830301722095575, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.58, "step": 1464 }, { "epoch": 0.030851360671359303, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5907, "step": 1465 }, { "epoch": 0.03087241962062303, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5986, "step": 1466 }, { "epoch": 0.030893478569886754, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6344, "step": 1467 }, { "epoch": 0.030914537519150482, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5903, "step": 1468 }, { "epoch": 0.03093559646841421, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6097, "step": 1469 }, { "epoch": 0.030956655417677933, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5944, "step": 1470 }, { "epoch": 0.03097771436694166, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6254, "step": 1471 }, { "epoch": 0.03099877331620539, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5912, "step": 1472 }, { "epoch": 0.031019832265469116, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6188, "step": 1473 }, { "epoch": 0.03104089121473284, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.633, "step": 1474 }, { "epoch": 0.031061950163996568, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5669, "step": 1475 }, { "epoch": 0.031083009113260295, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6276, "step": 1476 }, { "epoch": 0.03110406806252402, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6197, "step": 1477 }, { "epoch": 0.031125127011787747, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5737, "step": 1478 }, { "epoch": 0.031146185961051474, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6145, "step": 1479 }, { "epoch": 0.0311672449103152, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6141, "step": 1480 }, { "epoch": 0.031188303859578926, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6139, "step": 1481 }, { "epoch": 0.031209362808842653, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.631, "step": 1482 }, { "epoch": 0.031230421758106378, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5815, "step": 1483 }, { "epoch": 0.031251480707370105, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5871, "step": 1484 }, { "epoch": 0.03127253965663383, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6116, "step": 1485 }, { "epoch": 0.03129359860589756, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5951, "step": 1486 }, { "epoch": 0.03131465755516129, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6043, "step": 1487 }, { "epoch": 0.03133571650442501, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6482, "step": 1488 }, { "epoch": 0.031356775453688736, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6271, "step": 1489 }, { "epoch": 0.03137783440295246, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5916, "step": 1490 }, { "epoch": 0.03139889335221619, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.605, "step": 1491 }, { "epoch": 0.03141995230147992, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6277, "step": 1492 }, { "epoch": 0.031441011250743646, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.629, "step": 1493 }, { "epoch": 0.031462070200007373, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6005, "step": 1494 }, { "epoch": 0.031483129149271094, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6365, "step": 1495 }, { "epoch": 0.03150418809853482, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.59, "step": 1496 }, { "epoch": 0.03152524704779855, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6138, "step": 1497 }, { "epoch": 0.03154630599706228, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6167, "step": 1498 }, { "epoch": 0.031567364946326004, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6102, "step": 1499 }, { "epoch": 0.03158842389558973, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6836, "step": 1500 }, { "epoch": 0.03158842389558973, "eval_loss": 1.7313117980957031, "eval_runtime": 898.7214, "eval_samples_per_second": 68.764, "eval_steps_per_second": 2.15, "step": 1500 }, { "epoch": 0.03160948284485346, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5911, "step": 1501 }, { "epoch": 0.03163054179411718, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6335, "step": 1502 }, { "epoch": 0.03165160074338091, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6045, "step": 1503 }, { "epoch": 0.031672659692644635, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5905, "step": 1504 }, { "epoch": 0.03169371864190836, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5942, "step": 1505 }, { "epoch": 0.03171477759117209, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6136, "step": 1506 }, { "epoch": 0.03173583654043582, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6084, "step": 1507 }, { "epoch": 0.03175689548969954, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5827, "step": 1508 }, { "epoch": 0.031777954438963266, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6446, "step": 1509 }, { "epoch": 0.03179901338822699, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6009, "step": 1510 }, { "epoch": 0.03182007233749072, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5915, "step": 1511 }, { "epoch": 0.03184113128675445, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6117, "step": 1512 }, { "epoch": 0.031862190236018176, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6005, "step": 1513 }, { "epoch": 0.0318832491852819, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.626, "step": 1514 }, { "epoch": 0.031904308134545624, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6183, "step": 1515 }, { "epoch": 0.03192536708380935, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5971, "step": 1516 }, { "epoch": 0.03194642603307308, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6177, "step": 1517 }, { "epoch": 0.031967484982336807, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.615, "step": 1518 }, { "epoch": 0.031988543931600534, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5933, "step": 1519 }, { "epoch": 0.03200960288086426, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5807, "step": 1520 }, { "epoch": 0.03203066183012799, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6462, "step": 1521 }, { "epoch": 0.03205172077939171, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6035, "step": 1522 }, { "epoch": 0.03207277972865544, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5976, "step": 1523 }, { "epoch": 0.032093838677919165, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5999, "step": 1524 }, { "epoch": 0.03211489762718289, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6359, "step": 1525 }, { "epoch": 0.03213595657644662, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6127, "step": 1526 }, { "epoch": 0.03215701552571035, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6089, "step": 1527 }, { "epoch": 0.03217807447497407, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6326, "step": 1528 }, { "epoch": 0.032199133424237796, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5891, "step": 1529 }, { "epoch": 0.03222019237350152, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5946, "step": 1530 }, { "epoch": 0.03224125132276525, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6025, "step": 1531 }, { "epoch": 0.03226231027202898, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6143, "step": 1532 }, { "epoch": 0.032283369221292706, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6418, "step": 1533 }, { "epoch": 0.03230442817055643, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6017, "step": 1534 }, { "epoch": 0.032325487119820154, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6064, "step": 1535 }, { "epoch": 0.03234654606908388, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6064, "step": 1536 }, { "epoch": 0.03236760501834761, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6277, "step": 1537 }, { "epoch": 0.032388663967611336, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5879, "step": 1538 }, { "epoch": 0.032409722916875064, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6506, "step": 1539 }, { "epoch": 0.03243078186613879, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6257, "step": 1540 }, { "epoch": 0.03245184081540251, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.621, "step": 1541 }, { "epoch": 0.03247289976466624, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6056, "step": 1542 }, { "epoch": 0.03249395871392997, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6221, "step": 1543 }, { "epoch": 0.032515017663193695, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6464, "step": 1544 }, { "epoch": 0.03253607661245742, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6132, "step": 1545 }, { "epoch": 0.03255713556172115, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.628, "step": 1546 }, { "epoch": 0.03257819451098488, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6171, "step": 1547 }, { "epoch": 0.0325992534602486, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5599, "step": 1548 }, { "epoch": 0.032620312409512325, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.574, "step": 1549 }, { "epoch": 0.03264137135877605, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5811, "step": 1550 }, { "epoch": 0.03266243030803978, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5824, "step": 1551 }, { "epoch": 0.03268348925730351, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6265, "step": 1552 }, { "epoch": 0.032704548206567235, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6438, "step": 1553 }, { "epoch": 0.03272560715583096, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6078, "step": 1554 }, { "epoch": 0.032746666105094684, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6268, "step": 1555 }, { "epoch": 0.03276772505435841, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5757, "step": 1556 }, { "epoch": 0.03278878400362214, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6165, "step": 1557 }, { "epoch": 0.032809842952885866, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6161, "step": 1558 }, { "epoch": 0.032830901902149594, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6304, "step": 1559 }, { "epoch": 0.03285196085141332, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6509, "step": 1560 }, { "epoch": 0.03287301980067704, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5895, "step": 1561 }, { "epoch": 0.03289407874994077, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6204, "step": 1562 }, { "epoch": 0.0329151376992045, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5967, "step": 1563 }, { "epoch": 0.032936196648468224, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.627, "step": 1564 }, { "epoch": 0.03295725559773195, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6286, "step": 1565 }, { "epoch": 0.03297831454699568, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.588, "step": 1566 }, { "epoch": 0.03299937349625941, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6567, "step": 1567 }, { "epoch": 0.03302043244552313, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6112, "step": 1568 }, { "epoch": 0.033041491394786855, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5737, "step": 1569 }, { "epoch": 0.03306255034405058, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.606, "step": 1570 }, { "epoch": 0.03308360929331431, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.6129, "step": 1571 }, { "epoch": 0.03310466824257804, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5904, "step": 1572 }, { "epoch": 0.033125727191841765, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6233, "step": 1573 }, { "epoch": 0.03314678614110549, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6132, "step": 1574 }, { "epoch": 0.03316784509036921, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5877, "step": 1575 }, { "epoch": 0.03318890403963294, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5909, "step": 1576 }, { "epoch": 0.03320996298889667, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6259, "step": 1577 }, { "epoch": 0.033231021938160396, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6395, "step": 1578 }, { "epoch": 0.033252080887424124, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6033, "step": 1579 }, { "epoch": 0.03327313983668785, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5902, "step": 1580 }, { "epoch": 0.03329419878595157, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6249, "step": 1581 }, { "epoch": 0.0333152577352153, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6291, "step": 1582 }, { "epoch": 0.03333631668447903, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6002, "step": 1583 }, { "epoch": 0.033357375633742754, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5868, "step": 1584 }, { "epoch": 0.03337843458300648, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5746, "step": 1585 }, { "epoch": 0.03339949353227021, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6275, "step": 1586 }, { "epoch": 0.03342055248153394, "grad_norm": 0.12890625, "learning_rate": 0.0002, "loss": 1.5987, "step": 1587 }, { "epoch": 0.03344161143079766, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6521, "step": 1588 }, { "epoch": 0.033462670380061385, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5996, "step": 1589 }, { "epoch": 0.03348372932932511, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6361, "step": 1590 }, { "epoch": 0.03350478827858884, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6203, "step": 1591 }, { "epoch": 0.03352584722785257, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5911, "step": 1592 }, { "epoch": 0.033546906177116295, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.629, "step": 1593 }, { "epoch": 0.03356796512638002, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6434, "step": 1594 }, { "epoch": 0.03358902407564374, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6314, "step": 1595 }, { "epoch": 0.03361008302490747, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6035, "step": 1596 }, { "epoch": 0.0336311419741712, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6554, "step": 1597 }, { "epoch": 0.033652200923434926, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6018, "step": 1598 }, { "epoch": 0.03367325987269865, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5738, "step": 1599 }, { "epoch": 0.03369431882196238, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5935, "step": 1600 }, { "epoch": 0.0337153777712261, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6285, "step": 1601 }, { "epoch": 0.03373643672048983, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6414, "step": 1602 }, { "epoch": 0.03375749566975356, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6071, "step": 1603 }, { "epoch": 0.033778554619017284, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6192, "step": 1604 }, { "epoch": 0.03379961356828101, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6178, "step": 1605 }, { "epoch": 0.03382067251754474, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6154, "step": 1606 }, { "epoch": 0.03384173146680847, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6021, "step": 1607 }, { "epoch": 0.03386279041607219, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6044, "step": 1608 }, { "epoch": 0.033883849365335915, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6268, "step": 1609 }, { "epoch": 0.03390490831459964, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6323, "step": 1610 }, { "epoch": 0.03392596726386337, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6262, "step": 1611 }, { "epoch": 0.0339470262131271, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6007, "step": 1612 }, { "epoch": 0.033968085162390825, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5836, "step": 1613 }, { "epoch": 0.033989144111654546, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6018, "step": 1614 }, { "epoch": 0.03401020306091827, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.614, "step": 1615 }, { "epoch": 0.034031262010182, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6194, "step": 1616 }, { "epoch": 0.03405232095944573, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.589, "step": 1617 }, { "epoch": 0.034073379908709456, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6081, "step": 1618 }, { "epoch": 0.03409443885797318, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5886, "step": 1619 }, { "epoch": 0.03411549780723691, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6132, "step": 1620 }, { "epoch": 0.03413655675650063, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6266, "step": 1621 }, { "epoch": 0.03415761570576436, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6185, "step": 1622 }, { "epoch": 0.034178674655028086, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5941, "step": 1623 }, { "epoch": 0.034199733604291814, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6127, "step": 1624 }, { "epoch": 0.03422079255355554, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6169, "step": 1625 }, { "epoch": 0.03424185150281927, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6155, "step": 1626 }, { "epoch": 0.034262910452083, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5958, "step": 1627 }, { "epoch": 0.03428396940134672, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6116, "step": 1628 }, { "epoch": 0.034305028350610445, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.628, "step": 1629 }, { "epoch": 0.03432608729987417, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5775, "step": 1630 }, { "epoch": 0.0343471462491379, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5662, "step": 1631 }, { "epoch": 0.03436820519840163, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5872, "step": 1632 }, { "epoch": 0.034389264147665355, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6273, "step": 1633 }, { "epoch": 0.034410323096929075, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5463, "step": 1634 }, { "epoch": 0.0344313820461928, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.605, "step": 1635 }, { "epoch": 0.03445244099545653, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6065, "step": 1636 }, { "epoch": 0.03447349994472026, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6039, "step": 1637 }, { "epoch": 0.034494558893983986, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6063, "step": 1638 }, { "epoch": 0.03451561784324771, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6061, "step": 1639 }, { "epoch": 0.03453667679251144, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6303, "step": 1640 }, { "epoch": 0.03455773574177516, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6254, "step": 1641 }, { "epoch": 0.03457879469103889, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5806, "step": 1642 }, { "epoch": 0.034599853640302616, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6324, "step": 1643 }, { "epoch": 0.034620912589566344, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6107, "step": 1644 }, { "epoch": 0.03464197153883007, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6329, "step": 1645 }, { "epoch": 0.0346630304880938, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5787, "step": 1646 }, { "epoch": 0.034684089437357526, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6358, "step": 1647 }, { "epoch": 0.03470514838662125, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5805, "step": 1648 }, { "epoch": 0.034726207335884975, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6357, "step": 1649 }, { "epoch": 0.0347472662851487, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.6247, "step": 1650 }, { "epoch": 0.03476832523441243, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5739, "step": 1651 }, { "epoch": 0.03478938418367616, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.611, "step": 1652 }, { "epoch": 0.034810443132939885, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5943, "step": 1653 }, { "epoch": 0.034831502082203605, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6355, "step": 1654 }, { "epoch": 0.03485256103146733, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6046, "step": 1655 }, { "epoch": 0.03487361998073106, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5793, "step": 1656 }, { "epoch": 0.03489467892999479, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5959, "step": 1657 }, { "epoch": 0.034915737879258515, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6051, "step": 1658 }, { "epoch": 0.03493679682852224, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5824, "step": 1659 }, { "epoch": 0.03495785577778597, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6045, "step": 1660 }, { "epoch": 0.03497891472704969, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6156, "step": 1661 }, { "epoch": 0.03499997367631342, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6329, "step": 1662 }, { "epoch": 0.035021032625577146, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6073, "step": 1663 }, { "epoch": 0.035042091574840874, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.664, "step": 1664 }, { "epoch": 0.0350631505241046, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6037, "step": 1665 }, { "epoch": 0.03508420947336833, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5835, "step": 1666 }, { "epoch": 0.03510526842263205, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5762, "step": 1667 }, { "epoch": 0.03512632737189578, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6209, "step": 1668 }, { "epoch": 0.035147386321159504, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5672, "step": 1669 }, { "epoch": 0.03516844527042323, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6377, "step": 1670 }, { "epoch": 0.03518950421968696, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.61, "step": 1671 }, { "epoch": 0.03521056316895069, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6655, "step": 1672 }, { "epoch": 0.035231622118214415, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6076, "step": 1673 }, { "epoch": 0.035252681067478135, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6174, "step": 1674 }, { "epoch": 0.03527374001674186, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6268, "step": 1675 }, { "epoch": 0.03529479896600559, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6115, "step": 1676 }, { "epoch": 0.03531585791526932, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6236, "step": 1677 }, { "epoch": 0.035336916864533045, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.592, "step": 1678 }, { "epoch": 0.03535797581379677, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5799, "step": 1679 }, { "epoch": 0.0353790347630605, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6257, "step": 1680 }, { "epoch": 0.03540009371232422, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5978, "step": 1681 }, { "epoch": 0.03542115266158795, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.575, "step": 1682 }, { "epoch": 0.035442211610851676, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6092, "step": 1683 }, { "epoch": 0.035463270560115404, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6042, "step": 1684 }, { "epoch": 0.03548432950937913, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6263, "step": 1685 }, { "epoch": 0.03550538845864286, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6079, "step": 1686 }, { "epoch": 0.03552644740790658, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5731, "step": 1687 }, { "epoch": 0.03554750635717031, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5906, "step": 1688 }, { "epoch": 0.035568565306434034, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5599, "step": 1689 }, { "epoch": 0.03558962425569776, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6101, "step": 1690 }, { "epoch": 0.03561068320496149, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5947, "step": 1691 }, { "epoch": 0.03563174215422522, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.5805, "step": 1692 }, { "epoch": 0.035652801103488944, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6101, "step": 1693 }, { "epoch": 0.035673860052752665, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6348, "step": 1694 }, { "epoch": 0.03569491900201639, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5941, "step": 1695 }, { "epoch": 0.03571597795128012, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5894, "step": 1696 }, { "epoch": 0.03573703690054385, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5782, "step": 1697 }, { "epoch": 0.035758095849807575, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.6318, "step": 1698 }, { "epoch": 0.0357791547990713, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5965, "step": 1699 }, { "epoch": 0.03580021374833503, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5629, "step": 1700 }, { "epoch": 0.03582127269759875, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5906, "step": 1701 }, { "epoch": 0.03584233164686248, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5796, "step": 1702 }, { "epoch": 0.035863390596126206, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5959, "step": 1703 }, { "epoch": 0.03588444954538993, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6076, "step": 1704 }, { "epoch": 0.03590550849465366, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5982, "step": 1705 }, { "epoch": 0.03592656744391739, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6165, "step": 1706 }, { "epoch": 0.03594762639318111, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6494, "step": 1707 }, { "epoch": 0.03596868534244484, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5864, "step": 1708 }, { "epoch": 0.035989744291708564, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6305, "step": 1709 }, { "epoch": 0.03601080324097229, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5995, "step": 1710 }, { "epoch": 0.03603186219023602, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6088, "step": 1711 }, { "epoch": 0.03605292113949975, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6147, "step": 1712 }, { "epoch": 0.036073980088763474, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6155, "step": 1713 }, { "epoch": 0.036095039038027195, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6134, "step": 1714 }, { "epoch": 0.03611609798729092, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5811, "step": 1715 }, { "epoch": 0.03613715693655465, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5833, "step": 1716 }, { "epoch": 0.03615821588581838, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6356, "step": 1717 }, { "epoch": 0.036179274835082105, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5993, "step": 1718 }, { "epoch": 0.03620033378434583, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6024, "step": 1719 }, { "epoch": 0.03622139273360955, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6354, "step": 1720 }, { "epoch": 0.03624245168287328, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6091, "step": 1721 }, { "epoch": 0.03626351063213701, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5913, "step": 1722 }, { "epoch": 0.036284569581400736, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5904, "step": 1723 }, { "epoch": 0.03630562853066446, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6041, "step": 1724 }, { "epoch": 0.03632668747992819, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6058, "step": 1725 }, { "epoch": 0.03634774642919192, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5993, "step": 1726 }, { "epoch": 0.03636880537845564, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6136, "step": 1727 }, { "epoch": 0.036389864327719366, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6139, "step": 1728 }, { "epoch": 0.036410923276983094, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5759, "step": 1729 }, { "epoch": 0.03643198222624682, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.592, "step": 1730 }, { "epoch": 0.03645304117551055, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5542, "step": 1731 }, { "epoch": 0.03647410012477428, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6252, "step": 1732 }, { "epoch": 0.036495159074038004, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5766, "step": 1733 }, { "epoch": 0.036516218023301725, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6156, "step": 1734 }, { "epoch": 0.03653727697256545, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6018, "step": 1735 }, { "epoch": 0.03655833592182918, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6105, "step": 1736 }, { "epoch": 0.03657939487109291, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.613, "step": 1737 }, { "epoch": 0.036600453820356635, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5816, "step": 1738 }, { "epoch": 0.03662151276962036, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6076, "step": 1739 }, { "epoch": 0.03664257171888408, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6547, "step": 1740 }, { "epoch": 0.03666363066814781, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6218, "step": 1741 }, { "epoch": 0.03668468961741154, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6065, "step": 1742 }, { "epoch": 0.036705748566675266, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6045, "step": 1743 }, { "epoch": 0.03672680751593899, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5764, "step": 1744 }, { "epoch": 0.03674786646520272, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6176, "step": 1745 }, { "epoch": 0.03676892541446645, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6423, "step": 1746 }, { "epoch": 0.03678998436373017, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6186, "step": 1747 }, { "epoch": 0.036811043312993896, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6053, "step": 1748 }, { "epoch": 0.036832102262257624, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6318, "step": 1749 }, { "epoch": 0.03685316121152135, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6391, "step": 1750 }, { "epoch": 0.03687422016078508, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5968, "step": 1751 }, { "epoch": 0.036895279110048806, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6101, "step": 1752 }, { "epoch": 0.036916338059312534, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6168, "step": 1753 }, { "epoch": 0.036937397008576255, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5969, "step": 1754 }, { "epoch": 0.03695845595783998, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5838, "step": 1755 }, { "epoch": 0.03697951490710371, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5844, "step": 1756 }, { "epoch": 0.03700057385636744, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.5927, "step": 1757 }, { "epoch": 0.037021632805631165, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6227, "step": 1758 }, { "epoch": 0.03704269175489489, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6242, "step": 1759 }, { "epoch": 0.03706375070415861, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5955, "step": 1760 }, { "epoch": 0.03708480965342234, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6174, "step": 1761 }, { "epoch": 0.03710586860268607, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.6206, "step": 1762 }, { "epoch": 0.037126927551949795, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6082, "step": 1763 }, { "epoch": 0.03714798650121352, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.579, "step": 1764 }, { "epoch": 0.03716904545047725, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5848, "step": 1765 }, { "epoch": 0.03719010439974098, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6277, "step": 1766 }, { "epoch": 0.0372111633490047, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6189, "step": 1767 }, { "epoch": 0.037232222298268426, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5865, "step": 1768 }, { "epoch": 0.037253281247532154, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.6111, "step": 1769 }, { "epoch": 0.03727434019679588, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5833, "step": 1770 }, { "epoch": 0.03729539914605961, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6287, "step": 1771 }, { "epoch": 0.037316458095323336, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5966, "step": 1772 }, { "epoch": 0.03733751704458706, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6174, "step": 1773 }, { "epoch": 0.037358575993850784, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6008, "step": 1774 }, { "epoch": 0.03737963494311451, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6294, "step": 1775 }, { "epoch": 0.03740069389237824, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5896, "step": 1776 }, { "epoch": 0.03742175284164197, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6171, "step": 1777 }, { "epoch": 0.037442811790905695, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5954, "step": 1778 }, { "epoch": 0.03746387074016942, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6256, "step": 1779 }, { "epoch": 0.03748492968943314, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5872, "step": 1780 }, { "epoch": 0.03750598863869687, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5856, "step": 1781 }, { "epoch": 0.0375270475879606, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5942, "step": 1782 }, { "epoch": 0.037548106537224325, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5825, "step": 1783 }, { "epoch": 0.03756916548648805, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5898, "step": 1784 }, { "epoch": 0.03759022443575178, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6136, "step": 1785 }, { "epoch": 0.03761128338501551, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5853, "step": 1786 }, { "epoch": 0.03763234233427923, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6042, "step": 1787 }, { "epoch": 0.037653401283542956, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5865, "step": 1788 }, { "epoch": 0.037674460232806684, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5853, "step": 1789 }, { "epoch": 0.03769551918207041, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5975, "step": 1790 }, { "epoch": 0.03771657813133414, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6065, "step": 1791 }, { "epoch": 0.037737637080597866, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.608, "step": 1792 }, { "epoch": 0.03775869602986159, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5896, "step": 1793 }, { "epoch": 0.037779754979125314, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6268, "step": 1794 }, { "epoch": 0.03780081392838904, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5722, "step": 1795 }, { "epoch": 0.03782187287765277, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6395, "step": 1796 }, { "epoch": 0.0378429318269165, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.595, "step": 1797 }, { "epoch": 0.037863990776180224, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6087, "step": 1798 }, { "epoch": 0.03788504972544395, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6041, "step": 1799 }, { "epoch": 0.03790610867470767, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6147, "step": 1800 }, { "epoch": 0.0379271676239714, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5663, "step": 1801 }, { "epoch": 0.03794822657323513, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5874, "step": 1802 }, { "epoch": 0.037969285522498855, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5665, "step": 1803 }, { "epoch": 0.03799034447176258, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6276, "step": 1804 }, { "epoch": 0.03801140342102631, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6161, "step": 1805 }, { "epoch": 0.03803246237029004, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6328, "step": 1806 }, { "epoch": 0.03805352131955376, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6246, "step": 1807 }, { "epoch": 0.038074580268817486, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5932, "step": 1808 }, { "epoch": 0.03809563921808121, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6207, "step": 1809 }, { "epoch": 0.03811669816734494, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6203, "step": 1810 }, { "epoch": 0.03813775711660867, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6118, "step": 1811 }, { "epoch": 0.038158816065872396, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6038, "step": 1812 }, { "epoch": 0.03817987501513612, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.5791, "step": 1813 }, { "epoch": 0.038200933964399844, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6181, "step": 1814 }, { "epoch": 0.03822199291366357, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5981, "step": 1815 }, { "epoch": 0.0382430518629273, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6115, "step": 1816 }, { "epoch": 0.03826411081219103, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6006, "step": 1817 }, { "epoch": 0.038285169761454754, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6372, "step": 1818 }, { "epoch": 0.03830622871071848, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6259, "step": 1819 }, { "epoch": 0.0383272876599822, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.5941, "step": 1820 }, { "epoch": 0.03834834660924593, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6278, "step": 1821 }, { "epoch": 0.03836940555850966, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5952, "step": 1822 }, { "epoch": 0.038390464507773385, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5975, "step": 1823 }, { "epoch": 0.03841152345703711, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5658, "step": 1824 }, { "epoch": 0.03843258240630084, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6069, "step": 1825 }, { "epoch": 0.03845364135556456, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6112, "step": 1826 }, { "epoch": 0.03847470030482829, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6003, "step": 1827 }, { "epoch": 0.038495759254092016, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6071, "step": 1828 }, { "epoch": 0.03851681820335574, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5912, "step": 1829 }, { "epoch": 0.03853787715261947, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5888, "step": 1830 }, { "epoch": 0.0385589361018832, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5842, "step": 1831 }, { "epoch": 0.038579995051146926, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6217, "step": 1832 }, { "epoch": 0.038601054000410646, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.574, "step": 1833 }, { "epoch": 0.038622112949674374, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6071, "step": 1834 }, { "epoch": 0.0386431718989381, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5787, "step": 1835 }, { "epoch": 0.03866423084820183, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6114, "step": 1836 }, { "epoch": 0.03868528979746556, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5771, "step": 1837 }, { "epoch": 0.038706348746729284, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5815, "step": 1838 }, { "epoch": 0.03872740769599301, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5942, "step": 1839 }, { "epoch": 0.03874846664525673, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5875, "step": 1840 }, { "epoch": 0.03876952559452046, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5964, "step": 1841 }, { "epoch": 0.03879058454378419, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5991, "step": 1842 }, { "epoch": 0.038811643493047915, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.582, "step": 1843 }, { "epoch": 0.03883270244231164, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5934, "step": 1844 }, { "epoch": 0.03885376139157537, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6071, "step": 1845 }, { "epoch": 0.03887482034083909, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.606, "step": 1846 }, { "epoch": 0.03889587929010282, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5832, "step": 1847 }, { "epoch": 0.038916938239366546, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6305, "step": 1848 }, { "epoch": 0.03893799718863027, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5599, "step": 1849 }, { "epoch": 0.038959056137894, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5752, "step": 1850 }, { "epoch": 0.03898011508715773, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5825, "step": 1851 }, { "epoch": 0.039001174036421456, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6291, "step": 1852 }, { "epoch": 0.039022232985685176, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6348, "step": 1853 }, { "epoch": 0.039043291934948904, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5951, "step": 1854 }, { "epoch": 0.03906435088421263, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6028, "step": 1855 }, { "epoch": 0.03908540983347636, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6309, "step": 1856 }, { "epoch": 0.039106468782740086, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5804, "step": 1857 }, { "epoch": 0.039127527732003814, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6422, "step": 1858 }, { "epoch": 0.03914858668126754, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6407, "step": 1859 }, { "epoch": 0.03916964563053126, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5876, "step": 1860 }, { "epoch": 0.03919070457979499, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6223, "step": 1861 }, { "epoch": 0.03921176352905872, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5936, "step": 1862 }, { "epoch": 0.039232822478322445, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5957, "step": 1863 }, { "epoch": 0.03925388142758617, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.5882, "step": 1864 }, { "epoch": 0.0392749403768499, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6099, "step": 1865 }, { "epoch": 0.03929599932611362, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5659, "step": 1866 }, { "epoch": 0.03931705827537735, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6221, "step": 1867 }, { "epoch": 0.039338117224641075, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5818, "step": 1868 }, { "epoch": 0.0393591761739048, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6202, "step": 1869 }, { "epoch": 0.03938023512316853, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.58, "step": 1870 }, { "epoch": 0.03940129407243226, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5926, "step": 1871 }, { "epoch": 0.039422353021695986, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6076, "step": 1872 }, { "epoch": 0.039443411970959706, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5995, "step": 1873 }, { "epoch": 0.039464470920223434, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6201, "step": 1874 }, { "epoch": 0.03948552986948716, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6056, "step": 1875 }, { "epoch": 0.03950658881875089, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5909, "step": 1876 }, { "epoch": 0.039527647768014616, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6122, "step": 1877 }, { "epoch": 0.039548706717278344, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5896, "step": 1878 }, { "epoch": 0.03956976566654207, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6212, "step": 1879 }, { "epoch": 0.03959082461580579, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5957, "step": 1880 }, { "epoch": 0.03961188356506952, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5789, "step": 1881 }, { "epoch": 0.03963294251433325, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6168, "step": 1882 }, { "epoch": 0.039654001463596975, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5993, "step": 1883 }, { "epoch": 0.0396750604128607, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5937, "step": 1884 }, { "epoch": 0.03969611936212443, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5667, "step": 1885 }, { "epoch": 0.03971717831138815, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6094, "step": 1886 }, { "epoch": 0.03973823726065188, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6117, "step": 1887 }, { "epoch": 0.039759296209915605, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5944, "step": 1888 }, { "epoch": 0.03978035515917933, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6253, "step": 1889 }, { "epoch": 0.03980141410844306, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.6042, "step": 1890 }, { "epoch": 0.03982247305770679, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6088, "step": 1891 }, { "epoch": 0.039843532006970515, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6036, "step": 1892 }, { "epoch": 0.039864590956234236, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5838, "step": 1893 }, { "epoch": 0.039885649905497964, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6039, "step": 1894 }, { "epoch": 0.03990670885476169, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6171, "step": 1895 }, { "epoch": 0.03992776780402542, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5895, "step": 1896 }, { "epoch": 0.039948826753289146, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6371, "step": 1897 }, { "epoch": 0.039969885702552874, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6087, "step": 1898 }, { "epoch": 0.039990944651816594, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5916, "step": 1899 }, { "epoch": 0.04001200360108032, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5966, "step": 1900 }, { "epoch": 0.04003306255034405, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6115, "step": 1901 }, { "epoch": 0.04005412149960778, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6112, "step": 1902 }, { "epoch": 0.040075180448871504, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5854, "step": 1903 }, { "epoch": 0.04009623939813523, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5728, "step": 1904 }, { "epoch": 0.04011729834739896, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6113, "step": 1905 }, { "epoch": 0.04013835729666268, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.6155, "step": 1906 }, { "epoch": 0.04015941624592641, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6017, "step": 1907 }, { "epoch": 0.040180475195190135, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5607, "step": 1908 }, { "epoch": 0.04020153414445386, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5974, "step": 1909 }, { "epoch": 0.04022259309371759, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5876, "step": 1910 }, { "epoch": 0.04024365204298132, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6018, "step": 1911 }, { "epoch": 0.040264710992245045, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6007, "step": 1912 }, { "epoch": 0.040285769941508766, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.593, "step": 1913 }, { "epoch": 0.04030682889077249, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6258, "step": 1914 }, { "epoch": 0.04032788784003622, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5898, "step": 1915 }, { "epoch": 0.04034894678929995, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.607, "step": 1916 }, { "epoch": 0.040370005738563676, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6196, "step": 1917 }, { "epoch": 0.040391064687827404, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6182, "step": 1918 }, { "epoch": 0.040412123637091124, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6237, "step": 1919 }, { "epoch": 0.04043318258635485, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6046, "step": 1920 }, { "epoch": 0.04045424153561858, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.565, "step": 1921 }, { "epoch": 0.04047530048488231, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5871, "step": 1922 }, { "epoch": 0.040496359434146034, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6143, "step": 1923 }, { "epoch": 0.04051741838340976, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5711, "step": 1924 }, { "epoch": 0.04053847733267349, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6167, "step": 1925 }, { "epoch": 0.04055953628193721, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5939, "step": 1926 }, { "epoch": 0.04058059523120094, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5911, "step": 1927 }, { "epoch": 0.040601654180464665, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6079, "step": 1928 }, { "epoch": 0.04062271312972839, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5844, "step": 1929 }, { "epoch": 0.04064377207899212, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6206, "step": 1930 }, { "epoch": 0.04066483102825585, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5936, "step": 1931 }, { "epoch": 0.040685889977519575, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5745, "step": 1932 }, { "epoch": 0.040706948926783296, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5739, "step": 1933 }, { "epoch": 0.04072800787604702, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5967, "step": 1934 }, { "epoch": 0.04074906682531075, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5997, "step": 1935 }, { "epoch": 0.04077012577457448, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.576, "step": 1936 }, { "epoch": 0.040791184723838206, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6067, "step": 1937 }, { "epoch": 0.04081224367310193, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5923, "step": 1938 }, { "epoch": 0.040833302622365654, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5872, "step": 1939 }, { "epoch": 0.04085436157162938, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5733, "step": 1940 }, { "epoch": 0.04087542052089311, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6116, "step": 1941 }, { "epoch": 0.04089647947015684, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6228, "step": 1942 }, { "epoch": 0.040917538419420564, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5874, "step": 1943 }, { "epoch": 0.04093859736868429, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6016, "step": 1944 }, { "epoch": 0.04095965631794802, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.587, "step": 1945 }, { "epoch": 0.04098071526721174, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5799, "step": 1946 }, { "epoch": 0.04100177421647547, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5829, "step": 1947 }, { "epoch": 0.041022833165739195, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6216, "step": 1948 }, { "epoch": 0.04104389211500292, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6079, "step": 1949 }, { "epoch": 0.04106495106426665, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5933, "step": 1950 }, { "epoch": 0.04108601001353038, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5821, "step": 1951 }, { "epoch": 0.0411070689627941, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.584, "step": 1952 }, { "epoch": 0.041128127912057826, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5867, "step": 1953 }, { "epoch": 0.04114918686132155, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.617, "step": 1954 }, { "epoch": 0.04117024581058528, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6158, "step": 1955 }, { "epoch": 0.04119130475984901, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5998, "step": 1956 }, { "epoch": 0.041212363709112736, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6092, "step": 1957 }, { "epoch": 0.04123342265837646, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6213, "step": 1958 }, { "epoch": 0.041254481607640184, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5695, "step": 1959 }, { "epoch": 0.04127554055690391, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.6084, "step": 1960 }, { "epoch": 0.04129659950616764, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5859, "step": 1961 }, { "epoch": 0.041317658455431366, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6044, "step": 1962 }, { "epoch": 0.041338717404695094, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6034, "step": 1963 }, { "epoch": 0.04135977635395882, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5724, "step": 1964 }, { "epoch": 0.04138083530322255, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5941, "step": 1965 }, { "epoch": 0.04140189425248627, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5908, "step": 1966 }, { "epoch": 0.04142295320175, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6162, "step": 1967 }, { "epoch": 0.041444012151013725, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6222, "step": 1968 }, { "epoch": 0.04146507110027745, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6013, "step": 1969 }, { "epoch": 0.04148613004954118, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5867, "step": 1970 }, { "epoch": 0.04150718899880491, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6077, "step": 1971 }, { "epoch": 0.04152824794806863, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5653, "step": 1972 }, { "epoch": 0.041549306897332355, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6137, "step": 1973 }, { "epoch": 0.04157036584659608, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6051, "step": 1974 }, { "epoch": 0.04159142479585981, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5789, "step": 1975 }, { "epoch": 0.04161248374512354, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6153, "step": 1976 }, { "epoch": 0.041633542694387266, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5798, "step": 1977 }, { "epoch": 0.04165460164365099, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6105, "step": 1978 }, { "epoch": 0.041675660592914714, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5493, "step": 1979 }, { "epoch": 0.04169671954217844, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5896, "step": 1980 }, { "epoch": 0.04171777849144217, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5841, "step": 1981 }, { "epoch": 0.041738837440705896, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5943, "step": 1982 }, { "epoch": 0.041759896389969624, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.612, "step": 1983 }, { "epoch": 0.04178095533923335, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6343, "step": 1984 }, { "epoch": 0.04180201428849708, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6086, "step": 1985 }, { "epoch": 0.0418230732377608, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6427, "step": 1986 }, { "epoch": 0.04184413218702453, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5919, "step": 1987 }, { "epoch": 0.041865191136288255, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.601, "step": 1988 }, { "epoch": 0.04188625008555198, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5932, "step": 1989 }, { "epoch": 0.04190730903481571, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5902, "step": 1990 }, { "epoch": 0.04192836798407944, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.6053, "step": 1991 }, { "epoch": 0.04194942693334316, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5728, "step": 1992 }, { "epoch": 0.041970485882606885, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5954, "step": 1993 }, { "epoch": 0.04199154483187061, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5832, "step": 1994 }, { "epoch": 0.04201260378113434, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5696, "step": 1995 }, { "epoch": 0.04203366273039807, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6157, "step": 1996 }, { "epoch": 0.042054721679661795, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5868, "step": 1997 }, { "epoch": 0.04207578062892552, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6228, "step": 1998 }, { "epoch": 0.042096839578189243, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5957, "step": 1999 }, { "epoch": 0.04211789852745297, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5743, "step": 2000 }, { "epoch": 0.04211789852745297, "eval_loss": 1.7890796661376953, "eval_runtime": 898.0231, "eval_samples_per_second": 68.818, "eval_steps_per_second": 2.151, "step": 2000 }, { "epoch": 0.0421389574767167, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5621, "step": 2001 }, { "epoch": 0.042160016425980426, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5761, "step": 2002 }, { "epoch": 0.042181075375244154, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6679, "step": 2003 }, { "epoch": 0.04220213432450788, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5733, "step": 2004 }, { "epoch": 0.0422231932737716, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6052, "step": 2005 }, { "epoch": 0.04224425222303533, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5922, "step": 2006 }, { "epoch": 0.04226531117229906, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6025, "step": 2007 }, { "epoch": 0.042286370121562784, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5753, "step": 2008 }, { "epoch": 0.04230742907082651, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.574, "step": 2009 }, { "epoch": 0.04232848802009024, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6063, "step": 2010 }, { "epoch": 0.04234954696935397, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5711, "step": 2011 }, { "epoch": 0.04237060591861769, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5817, "step": 2012 }, { "epoch": 0.042391664867881415, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5955, "step": 2013 }, { "epoch": 0.04241272381714514, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.59, "step": 2014 }, { "epoch": 0.04243378276640887, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6189, "step": 2015 }, { "epoch": 0.0424548417156726, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5851, "step": 2016 }, { "epoch": 0.042475900664936325, "grad_norm": 0.134765625, "learning_rate": 0.0002, "loss": 1.6274, "step": 2017 }, { "epoch": 0.04249695961420005, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6306, "step": 2018 }, { "epoch": 0.04251801856346377, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5685, "step": 2019 }, { "epoch": 0.0425390775127275, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6, "step": 2020 }, { "epoch": 0.04256013646199123, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5981, "step": 2021 }, { "epoch": 0.042581195411254956, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6069, "step": 2022 }, { "epoch": 0.042602254360518683, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5855, "step": 2023 }, { "epoch": 0.04262331330978241, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6244, "step": 2024 }, { "epoch": 0.04264437225904613, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.611, "step": 2025 }, { "epoch": 0.04266543120830986, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5772, "step": 2026 }, { "epoch": 0.04268649015757359, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5933, "step": 2027 }, { "epoch": 0.042707549106837314, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5734, "step": 2028 }, { "epoch": 0.04272860805610104, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5894, "step": 2029 }, { "epoch": 0.04274966700536477, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5752, "step": 2030 }, { "epoch": 0.0427707259546285, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6598, "step": 2031 }, { "epoch": 0.04279178490389222, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5786, "step": 2032 }, { "epoch": 0.042812843853155945, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5795, "step": 2033 }, { "epoch": 0.04283390280241967, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6147, "step": 2034 }, { "epoch": 0.0428549617516834, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6133, "step": 2035 }, { "epoch": 0.04287602070094713, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5375, "step": 2036 }, { "epoch": 0.042897079650210855, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5853, "step": 2037 }, { "epoch": 0.04291813859947458, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.5774, "step": 2038 }, { "epoch": 0.0429391975487383, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.626, "step": 2039 }, { "epoch": 0.04296025649800203, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.595, "step": 2040 }, { "epoch": 0.04298131544726576, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5977, "step": 2041 }, { "epoch": 0.043002374396529486, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5909, "step": 2042 }, { "epoch": 0.04302343334579321, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6905, "step": 2043 }, { "epoch": 0.04304449229505694, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5851, "step": 2044 }, { "epoch": 0.04306555124432066, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6042, "step": 2045 }, { "epoch": 0.04308661019358439, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5979, "step": 2046 }, { "epoch": 0.043107669142848117, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5956, "step": 2047 }, { "epoch": 0.043128728092111844, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6312, "step": 2048 }, { "epoch": 0.04314978704137557, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.561, "step": 2049 }, { "epoch": 0.0431708459906393, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5861, "step": 2050 }, { "epoch": 0.04319190493990303, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5792, "step": 2051 }, { "epoch": 0.04321296388916675, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5846, "step": 2052 }, { "epoch": 0.043234022838430475, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5737, "step": 2053 }, { "epoch": 0.0432550817876942, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6068, "step": 2054 }, { "epoch": 0.04327614073695793, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5881, "step": 2055 }, { "epoch": 0.04329719968622166, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6127, "step": 2056 }, { "epoch": 0.043318258635485385, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5723, "step": 2057 }, { "epoch": 0.043339317584749106, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6329, "step": 2058 }, { "epoch": 0.04336037653401283, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6023, "step": 2059 }, { "epoch": 0.04338143548327656, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5703, "step": 2060 }, { "epoch": 0.04340249443254029, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.552, "step": 2061 }, { "epoch": 0.043423553381804016, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6157, "step": 2062 }, { "epoch": 0.04344461233106774, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6175, "step": 2063 }, { "epoch": 0.04346567128033147, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.5873, "step": 2064 }, { "epoch": 0.04348673022959519, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5981, "step": 2065 }, { "epoch": 0.04350778917885892, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.618, "step": 2066 }, { "epoch": 0.043528848128122646, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5708, "step": 2067 }, { "epoch": 0.043549907077386374, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.59, "step": 2068 }, { "epoch": 0.0435709660266501, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6023, "step": 2069 }, { "epoch": 0.04359202497591383, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5976, "step": 2070 }, { "epoch": 0.043613083925177556, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5735, "step": 2071 }, { "epoch": 0.04363414287444128, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6009, "step": 2072 }, { "epoch": 0.043655201823705005, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6065, "step": 2073 }, { "epoch": 0.04367626077296873, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6232, "step": 2074 }, { "epoch": 0.04369731972223246, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5797, "step": 2075 }, { "epoch": 0.04371837867149619, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5828, "step": 2076 }, { "epoch": 0.043739437620759915, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5933, "step": 2077 }, { "epoch": 0.043760496570023635, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5795, "step": 2078 }, { "epoch": 0.04378155551928736, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6197, "step": 2079 }, { "epoch": 0.04380261446855109, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6049, "step": 2080 }, { "epoch": 0.04382367341781482, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6032, "step": 2081 }, { "epoch": 0.043844732367078545, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6002, "step": 2082 }, { "epoch": 0.04386579131634227, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6031, "step": 2083 }, { "epoch": 0.043886850265606, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5946, "step": 2084 }, { "epoch": 0.04390790921486972, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5992, "step": 2085 }, { "epoch": 0.04392896816413345, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5925, "step": 2086 }, { "epoch": 0.043950027113397176, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5633, "step": 2087 }, { "epoch": 0.043971086062660904, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5859, "step": 2088 }, { "epoch": 0.04399214501192463, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5983, "step": 2089 }, { "epoch": 0.04401320396118836, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6131, "step": 2090 }, { "epoch": 0.044034262910452086, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5873, "step": 2091 }, { "epoch": 0.04405532185971581, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6104, "step": 2092 }, { "epoch": 0.044076380808979534, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6049, "step": 2093 }, { "epoch": 0.04409743975824326, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6067, "step": 2094 }, { "epoch": 0.04411849870750699, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5931, "step": 2095 }, { "epoch": 0.04413955765677072, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5595, "step": 2096 }, { "epoch": 0.044160616606034445, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5983, "step": 2097 }, { "epoch": 0.044181675555298165, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5902, "step": 2098 }, { "epoch": 0.04420273450456189, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5605, "step": 2099 }, { "epoch": 0.04422379345382562, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6205, "step": 2100 }, { "epoch": 0.04424485240308935, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5896, "step": 2101 }, { "epoch": 0.044265911352353075, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5778, "step": 2102 }, { "epoch": 0.0442869703016168, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5794, "step": 2103 }, { "epoch": 0.04430802925088053, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5911, "step": 2104 }, { "epoch": 0.04432908820014425, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5852, "step": 2105 }, { "epoch": 0.04435014714940798, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5999, "step": 2106 }, { "epoch": 0.044371206098671706, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.615, "step": 2107 }, { "epoch": 0.044392265047935434, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.6074, "step": 2108 }, { "epoch": 0.04441332399719916, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5793, "step": 2109 }, { "epoch": 0.04443438294646289, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.575, "step": 2110 }, { "epoch": 0.04445544189572661, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6551, "step": 2111 }, { "epoch": 0.04447650084499034, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6081, "step": 2112 }, { "epoch": 0.044497559794254064, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6188, "step": 2113 }, { "epoch": 0.04451861874351779, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6099, "step": 2114 }, { "epoch": 0.04453967769278152, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5805, "step": 2115 }, { "epoch": 0.04456073664204525, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6216, "step": 2116 }, { "epoch": 0.044581795591308974, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5917, "step": 2117 }, { "epoch": 0.044602854540572695, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.593, "step": 2118 }, { "epoch": 0.04462391348983642, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.6089, "step": 2119 }, { "epoch": 0.04464497243910015, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.627, "step": 2120 }, { "epoch": 0.04466603138836388, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6215, "step": 2121 }, { "epoch": 0.044687090337627605, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6516, "step": 2122 }, { "epoch": 0.04470814928689133, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6005, "step": 2123 }, { "epoch": 0.04472920823615506, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6134, "step": 2124 }, { "epoch": 0.04475026718541878, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6151, "step": 2125 }, { "epoch": 0.04477132613468251, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6272, "step": 2126 }, { "epoch": 0.044792385083946236, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5782, "step": 2127 }, { "epoch": 0.04481344403320996, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5806, "step": 2128 }, { "epoch": 0.04483450298247369, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5827, "step": 2129 }, { "epoch": 0.04485556193173742, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.61, "step": 2130 }, { "epoch": 0.04487662088100114, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6023, "step": 2131 }, { "epoch": 0.04489767983026487, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6042, "step": 2132 }, { "epoch": 0.044918738779528594, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5673, "step": 2133 }, { "epoch": 0.04493979772879232, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5994, "step": 2134 }, { "epoch": 0.04496085667805605, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5844, "step": 2135 }, { "epoch": 0.04498191562731978, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.579, "step": 2136 }, { "epoch": 0.045002974576583504, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6008, "step": 2137 }, { "epoch": 0.045024033525847225, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.603, "step": 2138 }, { "epoch": 0.04504509247511095, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5966, "step": 2139 }, { "epoch": 0.04506615142437468, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6106, "step": 2140 }, { "epoch": 0.04508721037363841, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5794, "step": 2141 }, { "epoch": 0.045108269322902135, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5868, "step": 2142 }, { "epoch": 0.04512932827216586, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6446, "step": 2143 }, { "epoch": 0.04515038722142959, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6136, "step": 2144 }, { "epoch": 0.04517144617069331, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.6259, "step": 2145 }, { "epoch": 0.04519250511995704, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6116, "step": 2146 }, { "epoch": 0.045213564069220766, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5756, "step": 2147 }, { "epoch": 0.04523462301848449, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6134, "step": 2148 }, { "epoch": 0.04525568196774822, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5883, "step": 2149 }, { "epoch": 0.04527674091701195, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5584, "step": 2150 }, { "epoch": 0.04529779986627567, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5829, "step": 2151 }, { "epoch": 0.045318858815539396, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5994, "step": 2152 }, { "epoch": 0.045339917764803124, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6224, "step": 2153 }, { "epoch": 0.04536097671406685, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5991, "step": 2154 }, { "epoch": 0.04538203566333058, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6016, "step": 2155 }, { "epoch": 0.04540309461259431, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5804, "step": 2156 }, { "epoch": 0.045424153561858034, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5816, "step": 2157 }, { "epoch": 0.045445212511121755, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6024, "step": 2158 }, { "epoch": 0.04546627146038548, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5725, "step": 2159 }, { "epoch": 0.04548733040964921, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5904, "step": 2160 }, { "epoch": 0.04550838935891294, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5804, "step": 2161 }, { "epoch": 0.045529448308176665, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5867, "step": 2162 }, { "epoch": 0.04555050725744039, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.6093, "step": 2163 }, { "epoch": 0.04557156620670412, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5837, "step": 2164 }, { "epoch": 0.04559262515596784, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5825, "step": 2165 }, { "epoch": 0.04561368410523157, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6057, "step": 2166 }, { "epoch": 0.045634743054495296, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.559, "step": 2167 }, { "epoch": 0.04565580200375902, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5964, "step": 2168 }, { "epoch": 0.04567686095302275, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6379, "step": 2169 }, { "epoch": 0.04569791990228648, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6101, "step": 2170 }, { "epoch": 0.0457189788515502, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5986, "step": 2171 }, { "epoch": 0.045740037800813926, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5735, "step": 2172 }, { "epoch": 0.045761096750077654, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5861, "step": 2173 }, { "epoch": 0.04578215569934138, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5571, "step": 2174 }, { "epoch": 0.04580321464860511, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6198, "step": 2175 }, { "epoch": 0.045824273597868836, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5953, "step": 2176 }, { "epoch": 0.045845332547132564, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6039, "step": 2177 }, { "epoch": 0.045866391496396285, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6041, "step": 2178 }, { "epoch": 0.04588745044566001, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6119, "step": 2179 }, { "epoch": 0.04590850939492374, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5567, "step": 2180 }, { "epoch": 0.04592956834418747, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5945, "step": 2181 }, { "epoch": 0.045950627293451195, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5728, "step": 2182 }, { "epoch": 0.04597168624271492, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6189, "step": 2183 }, { "epoch": 0.04599274519197864, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6273, "step": 2184 }, { "epoch": 0.04601380414124237, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5951, "step": 2185 }, { "epoch": 0.0460348630905061, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5727, "step": 2186 }, { "epoch": 0.046055922039769825, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5695, "step": 2187 }, { "epoch": 0.04607698098903355, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5949, "step": 2188 }, { "epoch": 0.04609803993829728, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5592, "step": 2189 }, { "epoch": 0.04611909888756101, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.5942, "step": 2190 }, { "epoch": 0.04614015783682473, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6061, "step": 2191 }, { "epoch": 0.046161216786088456, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.5963, "step": 2192 }, { "epoch": 0.046182275735352184, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5956, "step": 2193 }, { "epoch": 0.04620333468461591, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6178, "step": 2194 }, { "epoch": 0.04622439363387964, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5679, "step": 2195 }, { "epoch": 0.046245452583143366, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5601, "step": 2196 }, { "epoch": 0.046266511532407094, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.584, "step": 2197 }, { "epoch": 0.046287570481670814, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5911, "step": 2198 }, { "epoch": 0.04630862943093454, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5917, "step": 2199 }, { "epoch": 0.04632968838019827, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6175, "step": 2200 }, { "epoch": 0.046350747329462, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6204, "step": 2201 }, { "epoch": 0.046371806278725725, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6096, "step": 2202 }, { "epoch": 0.04639286522798945, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6021, "step": 2203 }, { "epoch": 0.04641392417725317, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.6191, "step": 2204 }, { "epoch": 0.0464349831265169, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5732, "step": 2205 }, { "epoch": 0.04645604207578063, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6043, "step": 2206 }, { "epoch": 0.046477101025044355, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6177, "step": 2207 }, { "epoch": 0.04649815997430808, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6073, "step": 2208 }, { "epoch": 0.04651921892357181, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5933, "step": 2209 }, { "epoch": 0.04654027787283554, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.5746, "step": 2210 }, { "epoch": 0.04656133682209926, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5902, "step": 2211 }, { "epoch": 0.046582395771362986, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6061, "step": 2212 }, { "epoch": 0.046603454720626714, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5973, "step": 2213 }, { "epoch": 0.04662451366989044, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6374, "step": 2214 }, { "epoch": 0.04664557261915417, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5846, "step": 2215 }, { "epoch": 0.046666631568417896, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5722, "step": 2216 }, { "epoch": 0.046687690517681624, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.6209, "step": 2217 }, { "epoch": 0.046708749466945344, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5989, "step": 2218 }, { "epoch": 0.04672980841620907, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5628, "step": 2219 }, { "epoch": 0.0467508673654728, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.628, "step": 2220 }, { "epoch": 0.04677192631473653, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.579, "step": 2221 }, { "epoch": 0.046792985264000254, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5852, "step": 2222 }, { "epoch": 0.04681404421326398, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5906, "step": 2223 }, { "epoch": 0.0468351031625277, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5768, "step": 2224 }, { "epoch": 0.04685616211179143, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5892, "step": 2225 }, { "epoch": 0.04687722106105516, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5862, "step": 2226 }, { "epoch": 0.046898280010318885, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.6256, "step": 2227 }, { "epoch": 0.04691933895958261, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5428, "step": 2228 }, { "epoch": 0.04694039790884634, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5973, "step": 2229 }, { "epoch": 0.04696145685811007, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6115, "step": 2230 }, { "epoch": 0.04698251580737379, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5907, "step": 2231 }, { "epoch": 0.047003574756637516, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5975, "step": 2232 }, { "epoch": 0.04702463370590124, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5617, "step": 2233 }, { "epoch": 0.04704569265516497, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5845, "step": 2234 }, { "epoch": 0.0470667516044287, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5938, "step": 2235 }, { "epoch": 0.047087810553692426, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.617, "step": 2236 }, { "epoch": 0.04710886950295615, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5549, "step": 2237 }, { "epoch": 0.047129928452219874, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5941, "step": 2238 }, { "epoch": 0.0471509874014836, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6139, "step": 2239 }, { "epoch": 0.04717204635074733, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5802, "step": 2240 }, { "epoch": 0.04719310530001106, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5661, "step": 2241 }, { "epoch": 0.047214164249274784, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5734, "step": 2242 }, { "epoch": 0.04723522319853851, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5963, "step": 2243 }, { "epoch": 0.04725628214780223, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6141, "step": 2244 }, { "epoch": 0.04727734109706596, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5899, "step": 2245 }, { "epoch": 0.04729840004632969, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5898, "step": 2246 }, { "epoch": 0.047319458995593415, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6051, "step": 2247 }, { "epoch": 0.04734051794485714, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5814, "step": 2248 }, { "epoch": 0.04736157689412087, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.604, "step": 2249 }, { "epoch": 0.0473826358433846, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5657, "step": 2250 }, { "epoch": 0.04740369479264832, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.643, "step": 2251 }, { "epoch": 0.047424753741912046, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5994, "step": 2252 }, { "epoch": 0.04744581269117577, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5882, "step": 2253 }, { "epoch": 0.0474668716404395, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.605, "step": 2254 }, { "epoch": 0.04748793058970323, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.593, "step": 2255 }, { "epoch": 0.047508989538966956, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6179, "step": 2256 }, { "epoch": 0.047530048488230676, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6007, "step": 2257 }, { "epoch": 0.047551107437494404, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.587, "step": 2258 }, { "epoch": 0.04757216638675813, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6399, "step": 2259 }, { "epoch": 0.04759322533602186, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5978, "step": 2260 }, { "epoch": 0.04761428428528559, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5981, "step": 2261 }, { "epoch": 0.047635343234549314, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6017, "step": 2262 }, { "epoch": 0.04765640218381304, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5893, "step": 2263 }, { "epoch": 0.04767746113307676, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.597, "step": 2264 }, { "epoch": 0.04769852008234049, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.6333, "step": 2265 }, { "epoch": 0.04771957903160422, "grad_norm": 0.2158203125, "learning_rate": 0.0002, "loss": 1.6663, "step": 2266 }, { "epoch": 0.047740637980867945, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.616, "step": 2267 }, { "epoch": 0.04776169693013167, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5909, "step": 2268 }, { "epoch": 0.0477827558793954, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5946, "step": 2269 }, { "epoch": 0.04780381482865913, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5683, "step": 2270 }, { "epoch": 0.04782487377792285, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5998, "step": 2271 }, { "epoch": 0.047845932727186576, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5991, "step": 2272 }, { "epoch": 0.0478669916764503, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5833, "step": 2273 }, { "epoch": 0.04788805062571403, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6072, "step": 2274 }, { "epoch": 0.04790910957497776, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6102, "step": 2275 }, { "epoch": 0.047930168524241486, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6282, "step": 2276 }, { "epoch": 0.047951227473505206, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6009, "step": 2277 }, { "epoch": 0.047972286422768934, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6353, "step": 2278 }, { "epoch": 0.04799334537203266, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6021, "step": 2279 }, { "epoch": 0.04801440432129639, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.603, "step": 2280 }, { "epoch": 0.048035463270560116, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5933, "step": 2281 }, { "epoch": 0.048056522219823844, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6063, "step": 2282 }, { "epoch": 0.04807758116908757, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6145, "step": 2283 }, { "epoch": 0.04809864011835129, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5623, "step": 2284 }, { "epoch": 0.04811969906761502, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5876, "step": 2285 }, { "epoch": 0.04814075801687875, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6138, "step": 2286 }, { "epoch": 0.048161816966142475, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6106, "step": 2287 }, { "epoch": 0.0481828759154062, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5788, "step": 2288 }, { "epoch": 0.04820393486466993, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.624, "step": 2289 }, { "epoch": 0.04822499381393365, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5955, "step": 2290 }, { "epoch": 0.04824605276319738, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5946, "step": 2291 }, { "epoch": 0.048267111712461105, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6034, "step": 2292 }, { "epoch": 0.04828817066172483, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6028, "step": 2293 }, { "epoch": 0.04830922961098856, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5989, "step": 2294 }, { "epoch": 0.04833028856025229, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.5621, "step": 2295 }, { "epoch": 0.048351347509516016, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6268, "step": 2296 }, { "epoch": 0.048372406458779736, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5719, "step": 2297 }, { "epoch": 0.048393465408043464, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6334, "step": 2298 }, { "epoch": 0.04841452435730719, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5648, "step": 2299 }, { "epoch": 0.04843558330657092, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5861, "step": 2300 }, { "epoch": 0.048456642255834646, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5743, "step": 2301 }, { "epoch": 0.048477701205098374, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5762, "step": 2302 }, { "epoch": 0.0484987601543621, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6071, "step": 2303 }, { "epoch": 0.04851981910362582, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6033, "step": 2304 }, { "epoch": 0.04854087805288955, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5778, "step": 2305 }, { "epoch": 0.04856193700215328, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5825, "step": 2306 }, { "epoch": 0.048582995951417005, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6034, "step": 2307 }, { "epoch": 0.04860405490068073, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5886, "step": 2308 }, { "epoch": 0.04862511384994446, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5816, "step": 2309 }, { "epoch": 0.04864617279920818, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5981, "step": 2310 }, { "epoch": 0.04866723174847191, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5786, "step": 2311 }, { "epoch": 0.048688290697735635, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5972, "step": 2312 }, { "epoch": 0.04870934964699936, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.611, "step": 2313 }, { "epoch": 0.04873040859626309, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5743, "step": 2314 }, { "epoch": 0.04875146754552682, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.606, "step": 2315 }, { "epoch": 0.048772526494790545, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5898, "step": 2316 }, { "epoch": 0.048793585444054266, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5734, "step": 2317 }, { "epoch": 0.048814644393317994, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6076, "step": 2318 }, { "epoch": 0.04883570334258172, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5997, "step": 2319 }, { "epoch": 0.04885676229184545, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6098, "step": 2320 }, { "epoch": 0.048877821241109176, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5645, "step": 2321 }, { "epoch": 0.048898880190372904, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5676, "step": 2322 }, { "epoch": 0.04891993913963663, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5777, "step": 2323 }, { "epoch": 0.04894099808890035, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.598, "step": 2324 }, { "epoch": 0.04896205703816408, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.6044, "step": 2325 }, { "epoch": 0.04898311598742781, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6068, "step": 2326 }, { "epoch": 0.049004174936691534, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5947, "step": 2327 }, { "epoch": 0.04902523388595526, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.5867, "step": 2328 }, { "epoch": 0.04904629283521899, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5654, "step": 2329 }, { "epoch": 0.04906735178448271, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5914, "step": 2330 }, { "epoch": 0.04908841073374644, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.586, "step": 2331 }, { "epoch": 0.049109469683010165, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5908, "step": 2332 }, { "epoch": 0.04913052863227389, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5924, "step": 2333 }, { "epoch": 0.04915158758153762, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6278, "step": 2334 }, { "epoch": 0.04917264653080135, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5881, "step": 2335 }, { "epoch": 0.049193705480065075, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5798, "step": 2336 }, { "epoch": 0.049214764429328796, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5777, "step": 2337 }, { "epoch": 0.04923582337859252, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5701, "step": 2338 }, { "epoch": 0.04925688232785625, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5824, "step": 2339 }, { "epoch": 0.04927794127711998, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5982, "step": 2340 }, { "epoch": 0.049299000226383706, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5909, "step": 2341 }, { "epoch": 0.049320059175647434, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6241, "step": 2342 }, { "epoch": 0.049341118124911154, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5857, "step": 2343 }, { "epoch": 0.04936217707417488, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5957, "step": 2344 }, { "epoch": 0.04938323602343861, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5719, "step": 2345 }, { "epoch": 0.04940429497270234, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5909, "step": 2346 }, { "epoch": 0.049425353921966064, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6035, "step": 2347 }, { "epoch": 0.04944641287122979, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.611, "step": 2348 }, { "epoch": 0.04946747182049352, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5928, "step": 2349 }, { "epoch": 0.04948853076975724, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6077, "step": 2350 }, { "epoch": 0.04950958971902097, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.583, "step": 2351 }, { "epoch": 0.049530648668284695, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5872, "step": 2352 }, { "epoch": 0.04955170761754842, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.61, "step": 2353 }, { "epoch": 0.04957276656681215, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5635, "step": 2354 }, { "epoch": 0.04959382551607588, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6209, "step": 2355 }, { "epoch": 0.049614884465339605, "grad_norm": 0.1337890625, "learning_rate": 0.0002, "loss": 1.5828, "step": 2356 }, { "epoch": 0.049635943414603326, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5779, "step": 2357 }, { "epoch": 0.04965700236386705, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5767, "step": 2358 }, { "epoch": 0.04967806131313078, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5973, "step": 2359 }, { "epoch": 0.04969912026239451, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6, "step": 2360 }, { "epoch": 0.049720179211658236, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5986, "step": 2361 }, { "epoch": 0.04974123816092196, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5897, "step": 2362 }, { "epoch": 0.049762297110185684, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5719, "step": 2363 }, { "epoch": 0.04978335605944941, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5908, "step": 2364 }, { "epoch": 0.04980441500871314, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5645, "step": 2365 }, { "epoch": 0.04982547395797687, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5835, "step": 2366 }, { "epoch": 0.049846532907240594, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6007, "step": 2367 }, { "epoch": 0.04986759185650432, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6236, "step": 2368 }, { "epoch": 0.04988865080576805, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6026, "step": 2369 }, { "epoch": 0.04990970975503177, "grad_norm": 0.1826171875, "learning_rate": 0.0002, "loss": 1.6107, "step": 2370 }, { "epoch": 0.0499307687042955, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5673, "step": 2371 }, { "epoch": 0.049951827653559225, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6112, "step": 2372 }, { "epoch": 0.04997288660282295, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6088, "step": 2373 }, { "epoch": 0.04999394555208668, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6032, "step": 2374 }, { "epoch": 0.05001500450135041, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.59, "step": 2375 }, { "epoch": 0.050036063450614135, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.61, "step": 2376 }, { "epoch": 0.050057122399877856, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5624, "step": 2377 }, { "epoch": 0.05007818134914158, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5492, "step": 2378 }, { "epoch": 0.05009924029840531, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6157, "step": 2379 }, { "epoch": 0.05012029924766904, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.5921, "step": 2380 }, { "epoch": 0.050141358196932766, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5712, "step": 2381 }, { "epoch": 0.05016241714619649, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5943, "step": 2382 }, { "epoch": 0.050183476095460214, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6012, "step": 2383 }, { "epoch": 0.05020453504472394, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5972, "step": 2384 }, { "epoch": 0.05022559399398767, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6081, "step": 2385 }, { "epoch": 0.050246652943251396, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6204, "step": 2386 }, { "epoch": 0.050267711892515124, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5849, "step": 2387 }, { "epoch": 0.05028877084177885, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5819, "step": 2388 }, { "epoch": 0.05030982979104258, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6225, "step": 2389 }, { "epoch": 0.0503308887403063, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.605, "step": 2390 }, { "epoch": 0.05035194768957003, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.6118, "step": 2391 }, { "epoch": 0.050373006638833755, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5658, "step": 2392 }, { "epoch": 0.05039406558809748, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.609, "step": 2393 }, { "epoch": 0.05041512453736121, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6149, "step": 2394 }, { "epoch": 0.05043618348662494, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5982, "step": 2395 }, { "epoch": 0.05045724243588866, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.5839, "step": 2396 }, { "epoch": 0.050478301385152385, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5866, "step": 2397 }, { "epoch": 0.05049936033441611, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6133, "step": 2398 }, { "epoch": 0.05052041928367984, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5834, "step": 2399 }, { "epoch": 0.05054147823294357, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5836, "step": 2400 }, { "epoch": 0.050562537182207296, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5626, "step": 2401 }, { "epoch": 0.05058359613147102, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6125, "step": 2402 }, { "epoch": 0.050604655080734744, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6058, "step": 2403 }, { "epoch": 0.05062571402999847, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5926, "step": 2404 }, { "epoch": 0.0506467729792622, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5939, "step": 2405 }, { "epoch": 0.050667831928525926, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6173, "step": 2406 }, { "epoch": 0.050688890877789654, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5952, "step": 2407 }, { "epoch": 0.05070994982705338, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6027, "step": 2408 }, { "epoch": 0.05073100877631711, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5826, "step": 2409 }, { "epoch": 0.05075206772558083, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6061, "step": 2410 }, { "epoch": 0.05077312667484456, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5703, "step": 2411 }, { "epoch": 0.050794185624108285, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6009, "step": 2412 }, { "epoch": 0.05081524457337201, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5704, "step": 2413 }, { "epoch": 0.05083630352263574, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5692, "step": 2414 }, { "epoch": 0.05085736247189947, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5654, "step": 2415 }, { "epoch": 0.05087842142116319, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.612, "step": 2416 }, { "epoch": 0.050899480370426915, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5802, "step": 2417 }, { "epoch": 0.05092053931969064, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5807, "step": 2418 }, { "epoch": 0.05094159826895437, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5972, "step": 2419 }, { "epoch": 0.0509626572182181, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5843, "step": 2420 }, { "epoch": 0.050983716167481825, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.622, "step": 2421 }, { "epoch": 0.05100477511674555, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5887, "step": 2422 }, { "epoch": 0.051025834066009274, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5709, "step": 2423 }, { "epoch": 0.051046893015273, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6231, "step": 2424 }, { "epoch": 0.05106795196453673, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6185, "step": 2425 }, { "epoch": 0.051089010913800456, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5905, "step": 2426 }, { "epoch": 0.051110069863064184, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5775, "step": 2427 }, { "epoch": 0.05113112881232791, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5758, "step": 2428 }, { "epoch": 0.05115218776159164, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6167, "step": 2429 }, { "epoch": 0.05117324671085536, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5977, "step": 2430 }, { "epoch": 0.05119430566011909, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.5738, "step": 2431 }, { "epoch": 0.051215364609382814, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5768, "step": 2432 }, { "epoch": 0.05123642355864654, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5878, "step": 2433 }, { "epoch": 0.05125748250791027, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5849, "step": 2434 }, { "epoch": 0.051278541457174, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5851, "step": 2435 }, { "epoch": 0.05129960040643772, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5922, "step": 2436 }, { "epoch": 0.051320659355701445, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.5886, "step": 2437 }, { "epoch": 0.05134171830496517, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.594, "step": 2438 }, { "epoch": 0.0513627772542289, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5773, "step": 2439 }, { "epoch": 0.05138383620349263, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5868, "step": 2440 }, { "epoch": 0.051404895152756355, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6003, "step": 2441 }, { "epoch": 0.05142595410202008, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6046, "step": 2442 }, { "epoch": 0.0514470130512838, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.5878, "step": 2443 }, { "epoch": 0.05146807200054753, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6082, "step": 2444 }, { "epoch": 0.05148913094981126, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5925, "step": 2445 }, { "epoch": 0.051510189899074986, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5907, "step": 2446 }, { "epoch": 0.051531248848338713, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5636, "step": 2447 }, { "epoch": 0.05155230779760244, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6255, "step": 2448 }, { "epoch": 0.05157336674686617, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5846, "step": 2449 }, { "epoch": 0.05159442569612989, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6109, "step": 2450 }, { "epoch": 0.05161548464539362, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.5711, "step": 2451 }, { "epoch": 0.051636543594657344, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5943, "step": 2452 }, { "epoch": 0.05165760254392107, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6017, "step": 2453 }, { "epoch": 0.0516786614931848, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.617, "step": 2454 }, { "epoch": 0.05169972044244853, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6096, "step": 2455 }, { "epoch": 0.05172077939171225, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5665, "step": 2456 }, { "epoch": 0.051741838340975975, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5819, "step": 2457 }, { "epoch": 0.0517628972902397, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6206, "step": 2458 }, { "epoch": 0.05178395623950343, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6381, "step": 2459 }, { "epoch": 0.05180501518876716, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5907, "step": 2460 }, { "epoch": 0.051826074138030885, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5836, "step": 2461 }, { "epoch": 0.05184713308729461, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.598, "step": 2462 }, { "epoch": 0.05186819203655833, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.627, "step": 2463 }, { "epoch": 0.05188925098582206, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5617, "step": 2464 }, { "epoch": 0.05191030993508579, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5799, "step": 2465 }, { "epoch": 0.051931368884349516, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5963, "step": 2466 }, { "epoch": 0.05195242783361324, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5858, "step": 2467 }, { "epoch": 0.05197348678287697, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6237, "step": 2468 }, { "epoch": 0.05199454573214069, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5768, "step": 2469 }, { "epoch": 0.05201560468140442, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5735, "step": 2470 }, { "epoch": 0.05203666363066815, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5988, "step": 2471 }, { "epoch": 0.052057722579931874, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5947, "step": 2472 }, { "epoch": 0.0520787815291956, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5686, "step": 2473 }, { "epoch": 0.05209984047845933, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5656, "step": 2474 }, { "epoch": 0.05212089942772306, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6348, "step": 2475 }, { "epoch": 0.05214195837698678, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6371, "step": 2476 }, { "epoch": 0.052163017326250505, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5803, "step": 2477 }, { "epoch": 0.05218407627551423, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6156, "step": 2478 }, { "epoch": 0.05220513522477796, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5873, "step": 2479 }, { "epoch": 0.05222619417404169, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6313, "step": 2480 }, { "epoch": 0.052247253123305415, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5947, "step": 2481 }, { "epoch": 0.05226831207256914, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 1.6069, "step": 2482 }, { "epoch": 0.05228937102183286, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5696, "step": 2483 }, { "epoch": 0.05231042997109659, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5729, "step": 2484 }, { "epoch": 0.05233148892036032, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5934, "step": 2485 }, { "epoch": 0.052352547869624046, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5961, "step": 2486 }, { "epoch": 0.05237360681888777, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5966, "step": 2487 }, { "epoch": 0.0523946657681515, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6011, "step": 2488 }, { "epoch": 0.05241572471741522, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.612, "step": 2489 }, { "epoch": 0.05243678366667895, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5847, "step": 2490 }, { "epoch": 0.052457842615942676, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6032, "step": 2491 }, { "epoch": 0.052478901565206404, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.602, "step": 2492 }, { "epoch": 0.05249996051447013, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6155, "step": 2493 }, { "epoch": 0.05252101946373386, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5852, "step": 2494 }, { "epoch": 0.052542078412997587, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5672, "step": 2495 }, { "epoch": 0.05256313736226131, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5793, "step": 2496 }, { "epoch": 0.052584196311525035, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5786, "step": 2497 }, { "epoch": 0.05260525526078876, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5817, "step": 2498 }, { "epoch": 0.05262631421005249, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5944, "step": 2499 }, { "epoch": 0.05264737315931622, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5953, "step": 2500 }, { "epoch": 0.05264737315931622, "eval_loss": 1.8555322885513306, "eval_runtime": 897.3744, "eval_samples_per_second": 68.868, "eval_steps_per_second": 2.153, "step": 2500 }, { "epoch": 0.052668432108579945, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5904, "step": 2501 }, { "epoch": 0.05268949105784367, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5793, "step": 2502 }, { "epoch": 0.05271055000710739, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5756, "step": 2503 }, { "epoch": 0.05273160895637112, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6115, "step": 2504 }, { "epoch": 0.05275266790563485, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5798, "step": 2505 }, { "epoch": 0.052773726854898576, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6013, "step": 2506 }, { "epoch": 0.0527947858041623, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5795, "step": 2507 }, { "epoch": 0.05281584475342603, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6114, "step": 2508 }, { "epoch": 0.05283690370268975, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5896, "step": 2509 }, { "epoch": 0.05285796265195348, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.597, "step": 2510 }, { "epoch": 0.052879021601217206, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.605, "step": 2511 }, { "epoch": 0.052900080550480934, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5908, "step": 2512 }, { "epoch": 0.05292113949974466, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6022, "step": 2513 }, { "epoch": 0.05294219844900839, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5697, "step": 2514 }, { "epoch": 0.052963257398272116, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6134, "step": 2515 }, { "epoch": 0.05298431634753584, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5768, "step": 2516 }, { "epoch": 0.053005375296799564, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5885, "step": 2517 }, { "epoch": 0.05302643424606329, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.585, "step": 2518 }, { "epoch": 0.05304749319532702, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.583, "step": 2519 }, { "epoch": 0.05306855214459075, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6008, "step": 2520 }, { "epoch": 0.053089611093854475, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6221, "step": 2521 }, { "epoch": 0.053110670043118195, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.592, "step": 2522 }, { "epoch": 0.05313172899238192, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5909, "step": 2523 }, { "epoch": 0.05315278794164565, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5942, "step": 2524 }, { "epoch": 0.05317384689090938, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6052, "step": 2525 }, { "epoch": 0.053194905840173105, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5789, "step": 2526 }, { "epoch": 0.05321596478943683, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5898, "step": 2527 }, { "epoch": 0.05323702373870056, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5964, "step": 2528 }, { "epoch": 0.05325808268796428, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6376, "step": 2529 }, { "epoch": 0.05327914163722801, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.602, "step": 2530 }, { "epoch": 0.053300200586491736, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5657, "step": 2531 }, { "epoch": 0.053321259535755464, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5909, "step": 2532 }, { "epoch": 0.05334231848501919, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5756, "step": 2533 }, { "epoch": 0.05336337743428292, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6181, "step": 2534 }, { "epoch": 0.053384436383546646, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.593, "step": 2535 }, { "epoch": 0.05340549533281037, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.5748, "step": 2536 }, { "epoch": 0.053426554282074094, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6143, "step": 2537 }, { "epoch": 0.05344761323133782, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6145, "step": 2538 }, { "epoch": 0.05346867218060155, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5718, "step": 2539 }, { "epoch": 0.05348973112986528, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6014, "step": 2540 }, { "epoch": 0.053510790079129004, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5941, "step": 2541 }, { "epoch": 0.053531849028392725, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5955, "step": 2542 }, { "epoch": 0.05355290797765645, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.595, "step": 2543 }, { "epoch": 0.05357396692692018, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6078, "step": 2544 }, { "epoch": 0.05359502587618391, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5888, "step": 2545 }, { "epoch": 0.053616084825447635, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6112, "step": 2546 }, { "epoch": 0.05363714377471136, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5546, "step": 2547 }, { "epoch": 0.05365820272397509, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6221, "step": 2548 }, { "epoch": 0.05367926167323881, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5854, "step": 2549 }, { "epoch": 0.05370032062250254, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5973, "step": 2550 }, { "epoch": 0.053721379571766266, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5784, "step": 2551 }, { "epoch": 0.053742438521029993, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6059, "step": 2552 }, { "epoch": 0.05376349747029372, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.5758, "step": 2553 }, { "epoch": 0.05378455641955745, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6042, "step": 2554 }, { "epoch": 0.053805615368821176, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5824, "step": 2555 }, { "epoch": 0.0538266743180849, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5912, "step": 2556 }, { "epoch": 0.053847733267348624, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5921, "step": 2557 }, { "epoch": 0.05386879221661235, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6186, "step": 2558 }, { "epoch": 0.05388985116587608, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5744, "step": 2559 }, { "epoch": 0.05391091011513981, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6282, "step": 2560 }, { "epoch": 0.053931969064403534, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5777, "step": 2561 }, { "epoch": 0.053953028013667255, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5955, "step": 2562 }, { "epoch": 0.05397408696293098, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6096, "step": 2563 }, { "epoch": 0.05399514591219471, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5787, "step": 2564 }, { "epoch": 0.05401620486145844, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5965, "step": 2565 }, { "epoch": 0.054037263810722165, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6102, "step": 2566 }, { "epoch": 0.05405832275998589, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5921, "step": 2567 }, { "epoch": 0.05407938170924962, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5937, "step": 2568 }, { "epoch": 0.05410044065851334, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5991, "step": 2569 }, { "epoch": 0.05412149960777707, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5935, "step": 2570 }, { "epoch": 0.054142558557040796, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5601, "step": 2571 }, { "epoch": 0.05416361750630452, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.585, "step": 2572 }, { "epoch": 0.05418467645556825, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5844, "step": 2573 }, { "epoch": 0.05420573540483198, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5671, "step": 2574 }, { "epoch": 0.0542267943540957, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.597, "step": 2575 }, { "epoch": 0.054247853303359427, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5861, "step": 2576 }, { "epoch": 0.054268912252623154, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6102, "step": 2577 }, { "epoch": 0.05428997120188688, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5528, "step": 2578 }, { "epoch": 0.05431103015115061, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6068, "step": 2579 }, { "epoch": 0.05433208910041434, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6016, "step": 2580 }, { "epoch": 0.054353148049678064, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.607, "step": 2581 }, { "epoch": 0.054374206998941785, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6063, "step": 2582 }, { "epoch": 0.05439526594820551, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6144, "step": 2583 }, { "epoch": 0.05441632489746924, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5942, "step": 2584 }, { "epoch": 0.05443738384673297, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.6068, "step": 2585 }, { "epoch": 0.054458442795996695, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6001, "step": 2586 }, { "epoch": 0.05447950174526042, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5979, "step": 2587 }, { "epoch": 0.05450056069452415, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5743, "step": 2588 }, { "epoch": 0.05452161964378787, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5878, "step": 2589 }, { "epoch": 0.0545426785930516, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6118, "step": 2590 }, { "epoch": 0.054563737542315326, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5729, "step": 2591 }, { "epoch": 0.05458479649157905, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6074, "step": 2592 }, { "epoch": 0.05460585544084278, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5974, "step": 2593 }, { "epoch": 0.05462691439010651, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5633, "step": 2594 }, { "epoch": 0.05464797333937023, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6041, "step": 2595 }, { "epoch": 0.054669032288633956, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5818, "step": 2596 }, { "epoch": 0.054690091237897684, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5868, "step": 2597 }, { "epoch": 0.05471115018716141, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.5836, "step": 2598 }, { "epoch": 0.05473220913642514, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5778, "step": 2599 }, { "epoch": 0.054753268085688866, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5863, "step": 2600 }, { "epoch": 0.054774327034952594, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5708, "step": 2601 }, { "epoch": 0.054795385984216315, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5663, "step": 2602 }, { "epoch": 0.05481644493348004, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5629, "step": 2603 }, { "epoch": 0.05483750388274377, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5663, "step": 2604 }, { "epoch": 0.0548585628320075, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6129, "step": 2605 }, { "epoch": 0.054879621781271225, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5985, "step": 2606 }, { "epoch": 0.05490068073053495, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5558, "step": 2607 }, { "epoch": 0.05492173967979868, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5482, "step": 2608 }, { "epoch": 0.0549427986290624, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5628, "step": 2609 }, { "epoch": 0.05496385757832613, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5968, "step": 2610 }, { "epoch": 0.054984916527589855, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5704, "step": 2611 }, { "epoch": 0.05500597547685358, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6135, "step": 2612 }, { "epoch": 0.05502703442611731, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5626, "step": 2613 }, { "epoch": 0.05504809337538104, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5811, "step": 2614 }, { "epoch": 0.05506915232464476, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5719, "step": 2615 }, { "epoch": 0.055090211273908486, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.554, "step": 2616 }, { "epoch": 0.055111270223172214, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6253, "step": 2617 }, { "epoch": 0.05513232917243594, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.615, "step": 2618 }, { "epoch": 0.05515338812169967, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.624, "step": 2619 }, { "epoch": 0.055174447070963396, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6247, "step": 2620 }, { "epoch": 0.055195506020227124, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5748, "step": 2621 }, { "epoch": 0.055216564969490844, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.591, "step": 2622 }, { "epoch": 0.05523762391875457, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6238, "step": 2623 }, { "epoch": 0.0552586828680183, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5523, "step": 2624 }, { "epoch": 0.05527974181728203, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6159, "step": 2625 }, { "epoch": 0.055300800766545755, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5625, "step": 2626 }, { "epoch": 0.05532185971580948, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5852, "step": 2627 }, { "epoch": 0.0553429186650732, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5943, "step": 2628 }, { "epoch": 0.05536397761433693, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5604, "step": 2629 }, { "epoch": 0.05538503656360066, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6234, "step": 2630 }, { "epoch": 0.055406095512864385, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5855, "step": 2631 }, { "epoch": 0.05542715446212811, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6307, "step": 2632 }, { "epoch": 0.05544821341139184, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5798, "step": 2633 }, { "epoch": 0.05546927236065557, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5895, "step": 2634 }, { "epoch": 0.05549033130991929, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5597, "step": 2635 }, { "epoch": 0.055511390259183016, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6119, "step": 2636 }, { "epoch": 0.055532449208446744, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5777, "step": 2637 }, { "epoch": 0.05555350815771047, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5953, "step": 2638 }, { "epoch": 0.0555745671069742, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6017, "step": 2639 }, { "epoch": 0.055595626056237926, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5907, "step": 2640 }, { "epoch": 0.055616685005501654, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.6218, "step": 2641 }, { "epoch": 0.055637743954765374, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5843, "step": 2642 }, { "epoch": 0.0556588029040291, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6108, "step": 2643 }, { "epoch": 0.05567986185329283, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6051, "step": 2644 }, { "epoch": 0.05570092080255656, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5759, "step": 2645 }, { "epoch": 0.055721979751820284, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5936, "step": 2646 }, { "epoch": 0.05574303870108401, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5875, "step": 2647 }, { "epoch": 0.05576409765034773, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5902, "step": 2648 }, { "epoch": 0.05578515659961146, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5842, "step": 2649 }, { "epoch": 0.05580621554887519, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6006, "step": 2650 }, { "epoch": 0.055827274498138915, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5868, "step": 2651 }, { "epoch": 0.05584833344740264, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.6069, "step": 2652 }, { "epoch": 0.05586939239666637, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5873, "step": 2653 }, { "epoch": 0.0558904513459301, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6088, "step": 2654 }, { "epoch": 0.05591151029519382, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5855, "step": 2655 }, { "epoch": 0.055932569244457546, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6026, "step": 2656 }, { "epoch": 0.05595362819372127, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5953, "step": 2657 }, { "epoch": 0.055974687142985, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6051, "step": 2658 }, { "epoch": 0.05599574609224873, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6029, "step": 2659 }, { "epoch": 0.056016805041512456, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5797, "step": 2660 }, { "epoch": 0.056037863990776184, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5865, "step": 2661 }, { "epoch": 0.056058922940039904, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5665, "step": 2662 }, { "epoch": 0.05607998188930363, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5664, "step": 2663 }, { "epoch": 0.05610104083856736, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5957, "step": 2664 }, { "epoch": 0.05612209978783109, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6215, "step": 2665 }, { "epoch": 0.056143158737094814, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.581, "step": 2666 }, { "epoch": 0.05616421768635854, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6117, "step": 2667 }, { "epoch": 0.05618527663562226, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6226, "step": 2668 }, { "epoch": 0.05620633558488599, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.58, "step": 2669 }, { "epoch": 0.05622739453414972, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5784, "step": 2670 }, { "epoch": 0.056248453483413445, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5885, "step": 2671 }, { "epoch": 0.05626951243267717, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6313, "step": 2672 }, { "epoch": 0.0562905713819409, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5783, "step": 2673 }, { "epoch": 0.05631163033120463, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5676, "step": 2674 }, { "epoch": 0.05633268928046835, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5918, "step": 2675 }, { "epoch": 0.056353748229732076, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5929, "step": 2676 }, { "epoch": 0.0563748071789958, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.583, "step": 2677 }, { "epoch": 0.05639586612825953, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6212, "step": 2678 }, { "epoch": 0.05641692507752326, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.574, "step": 2679 }, { "epoch": 0.056437984026786986, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6188, "step": 2680 }, { "epoch": 0.056459042976050706, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6065, "step": 2681 }, { "epoch": 0.056480101925314434, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.605, "step": 2682 }, { "epoch": 0.05650116087457816, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5809, "step": 2683 }, { "epoch": 0.05652221982384189, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5884, "step": 2684 }, { "epoch": 0.05654327877310562, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5698, "step": 2685 }, { "epoch": 0.056564337722369344, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5956, "step": 2686 }, { "epoch": 0.05658539667163307, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.593, "step": 2687 }, { "epoch": 0.05660645562089679, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.577, "step": 2688 }, { "epoch": 0.05662751457016052, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5796, "step": 2689 }, { "epoch": 0.05664857351942425, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5872, "step": 2690 }, { "epoch": 0.056669632468687975, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6305, "step": 2691 }, { "epoch": 0.0566906914179517, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6102, "step": 2692 }, { "epoch": 0.05671175036721543, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5731, "step": 2693 }, { "epoch": 0.05673280931647916, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5589, "step": 2694 }, { "epoch": 0.05675386826574288, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5873, "step": 2695 }, { "epoch": 0.056774927215006606, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6103, "step": 2696 }, { "epoch": 0.05679598616427033, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.578, "step": 2697 }, { "epoch": 0.05681704511353406, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6029, "step": 2698 }, { "epoch": 0.05683810406279779, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.608, "step": 2699 }, { "epoch": 0.056859163012061516, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5909, "step": 2700 }, { "epoch": 0.056880221961325236, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5855, "step": 2701 }, { "epoch": 0.056901280910588964, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5619, "step": 2702 }, { "epoch": 0.05692233985985269, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5706, "step": 2703 }, { "epoch": 0.05694339880911642, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.562, "step": 2704 }, { "epoch": 0.056964457758380146, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5656, "step": 2705 }, { "epoch": 0.056985516707643874, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5647, "step": 2706 }, { "epoch": 0.0570065756569076, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5954, "step": 2707 }, { "epoch": 0.05702763460617132, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5997, "step": 2708 }, { "epoch": 0.05704869355543505, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5598, "step": 2709 }, { "epoch": 0.05706975250469878, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5839, "step": 2710 }, { "epoch": 0.057090811453962505, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5533, "step": 2711 }, { "epoch": 0.05711187040322623, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5614, "step": 2712 }, { "epoch": 0.05713292935248996, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5905, "step": 2713 }, { "epoch": 0.05715398830175369, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5641, "step": 2714 }, { "epoch": 0.05717504725101741, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6035, "step": 2715 }, { "epoch": 0.057196106200281135, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6472, "step": 2716 }, { "epoch": 0.05721716514954486, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6103, "step": 2717 }, { "epoch": 0.05723822409880859, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5716, "step": 2718 }, { "epoch": 0.05725928304807232, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5923, "step": 2719 }, { "epoch": 0.057280341997336046, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5762, "step": 2720 }, { "epoch": 0.057301400946599766, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6077, "step": 2721 }, { "epoch": 0.057322459895863494, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6077, "step": 2722 }, { "epoch": 0.05734351884512722, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6006, "step": 2723 }, { "epoch": 0.05736457779439095, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5734, "step": 2724 }, { "epoch": 0.057385636743654676, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6142, "step": 2725 }, { "epoch": 0.057406695692918404, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5973, "step": 2726 }, { "epoch": 0.05742775464218213, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6103, "step": 2727 }, { "epoch": 0.05744881359144585, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6247, "step": 2728 }, { "epoch": 0.05746987254070958, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5799, "step": 2729 }, { "epoch": 0.05749093148997331, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5964, "step": 2730 }, { "epoch": 0.057511990439237035, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.6099, "step": 2731 }, { "epoch": 0.05753304938850076, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5822, "step": 2732 }, { "epoch": 0.05755410833776449, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5873, "step": 2733 }, { "epoch": 0.05757516728702822, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6042, "step": 2734 }, { "epoch": 0.05759622623629194, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6014, "step": 2735 }, { "epoch": 0.057617285185555665, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5787, "step": 2736 }, { "epoch": 0.05763834413481939, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.567, "step": 2737 }, { "epoch": 0.05765940308408312, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6316, "step": 2738 }, { "epoch": 0.05768046203334685, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5763, "step": 2739 }, { "epoch": 0.057701520982610575, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6204, "step": 2740 }, { "epoch": 0.057722579931874296, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5699, "step": 2741 }, { "epoch": 0.057743638881138024, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6197, "step": 2742 }, { "epoch": 0.05776469783040175, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5903, "step": 2743 }, { "epoch": 0.05778575677966548, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6286, "step": 2744 }, { "epoch": 0.057806815728929206, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6015, "step": 2745 }, { "epoch": 0.057827874678192934, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5798, "step": 2746 }, { "epoch": 0.05784893362745666, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5844, "step": 2747 }, { "epoch": 0.05786999257672038, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6236, "step": 2748 }, { "epoch": 0.05789105152598411, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6141, "step": 2749 }, { "epoch": 0.05791211047524784, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5956, "step": 2750 }, { "epoch": 0.057933169424511564, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.613, "step": 2751 }, { "epoch": 0.05795422837377529, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5997, "step": 2752 }, { "epoch": 0.05797528732303902, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.599, "step": 2753 }, { "epoch": 0.05799634627230274, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5759, "step": 2754 }, { "epoch": 0.05801740522156647, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5712, "step": 2755 }, { "epoch": 0.058038464170830195, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.588, "step": 2756 }, { "epoch": 0.05805952312009392, "grad_norm": 0.1904296875, "learning_rate": 0.0002, "loss": 1.5824, "step": 2757 }, { "epoch": 0.05808058206935765, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5989, "step": 2758 }, { "epoch": 0.05810164101862138, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6239, "step": 2759 }, { "epoch": 0.058122699967885105, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6158, "step": 2760 }, { "epoch": 0.058143758917148826, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5622, "step": 2761 }, { "epoch": 0.05816481786641255, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6141, "step": 2762 }, { "epoch": 0.05818587681567628, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6099, "step": 2763 }, { "epoch": 0.05820693576494001, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6027, "step": 2764 }, { "epoch": 0.058227994714203736, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5761, "step": 2765 }, { "epoch": 0.058249053663467464, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5925, "step": 2766 }, { "epoch": 0.05827011261273119, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6139, "step": 2767 }, { "epoch": 0.05829117156199491, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6219, "step": 2768 }, { "epoch": 0.05831223051125864, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5672, "step": 2769 }, { "epoch": 0.05833328946052237, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6296, "step": 2770 }, { "epoch": 0.058354348409786094, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5672, "step": 2771 }, { "epoch": 0.05837540735904982, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6268, "step": 2772 }, { "epoch": 0.05839646630831355, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5972, "step": 2773 }, { "epoch": 0.05841752525757727, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.5838, "step": 2774 }, { "epoch": 0.058438584206841, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5931, "step": 2775 }, { "epoch": 0.058459643156104725, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5864, "step": 2776 }, { "epoch": 0.05848070210536845, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5737, "step": 2777 }, { "epoch": 0.05850176105463218, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6105, "step": 2778 }, { "epoch": 0.05852282000389591, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.6129, "step": 2779 }, { "epoch": 0.058543878953159635, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5703, "step": 2780 }, { "epoch": 0.058564937902423356, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6097, "step": 2781 }, { "epoch": 0.05858599685168708, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5902, "step": 2782 }, { "epoch": 0.05860705580095081, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6116, "step": 2783 }, { "epoch": 0.05862811475021454, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6121, "step": 2784 }, { "epoch": 0.058649173699478266, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5928, "step": 2785 }, { "epoch": 0.05867023264874199, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5852, "step": 2786 }, { "epoch": 0.05869129159800572, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.602, "step": 2787 }, { "epoch": 0.05871235054726944, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6231, "step": 2788 }, { "epoch": 0.05873340949653317, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5872, "step": 2789 }, { "epoch": 0.0587544684457969, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.604, "step": 2790 }, { "epoch": 0.058775527395060624, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5861, "step": 2791 }, { "epoch": 0.05879658634432435, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6121, "step": 2792 }, { "epoch": 0.05881764529358808, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5855, "step": 2793 }, { "epoch": 0.0588387042428518, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6039, "step": 2794 }, { "epoch": 0.05885976319211553, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5732, "step": 2795 }, { "epoch": 0.058880822141379255, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5777, "step": 2796 }, { "epoch": 0.05890188109064298, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5859, "step": 2797 }, { "epoch": 0.05892294003990671, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5689, "step": 2798 }, { "epoch": 0.05894399898917044, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6183, "step": 2799 }, { "epoch": 0.058965057938434165, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5746, "step": 2800 }, { "epoch": 0.058986116887697886, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.636, "step": 2801 }, { "epoch": 0.05900717583696161, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5808, "step": 2802 }, { "epoch": 0.05902823478622534, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5998, "step": 2803 }, { "epoch": 0.05904929373548907, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5915, "step": 2804 }, { "epoch": 0.059070352684752796, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5949, "step": 2805 }, { "epoch": 0.05909141163401652, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6258, "step": 2806 }, { "epoch": 0.059112470583280244, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5923, "step": 2807 }, { "epoch": 0.05913352953254397, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5979, "step": 2808 }, { "epoch": 0.0591545884818077, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5856, "step": 2809 }, { "epoch": 0.059175647431071426, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.595, "step": 2810 }, { "epoch": 0.059196706380335154, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5934, "step": 2811 }, { "epoch": 0.05921776532959888, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6112, "step": 2812 }, { "epoch": 0.05923882427886261, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6359, "step": 2813 }, { "epoch": 0.05925988322812633, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6083, "step": 2814 }, { "epoch": 0.05928094217739006, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5733, "step": 2815 }, { "epoch": 0.059302001126653785, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.585, "step": 2816 }, { "epoch": 0.05932306007591751, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5912, "step": 2817 }, { "epoch": 0.05934411902518124, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6004, "step": 2818 }, { "epoch": 0.05936517797444497, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5871, "step": 2819 }, { "epoch": 0.059386236923708695, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.5781, "step": 2820 }, { "epoch": 0.059407295872972415, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6671, "step": 2821 }, { "epoch": 0.05942835482223614, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5751, "step": 2822 }, { "epoch": 0.05944941377149987, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6136, "step": 2823 }, { "epoch": 0.0594704727207636, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5822, "step": 2824 }, { "epoch": 0.059491531670027326, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5868, "step": 2825 }, { "epoch": 0.05951259061929105, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5675, "step": 2826 }, { "epoch": 0.059533649568554774, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6064, "step": 2827 }, { "epoch": 0.0595547085178185, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5617, "step": 2828 }, { "epoch": 0.05957576746708223, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5764, "step": 2829 }, { "epoch": 0.059596826416345956, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5262, "step": 2830 }, { "epoch": 0.059617885365609684, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5867, "step": 2831 }, { "epoch": 0.05963894431487341, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5631, "step": 2832 }, { "epoch": 0.05966000326413714, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5727, "step": 2833 }, { "epoch": 0.05968106221340086, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6295, "step": 2834 }, { "epoch": 0.05970212116266459, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5926, "step": 2835 }, { "epoch": 0.059723180111928315, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5513, "step": 2836 }, { "epoch": 0.05974423906119204, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.588, "step": 2837 }, { "epoch": 0.05976529801045577, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5922, "step": 2838 }, { "epoch": 0.0597863569597195, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6122, "step": 2839 }, { "epoch": 0.059807415908983225, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5758, "step": 2840 }, { "epoch": 0.059828474858246945, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6076, "step": 2841 }, { "epoch": 0.05984953380751067, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5862, "step": 2842 }, { "epoch": 0.0598705927567744, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5776, "step": 2843 }, { "epoch": 0.05989165170603813, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5941, "step": 2844 }, { "epoch": 0.059912710655301855, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5857, "step": 2845 }, { "epoch": 0.05993376960456558, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6107, "step": 2846 }, { "epoch": 0.059954828553829304, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5781, "step": 2847 }, { "epoch": 0.05997588750309303, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6101, "step": 2848 }, { "epoch": 0.05999694645235676, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5965, "step": 2849 }, { "epoch": 0.060018005401620486, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6008, "step": 2850 }, { "epoch": 0.060039064350884214, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5428, "step": 2851 }, { "epoch": 0.06006012330014794, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5873, "step": 2852 }, { "epoch": 0.06008118224941167, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.578, "step": 2853 }, { "epoch": 0.06010224119867539, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6043, "step": 2854 }, { "epoch": 0.06012330014793912, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5796, "step": 2855 }, { "epoch": 0.060144359097202844, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6123, "step": 2856 }, { "epoch": 0.06016541804646657, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5845, "step": 2857 }, { "epoch": 0.0601864769957303, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5982, "step": 2858 }, { "epoch": 0.06020753594499403, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5843, "step": 2859 }, { "epoch": 0.06022859489425775, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5734, "step": 2860 }, { "epoch": 0.060249653843521475, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5915, "step": 2861 }, { "epoch": 0.0602707127927852, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5905, "step": 2862 }, { "epoch": 0.06029177174204893, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.6011, "step": 2863 }, { "epoch": 0.06031283069131266, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6006, "step": 2864 }, { "epoch": 0.060333889640576385, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6002, "step": 2865 }, { "epoch": 0.06035494858984011, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6319, "step": 2866 }, { "epoch": 0.06037600753910383, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5671, "step": 2867 }, { "epoch": 0.06039706648836756, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6143, "step": 2868 }, { "epoch": 0.06041812543763129, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5957, "step": 2869 }, { "epoch": 0.060439184386895016, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6095, "step": 2870 }, { "epoch": 0.060460243336158744, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6156, "step": 2871 }, { "epoch": 0.06048130228542247, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.6086, "step": 2872 }, { "epoch": 0.0605023612346862, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5702, "step": 2873 }, { "epoch": 0.06052342018394992, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5553, "step": 2874 }, { "epoch": 0.06054447913321365, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6046, "step": 2875 }, { "epoch": 0.060565538082477374, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5751, "step": 2876 }, { "epoch": 0.0605865970317411, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5865, "step": 2877 }, { "epoch": 0.06060765598100483, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6128, "step": 2878 }, { "epoch": 0.06062871493026856, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5626, "step": 2879 }, { "epoch": 0.06064977387953228, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6007, "step": 2880 }, { "epoch": 0.060670832828796005, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5916, "step": 2881 }, { "epoch": 0.06069189177805973, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5888, "step": 2882 }, { "epoch": 0.06071295072732346, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5954, "step": 2883 }, { "epoch": 0.06073400967658719, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5966, "step": 2884 }, { "epoch": 0.060755068625850915, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6092, "step": 2885 }, { "epoch": 0.06077612757511464, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.561, "step": 2886 }, { "epoch": 0.06079718652437836, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.581, "step": 2887 }, { "epoch": 0.06081824547364209, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5738, "step": 2888 }, { "epoch": 0.06083930442290582, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5896, "step": 2889 }, { "epoch": 0.060860363372169546, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5899, "step": 2890 }, { "epoch": 0.06088142232143327, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.561, "step": 2891 }, { "epoch": 0.060902481270697, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5747, "step": 2892 }, { "epoch": 0.06092354021996073, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6181, "step": 2893 }, { "epoch": 0.06094459916922445, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5812, "step": 2894 }, { "epoch": 0.06096565811848818, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5968, "step": 2895 }, { "epoch": 0.060986717067751904, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5755, "step": 2896 }, { "epoch": 0.06100777601701563, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6082, "step": 2897 }, { "epoch": 0.06102883496627936, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5916, "step": 2898 }, { "epoch": 0.06104989391554309, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6061, "step": 2899 }, { "epoch": 0.06107095286480681, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5612, "step": 2900 }, { "epoch": 0.061092011814070535, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5545, "step": 2901 }, { "epoch": 0.06111307076333426, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6013, "step": 2902 }, { "epoch": 0.06113412971259799, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5885, "step": 2903 }, { "epoch": 0.06115518866186172, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5584, "step": 2904 }, { "epoch": 0.061176247611125445, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5952, "step": 2905 }, { "epoch": 0.06119730656038917, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6214, "step": 2906 }, { "epoch": 0.06121836550965289, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5832, "step": 2907 }, { "epoch": 0.06123942445891662, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5581, "step": 2908 }, { "epoch": 0.06126048340818035, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5858, "step": 2909 }, { "epoch": 0.061281542357444076, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5741, "step": 2910 }, { "epoch": 0.0613026013067078, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5896, "step": 2911 }, { "epoch": 0.06132366025597153, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6074, "step": 2912 }, { "epoch": 0.06134471920523525, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5767, "step": 2913 }, { "epoch": 0.06136577815449898, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5643, "step": 2914 }, { "epoch": 0.061386837103762706, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5622, "step": 2915 }, { "epoch": 0.061407896053026434, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6049, "step": 2916 }, { "epoch": 0.06142895500229016, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5876, "step": 2917 }, { "epoch": 0.06145001395155389, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5791, "step": 2918 }, { "epoch": 0.06147107290081762, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5735, "step": 2919 }, { "epoch": 0.06149213185008134, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6117, "step": 2920 }, { "epoch": 0.061513190799345065, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5737, "step": 2921 }, { "epoch": 0.06153424974860879, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5987, "step": 2922 }, { "epoch": 0.06155530869787252, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5796, "step": 2923 }, { "epoch": 0.06157636764713625, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6035, "step": 2924 }, { "epoch": 0.061597426596399975, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5918, "step": 2925 }, { "epoch": 0.0616184855456637, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5885, "step": 2926 }, { "epoch": 0.06163954449492742, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5628, "step": 2927 }, { "epoch": 0.06166060344419115, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5971, "step": 2928 }, { "epoch": 0.06168166239345488, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5758, "step": 2929 }, { "epoch": 0.061702721342718606, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5956, "step": 2930 }, { "epoch": 0.06172378029198233, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5761, "step": 2931 }, { "epoch": 0.06174483924124606, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5888, "step": 2932 }, { "epoch": 0.06176589819050978, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5697, "step": 2933 }, { "epoch": 0.06178695713977351, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5948, "step": 2934 }, { "epoch": 0.061808016089037236, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5668, "step": 2935 }, { "epoch": 0.061829075038300964, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5874, "step": 2936 }, { "epoch": 0.06185013398756469, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.6189, "step": 2937 }, { "epoch": 0.06187119293682842, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.593, "step": 2938 }, { "epoch": 0.061892251886092146, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6187, "step": 2939 }, { "epoch": 0.06191331083535587, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6067, "step": 2940 }, { "epoch": 0.061934369784619595, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5938, "step": 2941 }, { "epoch": 0.06195542873388332, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6212, "step": 2942 }, { "epoch": 0.06197648768314705, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5653, "step": 2943 }, { "epoch": 0.06199754663241078, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6083, "step": 2944 }, { "epoch": 0.062018605581674505, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5916, "step": 2945 }, { "epoch": 0.06203966453093823, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6082, "step": 2946 }, { "epoch": 0.06206072348020195, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6118, "step": 2947 }, { "epoch": 0.06208178242946568, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5965, "step": 2948 }, { "epoch": 0.06210284137872941, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5617, "step": 2949 }, { "epoch": 0.062123900327993135, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5769, "step": 2950 }, { "epoch": 0.06214495927725686, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5891, "step": 2951 }, { "epoch": 0.06216601822652059, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5869, "step": 2952 }, { "epoch": 0.06218707717578431, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5884, "step": 2953 }, { "epoch": 0.06220813612504804, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6096, "step": 2954 }, { "epoch": 0.062229195074311766, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6331, "step": 2955 }, { "epoch": 0.062250254023575494, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.574, "step": 2956 }, { "epoch": 0.06227131297283922, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6117, "step": 2957 }, { "epoch": 0.06229237192210295, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5479, "step": 2958 }, { "epoch": 0.062313430871366676, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6285, "step": 2959 }, { "epoch": 0.0623344898206304, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5694, "step": 2960 }, { "epoch": 0.062355548769894124, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5903, "step": 2961 }, { "epoch": 0.06237660771915785, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5592, "step": 2962 }, { "epoch": 0.06239766666842158, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5959, "step": 2963 }, { "epoch": 0.06241872561768531, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.568, "step": 2964 }, { "epoch": 0.062439784566949035, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5848, "step": 2965 }, { "epoch": 0.062460843516212755, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5861, "step": 2966 }, { "epoch": 0.06248190246547648, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6007, "step": 2967 }, { "epoch": 0.06250296141474021, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5867, "step": 2968 }, { "epoch": 0.06252402036400394, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5906, "step": 2969 }, { "epoch": 0.06254507931326767, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.6045, "step": 2970 }, { "epoch": 0.06256613826253139, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.626, "step": 2971 }, { "epoch": 0.06258719721179512, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6292, "step": 2972 }, { "epoch": 0.06260825616105885, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.608, "step": 2973 }, { "epoch": 0.06262931511032258, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6152, "step": 2974 }, { "epoch": 0.0626503740595863, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5555, "step": 2975 }, { "epoch": 0.06267143300885002, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5855, "step": 2976 }, { "epoch": 0.06269249195811374, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5993, "step": 2977 }, { "epoch": 0.06271355090737747, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5872, "step": 2978 }, { "epoch": 0.0627346098566412, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.637, "step": 2979 }, { "epoch": 0.06275566880590493, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5912, "step": 2980 }, { "epoch": 0.06277672775516865, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5965, "step": 2981 }, { "epoch": 0.06279778670443238, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5999, "step": 2982 }, { "epoch": 0.06281884565369611, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5987, "step": 2983 }, { "epoch": 0.06283990460295984, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6009, "step": 2984 }, { "epoch": 0.06286096355222356, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5902, "step": 2985 }, { "epoch": 0.06288202250148729, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6002, "step": 2986 }, { "epoch": 0.06290308145075102, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6044, "step": 2987 }, { "epoch": 0.06292414040001475, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6099, "step": 2988 }, { "epoch": 0.06294519934927847, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5466, "step": 2989 }, { "epoch": 0.06296625829854219, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5766, "step": 2990 }, { "epoch": 0.06298731724780592, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.572, "step": 2991 }, { "epoch": 0.06300837619706964, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6028, "step": 2992 }, { "epoch": 0.06302943514633337, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6016, "step": 2993 }, { "epoch": 0.0630504940955971, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5996, "step": 2994 }, { "epoch": 0.06307155304486083, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.5902, "step": 2995 }, { "epoch": 0.06309261199412455, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.596, "step": 2996 }, { "epoch": 0.06311367094338828, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5985, "step": 2997 }, { "epoch": 0.06313472989265201, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5796, "step": 2998 }, { "epoch": 0.06315578884191574, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6203, "step": 2999 }, { "epoch": 0.06317684779117946, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.6133, "step": 3000 }, { "epoch": 0.06317684779117946, "eval_loss": 1.9333325624465942, "eval_runtime": 897.2967, "eval_samples_per_second": 68.874, "eval_steps_per_second": 2.153, "step": 3000 }, { "epoch": 0.06319790674044319, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5906, "step": 3001 }, { "epoch": 0.06321896568970692, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5969, "step": 3002 }, { "epoch": 0.06324002463897063, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.5957, "step": 3003 }, { "epoch": 0.06326108358823436, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5964, "step": 3004 }, { "epoch": 0.06328214253749809, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5886, "step": 3005 }, { "epoch": 0.06330320148676181, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5531, "step": 3006 }, { "epoch": 0.06332426043602554, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5673, "step": 3007 }, { "epoch": 0.06334531938528927, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6109, "step": 3008 }, { "epoch": 0.063366378334553, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5875, "step": 3009 }, { "epoch": 0.06338743728381672, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5615, "step": 3010 }, { "epoch": 0.06340849623308045, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5999, "step": 3011 }, { "epoch": 0.06342955518234418, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6145, "step": 3012 }, { "epoch": 0.06345061413160791, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6371, "step": 3013 }, { "epoch": 0.06347167308087164, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6008, "step": 3014 }, { "epoch": 0.06349273203013536, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5836, "step": 3015 }, { "epoch": 0.06351379097939908, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5772, "step": 3016 }, { "epoch": 0.0635348499286628, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6209, "step": 3017 }, { "epoch": 0.06355590887792653, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6118, "step": 3018 }, { "epoch": 0.06357696782719026, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5861, "step": 3019 }, { "epoch": 0.06359802677645399, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.593, "step": 3020 }, { "epoch": 0.06361908572571771, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5714, "step": 3021 }, { "epoch": 0.06364014467498144, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5833, "step": 3022 }, { "epoch": 0.06366120362424517, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6076, "step": 3023 }, { "epoch": 0.0636822625735089, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5769, "step": 3024 }, { "epoch": 0.06370332152277262, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5562, "step": 3025 }, { "epoch": 0.06372438047203635, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5904, "step": 3026 }, { "epoch": 0.06374543942130008, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5971, "step": 3027 }, { "epoch": 0.0637664983705638, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5802, "step": 3028 }, { "epoch": 0.06378755731982752, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6065, "step": 3029 }, { "epoch": 0.06380861626909125, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.5643, "step": 3030 }, { "epoch": 0.06382967521835498, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5886, "step": 3031 }, { "epoch": 0.0638507341676187, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6016, "step": 3032 }, { "epoch": 0.06387179311688243, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5743, "step": 3033 }, { "epoch": 0.06389285206614616, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5864, "step": 3034 }, { "epoch": 0.06391391101540989, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5741, "step": 3035 }, { "epoch": 0.06393496996467361, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6237, "step": 3036 }, { "epoch": 0.06395602891393734, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5835, "step": 3037 }, { "epoch": 0.06397708786320107, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.58, "step": 3038 }, { "epoch": 0.0639981468124648, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.587, "step": 3039 }, { "epoch": 0.06401920576172852, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5824, "step": 3040 }, { "epoch": 0.06404026471099225, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5974, "step": 3041 }, { "epoch": 0.06406132366025598, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5969, "step": 3042 }, { "epoch": 0.06408238260951969, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5584, "step": 3043 }, { "epoch": 0.06410344155878342, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6098, "step": 3044 }, { "epoch": 0.06412450050804715, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5999, "step": 3045 }, { "epoch": 0.06414555945731087, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6047, "step": 3046 }, { "epoch": 0.0641666184065746, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5648, "step": 3047 }, { "epoch": 0.06418767735583833, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5723, "step": 3048 }, { "epoch": 0.06420873630510206, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6345, "step": 3049 }, { "epoch": 0.06422979525436578, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.6192, "step": 3050 }, { "epoch": 0.06425085420362951, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5963, "step": 3051 }, { "epoch": 0.06427191315289324, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6091, "step": 3052 }, { "epoch": 0.06429297210215697, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5706, "step": 3053 }, { "epoch": 0.0643140310514207, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5992, "step": 3054 }, { "epoch": 0.06433509000068442, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6289, "step": 3055 }, { "epoch": 0.06435614894994814, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5666, "step": 3056 }, { "epoch": 0.06437720789921186, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5682, "step": 3057 }, { "epoch": 0.06439826684847559, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6141, "step": 3058 }, { "epoch": 0.06441932579773932, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6054, "step": 3059 }, { "epoch": 0.06444038474700305, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5862, "step": 3060 }, { "epoch": 0.06446144369626677, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5831, "step": 3061 }, { "epoch": 0.0644825026455305, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6014, "step": 3062 }, { "epoch": 0.06450356159479423, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5685, "step": 3063 }, { "epoch": 0.06452462054405796, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6278, "step": 3064 }, { "epoch": 0.06454567949332168, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6106, "step": 3065 }, { "epoch": 0.06456673844258541, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5928, "step": 3066 }, { "epoch": 0.06458779739184914, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.595, "step": 3067 }, { "epoch": 0.06460885634111287, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5907, "step": 3068 }, { "epoch": 0.06462991529037658, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.59, "step": 3069 }, { "epoch": 0.06465097423964031, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5732, "step": 3070 }, { "epoch": 0.06467203318890404, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5765, "step": 3071 }, { "epoch": 0.06469309213816776, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.578, "step": 3072 }, { "epoch": 0.06471415108743149, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6227, "step": 3073 }, { "epoch": 0.06473521003669522, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6012, "step": 3074 }, { "epoch": 0.06475626898595895, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.6134, "step": 3075 }, { "epoch": 0.06477732793522267, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5859, "step": 3076 }, { "epoch": 0.0647983868844864, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5393, "step": 3077 }, { "epoch": 0.06481944583375013, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6228, "step": 3078 }, { "epoch": 0.06484050478301386, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.567, "step": 3079 }, { "epoch": 0.06486156373227758, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5892, "step": 3080 }, { "epoch": 0.06488262268154131, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.607, "step": 3081 }, { "epoch": 0.06490368163080502, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5929, "step": 3082 }, { "epoch": 0.06492474058006875, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6184, "step": 3083 }, { "epoch": 0.06494579952933248, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5705, "step": 3084 }, { "epoch": 0.0649668584785962, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5949, "step": 3085 }, { "epoch": 0.06498791742785993, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6382, "step": 3086 }, { "epoch": 0.06500897637712366, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6078, "step": 3087 }, { "epoch": 0.06503003532638739, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.591, "step": 3088 }, { "epoch": 0.06505109427565112, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6111, "step": 3089 }, { "epoch": 0.06507215322491484, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.583, "step": 3090 }, { "epoch": 0.06509321217417857, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5775, "step": 3091 }, { "epoch": 0.0651142711234423, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5945, "step": 3092 }, { "epoch": 0.06513533007270603, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5741, "step": 3093 }, { "epoch": 0.06515638902196975, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6234, "step": 3094 }, { "epoch": 0.06517744797123348, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5584, "step": 3095 }, { "epoch": 0.0651985069204972, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5895, "step": 3096 }, { "epoch": 0.06521956586976092, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6119, "step": 3097 }, { "epoch": 0.06524062481902465, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5983, "step": 3098 }, { "epoch": 0.06526168376828838, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6006, "step": 3099 }, { "epoch": 0.0652827427175521, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5793, "step": 3100 }, { "epoch": 0.06530380166681583, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6044, "step": 3101 }, { "epoch": 0.06532486061607956, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5612, "step": 3102 }, { "epoch": 0.06534591956534329, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6095, "step": 3103 }, { "epoch": 0.06536697851460702, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5979, "step": 3104 }, { "epoch": 0.06538803746387074, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5933, "step": 3105 }, { "epoch": 0.06540909641313447, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5868, "step": 3106 }, { "epoch": 0.0654301553623982, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5832, "step": 3107 }, { "epoch": 0.06545121431166193, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5897, "step": 3108 }, { "epoch": 0.06547227326092564, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6334, "step": 3109 }, { "epoch": 0.06549333221018937, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5847, "step": 3110 }, { "epoch": 0.0655143911594531, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6056, "step": 3111 }, { "epoch": 0.06553545010871682, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5805, "step": 3112 }, { "epoch": 0.06555650905798055, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5843, "step": 3113 }, { "epoch": 0.06557756800724428, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6168, "step": 3114 }, { "epoch": 0.065598626956508, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.606, "step": 3115 }, { "epoch": 0.06561968590577173, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6039, "step": 3116 }, { "epoch": 0.06564074485503546, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5845, "step": 3117 }, { "epoch": 0.06566180380429919, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5681, "step": 3118 }, { "epoch": 0.06568286275356292, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5683, "step": 3119 }, { "epoch": 0.06570392170282664, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.5582, "step": 3120 }, { "epoch": 0.06572498065209037, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6254, "step": 3121 }, { "epoch": 0.06574603960135408, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5739, "step": 3122 }, { "epoch": 0.06576709855061781, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5972, "step": 3123 }, { "epoch": 0.06578815749988154, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5846, "step": 3124 }, { "epoch": 0.06580921644914527, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5959, "step": 3125 }, { "epoch": 0.065830275398409, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5985, "step": 3126 }, { "epoch": 0.06585133434767272, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5916, "step": 3127 }, { "epoch": 0.06587239329693645, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5687, "step": 3128 }, { "epoch": 0.06589345224620018, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5815, "step": 3129 }, { "epoch": 0.0659145111954639, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5674, "step": 3130 }, { "epoch": 0.06593557014472763, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.573, "step": 3131 }, { "epoch": 0.06595662909399136, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5584, "step": 3132 }, { "epoch": 0.06597768804325509, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5609, "step": 3133 }, { "epoch": 0.06599874699251881, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6015, "step": 3134 }, { "epoch": 0.06601980594178254, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5862, "step": 3135 }, { "epoch": 0.06604086489104626, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5978, "step": 3136 }, { "epoch": 0.06606192384030998, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5902, "step": 3137 }, { "epoch": 0.06608298278957371, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6151, "step": 3138 }, { "epoch": 0.06610404173883744, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5702, "step": 3139 }, { "epoch": 0.06612510068810117, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5842, "step": 3140 }, { "epoch": 0.06614615963736489, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5535, "step": 3141 }, { "epoch": 0.06616721858662862, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5695, "step": 3142 }, { "epoch": 0.06618827753589235, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5851, "step": 3143 }, { "epoch": 0.06620933648515608, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5686, "step": 3144 }, { "epoch": 0.0662303954344198, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5837, "step": 3145 }, { "epoch": 0.06625145438368353, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6189, "step": 3146 }, { "epoch": 0.06627251333294726, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5518, "step": 3147 }, { "epoch": 0.06629357228221099, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5905, "step": 3148 }, { "epoch": 0.0663146312314747, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5973, "step": 3149 }, { "epoch": 0.06633569018073843, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6209, "step": 3150 }, { "epoch": 0.06635674913000215, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5701, "step": 3151 }, { "epoch": 0.06637780807926588, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.578, "step": 3152 }, { "epoch": 0.06639886702852961, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5621, "step": 3153 }, { "epoch": 0.06641992597779334, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5551, "step": 3154 }, { "epoch": 0.06644098492705706, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5914, "step": 3155 }, { "epoch": 0.06646204387632079, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5788, "step": 3156 }, { "epoch": 0.06648310282558452, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5987, "step": 3157 }, { "epoch": 0.06650416177484825, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5284, "step": 3158 }, { "epoch": 0.06652522072411197, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5909, "step": 3159 }, { "epoch": 0.0665462796733757, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5987, "step": 3160 }, { "epoch": 0.06656733862263943, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5567, "step": 3161 }, { "epoch": 0.06658839757190314, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5952, "step": 3162 }, { "epoch": 0.06660945652116687, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.634, "step": 3163 }, { "epoch": 0.0666305154704306, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5741, "step": 3164 }, { "epoch": 0.06665157441969433, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5805, "step": 3165 }, { "epoch": 0.06667263336895805, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5575, "step": 3166 }, { "epoch": 0.06669369231822178, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5991, "step": 3167 }, { "epoch": 0.06671475126748551, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5642, "step": 3168 }, { "epoch": 0.06673581021674924, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5953, "step": 3169 }, { "epoch": 0.06675686916601296, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5967, "step": 3170 }, { "epoch": 0.06677792811527669, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6008, "step": 3171 }, { "epoch": 0.06679898706454042, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6075, "step": 3172 }, { "epoch": 0.06682004601380415, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5334, "step": 3173 }, { "epoch": 0.06684110496306787, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5771, "step": 3174 }, { "epoch": 0.06686216391233159, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5455, "step": 3175 }, { "epoch": 0.06688322286159532, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5859, "step": 3176 }, { "epoch": 0.06690428181085904, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5846, "step": 3177 }, { "epoch": 0.06692534076012277, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6238, "step": 3178 }, { "epoch": 0.0669463997093865, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6122, "step": 3179 }, { "epoch": 0.06696745865865023, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5924, "step": 3180 }, { "epoch": 0.06698851760791395, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5964, "step": 3181 }, { "epoch": 0.06700957655717768, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5815, "step": 3182 }, { "epoch": 0.06703063550644141, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6021, "step": 3183 }, { "epoch": 0.06705169445570514, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.596, "step": 3184 }, { "epoch": 0.06707275340496886, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5652, "step": 3185 }, { "epoch": 0.06709381235423259, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5659, "step": 3186 }, { "epoch": 0.06711487130349632, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6112, "step": 3187 }, { "epoch": 0.06713593025276005, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5256, "step": 3188 }, { "epoch": 0.06715698920202376, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6053, "step": 3189 }, { "epoch": 0.06717804815128749, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5716, "step": 3190 }, { "epoch": 0.06719910710055121, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6144, "step": 3191 }, { "epoch": 0.06722016604981494, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5854, "step": 3192 }, { "epoch": 0.06724122499907867, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5918, "step": 3193 }, { "epoch": 0.0672622839483424, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5865, "step": 3194 }, { "epoch": 0.06728334289760612, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5762, "step": 3195 }, { "epoch": 0.06730440184686985, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5988, "step": 3196 }, { "epoch": 0.06732546079613358, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.628, "step": 3197 }, { "epoch": 0.0673465197453973, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5663, "step": 3198 }, { "epoch": 0.06736757869466103, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5896, "step": 3199 }, { "epoch": 0.06738863764392476, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.6017, "step": 3200 }, { "epoch": 0.06740969659318849, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5523, "step": 3201 }, { "epoch": 0.0674307555424522, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5864, "step": 3202 }, { "epoch": 0.06745181449171593, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6228, "step": 3203 }, { "epoch": 0.06747287344097966, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5868, "step": 3204 }, { "epoch": 0.06749393239024339, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6038, "step": 3205 }, { "epoch": 0.06751499133950711, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.565, "step": 3206 }, { "epoch": 0.06753605028877084, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6388, "step": 3207 }, { "epoch": 0.06755710923803457, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5923, "step": 3208 }, { "epoch": 0.0675781681872983, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6203, "step": 3209 }, { "epoch": 0.06759922713656202, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6211, "step": 3210 }, { "epoch": 0.06762028608582575, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.6208, "step": 3211 }, { "epoch": 0.06764134503508948, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6139, "step": 3212 }, { "epoch": 0.0676624039843532, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.6047, "step": 3213 }, { "epoch": 0.06768346293361693, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5747, "step": 3214 }, { "epoch": 0.06770452188288065, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6023, "step": 3215 }, { "epoch": 0.06772558083214437, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.576, "step": 3216 }, { "epoch": 0.0677466397814081, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6063, "step": 3217 }, { "epoch": 0.06776769873067183, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5833, "step": 3218 }, { "epoch": 0.06778875767993556, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5709, "step": 3219 }, { "epoch": 0.06780981662919928, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5727, "step": 3220 }, { "epoch": 0.06783087557846301, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6391, "step": 3221 }, { "epoch": 0.06785193452772674, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5911, "step": 3222 }, { "epoch": 0.06787299347699047, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5796, "step": 3223 }, { "epoch": 0.0678940524262542, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.542, "step": 3224 }, { "epoch": 0.06791511137551792, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5873, "step": 3225 }, { "epoch": 0.06793617032478165, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6205, "step": 3226 }, { "epoch": 0.06795722927404538, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5749, "step": 3227 }, { "epoch": 0.06797828822330909, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5544, "step": 3228 }, { "epoch": 0.06799934717257282, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5965, "step": 3229 }, { "epoch": 0.06802040612183655, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5878, "step": 3230 }, { "epoch": 0.06804146507110027, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5614, "step": 3231 }, { "epoch": 0.068062524020364, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5762, "step": 3232 }, { "epoch": 0.06808358296962773, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.6164, "step": 3233 }, { "epoch": 0.06810464191889146, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5852, "step": 3234 }, { "epoch": 0.06812570086815518, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5969, "step": 3235 }, { "epoch": 0.06814675981741891, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5763, "step": 3236 }, { "epoch": 0.06816781876668264, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5924, "step": 3237 }, { "epoch": 0.06818887771594637, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5841, "step": 3238 }, { "epoch": 0.0682099366652101, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5696, "step": 3239 }, { "epoch": 0.06823099561447382, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5658, "step": 3240 }, { "epoch": 0.06825205456373755, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5994, "step": 3241 }, { "epoch": 0.06827311351300126, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6002, "step": 3242 }, { "epoch": 0.06829417246226499, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5845, "step": 3243 }, { "epoch": 0.06831523141152872, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6075, "step": 3244 }, { "epoch": 0.06833629036079245, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5567, "step": 3245 }, { "epoch": 0.06835734931005617, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.623, "step": 3246 }, { "epoch": 0.0683784082593199, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5745, "step": 3247 }, { "epoch": 0.06839946720858363, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5872, "step": 3248 }, { "epoch": 0.06842052615784736, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6086, "step": 3249 }, { "epoch": 0.06844158510711108, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.576, "step": 3250 }, { "epoch": 0.06846264405637481, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5546, "step": 3251 }, { "epoch": 0.06848370300563854, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.593, "step": 3252 }, { "epoch": 0.06850476195490227, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6151, "step": 3253 }, { "epoch": 0.068525820904166, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5743, "step": 3254 }, { "epoch": 0.0685468798534297, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5772, "step": 3255 }, { "epoch": 0.06856793880269343, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5424, "step": 3256 }, { "epoch": 0.06858899775195716, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5404, "step": 3257 }, { "epoch": 0.06861005670122089, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6115, "step": 3258 }, { "epoch": 0.06863111565048462, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5763, "step": 3259 }, { "epoch": 0.06865217459974834, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5986, "step": 3260 }, { "epoch": 0.06867323354901207, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5683, "step": 3261 }, { "epoch": 0.0686942924982758, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.601, "step": 3262 }, { "epoch": 0.06871535144753953, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5778, "step": 3263 }, { "epoch": 0.06873641039680325, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5788, "step": 3264 }, { "epoch": 0.06875746934606698, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6188, "step": 3265 }, { "epoch": 0.06877852829533071, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5628, "step": 3266 }, { "epoch": 0.06879958724459444, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5833, "step": 3267 }, { "epoch": 0.06882064619385815, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5544, "step": 3268 }, { "epoch": 0.06884170514312188, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5928, "step": 3269 }, { "epoch": 0.0688627640923856, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5971, "step": 3270 }, { "epoch": 0.06888382304164933, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5722, "step": 3271 }, { "epoch": 0.06890488199091306, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5935, "step": 3272 }, { "epoch": 0.06892594094017679, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5724, "step": 3273 }, { "epoch": 0.06894699988944052, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5803, "step": 3274 }, { "epoch": 0.06896805883870424, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5884, "step": 3275 }, { "epoch": 0.06898911778796797, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6164, "step": 3276 }, { "epoch": 0.0690101767372317, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6018, "step": 3277 }, { "epoch": 0.06903123568649543, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5726, "step": 3278 }, { "epoch": 0.06905229463575915, "grad_norm": 0.189453125, "learning_rate": 0.0002, "loss": 1.5867, "step": 3279 }, { "epoch": 0.06907335358502288, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5925, "step": 3280 }, { "epoch": 0.0690944125342866, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5906, "step": 3281 }, { "epoch": 0.06911547148355032, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6063, "step": 3282 }, { "epoch": 0.06913653043281405, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6078, "step": 3283 }, { "epoch": 0.06915758938207778, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5895, "step": 3284 }, { "epoch": 0.0691786483313415, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5782, "step": 3285 }, { "epoch": 0.06919970728060523, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6235, "step": 3286 }, { "epoch": 0.06922076622986896, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.568, "step": 3287 }, { "epoch": 0.06924182517913269, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.611, "step": 3288 }, { "epoch": 0.06926288412839642, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5989, "step": 3289 }, { "epoch": 0.06928394307766014, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5946, "step": 3290 }, { "epoch": 0.06930500202692387, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5604, "step": 3291 }, { "epoch": 0.0693260609761876, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5668, "step": 3292 }, { "epoch": 0.06934711992545133, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5919, "step": 3293 }, { "epoch": 0.06936817887471505, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6296, "step": 3294 }, { "epoch": 0.06938923782397877, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5508, "step": 3295 }, { "epoch": 0.0694102967732425, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5985, "step": 3296 }, { "epoch": 0.06943135572250622, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6122, "step": 3297 }, { "epoch": 0.06945241467176995, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6016, "step": 3298 }, { "epoch": 0.06947347362103368, "grad_norm": 0.1962890625, "learning_rate": 0.0002, "loss": 1.561, "step": 3299 }, { "epoch": 0.0694945325702974, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5835, "step": 3300 }, { "epoch": 0.06951559151956113, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5935, "step": 3301 }, { "epoch": 0.06953665046882486, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.6008, "step": 3302 }, { "epoch": 0.06955770941808859, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5795, "step": 3303 }, { "epoch": 0.06957876836735231, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5838, "step": 3304 }, { "epoch": 0.06959982731661604, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6013, "step": 3305 }, { "epoch": 0.06962088626587977, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6131, "step": 3306 }, { "epoch": 0.0696419452151435, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5872, "step": 3307 }, { "epoch": 0.06966300416440721, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6032, "step": 3308 }, { "epoch": 0.06968406311367094, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5984, "step": 3309 }, { "epoch": 0.06970512206293467, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5868, "step": 3310 }, { "epoch": 0.0697261810121984, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5732, "step": 3311 }, { "epoch": 0.06974723996146212, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6163, "step": 3312 }, { "epoch": 0.06976829891072585, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.578, "step": 3313 }, { "epoch": 0.06978935785998958, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5921, "step": 3314 }, { "epoch": 0.0698104168092533, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5945, "step": 3315 }, { "epoch": 0.06983147575851703, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6089, "step": 3316 }, { "epoch": 0.06985253470778076, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5871, "step": 3317 }, { "epoch": 0.06987359365704449, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.565, "step": 3318 }, { "epoch": 0.06989465260630821, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6194, "step": 3319 }, { "epoch": 0.06991571155557194, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5778, "step": 3320 }, { "epoch": 0.06993677050483565, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5738, "step": 3321 }, { "epoch": 0.06995782945409938, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.627, "step": 3322 }, { "epoch": 0.06997888840336311, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5512, "step": 3323 }, { "epoch": 0.06999994735262684, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5877, "step": 3324 }, { "epoch": 0.07002100630189056, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5496, "step": 3325 }, { "epoch": 0.07004206525115429, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5985, "step": 3326 }, { "epoch": 0.07006312420041802, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5819, "step": 3327 }, { "epoch": 0.07008418314968175, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6326, "step": 3328 }, { "epoch": 0.07010524209894547, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5975, "step": 3329 }, { "epoch": 0.0701263010482092, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5582, "step": 3330 }, { "epoch": 0.07014735999747293, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.621, "step": 3331 }, { "epoch": 0.07016841894673666, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5931, "step": 3332 }, { "epoch": 0.07018947789600039, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5806, "step": 3333 }, { "epoch": 0.0702105368452641, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5703, "step": 3334 }, { "epoch": 0.07023159579452783, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5241, "step": 3335 }, { "epoch": 0.07025265474379155, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5873, "step": 3336 }, { "epoch": 0.07027371369305528, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6145, "step": 3337 }, { "epoch": 0.07029477264231901, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5453, "step": 3338 }, { "epoch": 0.07031583159158274, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6414, "step": 3339 }, { "epoch": 0.07033689054084646, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6024, "step": 3340 }, { "epoch": 0.07035794949011019, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5954, "step": 3341 }, { "epoch": 0.07037900843937392, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6043, "step": 3342 }, { "epoch": 0.07040006738863765, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5852, "step": 3343 }, { "epoch": 0.07042112633790137, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5989, "step": 3344 }, { "epoch": 0.0704421852871651, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5929, "step": 3345 }, { "epoch": 0.07046324423642883, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.591, "step": 3346 }, { "epoch": 0.07048430318569256, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5675, "step": 3347 }, { "epoch": 0.07050536213495627, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.6384, "step": 3348 }, { "epoch": 0.07052642108422, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6158, "step": 3349 }, { "epoch": 0.07054748003348373, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5776, "step": 3350 }, { "epoch": 0.07056853898274745, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5895, "step": 3351 }, { "epoch": 0.07058959793201118, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5845, "step": 3352 }, { "epoch": 0.07061065688127491, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5991, "step": 3353 }, { "epoch": 0.07063171583053864, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5643, "step": 3354 }, { "epoch": 0.07065277477980236, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5937, "step": 3355 }, { "epoch": 0.07067383372906609, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6088, "step": 3356 }, { "epoch": 0.07069489267832982, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5473, "step": 3357 }, { "epoch": 0.07071595162759355, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5495, "step": 3358 }, { "epoch": 0.07073701057685727, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5824, "step": 3359 }, { "epoch": 0.070758069526121, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5883, "step": 3360 }, { "epoch": 0.07077912847538471, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6201, "step": 3361 }, { "epoch": 0.07080018742464844, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6182, "step": 3362 }, { "epoch": 0.07082124637391217, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6024, "step": 3363 }, { "epoch": 0.0708423053231759, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5826, "step": 3364 }, { "epoch": 0.07086336427243962, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6057, "step": 3365 }, { "epoch": 0.07088442322170335, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5789, "step": 3366 }, { "epoch": 0.07090548217096708, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.605, "step": 3367 }, { "epoch": 0.07092654112023081, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6236, "step": 3368 }, { "epoch": 0.07094760006949453, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.58, "step": 3369 }, { "epoch": 0.07096865901875826, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.6024, "step": 3370 }, { "epoch": 0.07098971796802199, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.55, "step": 3371 }, { "epoch": 0.07101077691728572, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5934, "step": 3372 }, { "epoch": 0.07103183586654944, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5746, "step": 3373 }, { "epoch": 0.07105289481581316, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5806, "step": 3374 }, { "epoch": 0.07107395376507689, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5752, "step": 3375 }, { "epoch": 0.07109501271434061, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5422, "step": 3376 }, { "epoch": 0.07111607166360434, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5962, "step": 3377 }, { "epoch": 0.07113713061286807, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.558, "step": 3378 }, { "epoch": 0.0711581895621318, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5894, "step": 3379 }, { "epoch": 0.07117924851139552, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5858, "step": 3380 }, { "epoch": 0.07120030746065925, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5549, "step": 3381 }, { "epoch": 0.07122136640992298, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5861, "step": 3382 }, { "epoch": 0.0712424253591867, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5841, "step": 3383 }, { "epoch": 0.07126348430845043, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5844, "step": 3384 }, { "epoch": 0.07128454325771416, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5963, "step": 3385 }, { "epoch": 0.07130560220697789, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5755, "step": 3386 }, { "epoch": 0.0713266611562416, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6009, "step": 3387 }, { "epoch": 0.07134772010550533, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6011, "step": 3388 }, { "epoch": 0.07136877905476906, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5814, "step": 3389 }, { "epoch": 0.07138983800403279, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6063, "step": 3390 }, { "epoch": 0.07141089695329651, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.5543, "step": 3391 }, { "epoch": 0.07143195590256024, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5871, "step": 3392 }, { "epoch": 0.07145301485182397, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6066, "step": 3393 }, { "epoch": 0.0714740738010877, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6039, "step": 3394 }, { "epoch": 0.07149513275035142, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.609, "step": 3395 }, { "epoch": 0.07151619169961515, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.535, "step": 3396 }, { "epoch": 0.07153725064887888, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6119, "step": 3397 }, { "epoch": 0.0715583095981426, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5568, "step": 3398 }, { "epoch": 0.07157936854740633, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6563, "step": 3399 }, { "epoch": 0.07160042749667006, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5814, "step": 3400 }, { "epoch": 0.07162148644593377, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5728, "step": 3401 }, { "epoch": 0.0716425453951975, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5521, "step": 3402 }, { "epoch": 0.07166360434446123, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5606, "step": 3403 }, { "epoch": 0.07168466329372496, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5802, "step": 3404 }, { "epoch": 0.07170572224298868, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6006, "step": 3405 }, { "epoch": 0.07172678119225241, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5956, "step": 3406 }, { "epoch": 0.07174784014151614, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6143, "step": 3407 }, { "epoch": 0.07176889909077987, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5521, "step": 3408 }, { "epoch": 0.0717899580400436, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5941, "step": 3409 }, { "epoch": 0.07181101698930732, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5813, "step": 3410 }, { "epoch": 0.07183207593857105, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5846, "step": 3411 }, { "epoch": 0.07185313488783478, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5972, "step": 3412 }, { "epoch": 0.0718741938370985, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6199, "step": 3413 }, { "epoch": 0.07189525278636222, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6075, "step": 3414 }, { "epoch": 0.07191631173562595, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5843, "step": 3415 }, { "epoch": 0.07193737068488967, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5417, "step": 3416 }, { "epoch": 0.0719584296341534, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5617, "step": 3417 }, { "epoch": 0.07197948858341713, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5716, "step": 3418 }, { "epoch": 0.07200054753268086, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6202, "step": 3419 }, { "epoch": 0.07202160648194458, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6093, "step": 3420 }, { "epoch": 0.07204266543120831, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5577, "step": 3421 }, { "epoch": 0.07206372438047204, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.619, "step": 3422 }, { "epoch": 0.07208478332973577, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5855, "step": 3423 }, { "epoch": 0.0721058422789995, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5796, "step": 3424 }, { "epoch": 0.07212690122826322, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5688, "step": 3425 }, { "epoch": 0.07214796017752695, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5791, "step": 3426 }, { "epoch": 0.07216901912679066, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5866, "step": 3427 }, { "epoch": 0.07219007807605439, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5786, "step": 3428 }, { "epoch": 0.07221113702531812, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5435, "step": 3429 }, { "epoch": 0.07223219597458184, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.587, "step": 3430 }, { "epoch": 0.07225325492384557, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5762, "step": 3431 }, { "epoch": 0.0722743138731093, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.634, "step": 3432 }, { "epoch": 0.07229537282237303, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5788, "step": 3433 }, { "epoch": 0.07231643177163675, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5792, "step": 3434 }, { "epoch": 0.07233749072090048, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5701, "step": 3435 }, { "epoch": 0.07235854967016421, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6185, "step": 3436 }, { "epoch": 0.07237960861942794, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5845, "step": 3437 }, { "epoch": 0.07240066756869167, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5587, "step": 3438 }, { "epoch": 0.07242172651795539, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.6045, "step": 3439 }, { "epoch": 0.0724427854672191, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6058, "step": 3440 }, { "epoch": 0.07246384441648283, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5625, "step": 3441 }, { "epoch": 0.07248490336574656, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5778, "step": 3442 }, { "epoch": 0.07250596231501029, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5831, "step": 3443 }, { "epoch": 0.07252702126427402, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5807, "step": 3444 }, { "epoch": 0.07254808021353774, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5553, "step": 3445 }, { "epoch": 0.07256913916280147, "grad_norm": 0.1923828125, "learning_rate": 0.0002, "loss": 1.5624, "step": 3446 }, { "epoch": 0.0725901981120652, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5435, "step": 3447 }, { "epoch": 0.07261125706132893, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5991, "step": 3448 }, { "epoch": 0.07263231601059265, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6157, "step": 3449 }, { "epoch": 0.07265337495985638, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5991, "step": 3450 }, { "epoch": 0.07267443390912011, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5931, "step": 3451 }, { "epoch": 0.07269549285838384, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6002, "step": 3452 }, { "epoch": 0.07271655180764756, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5955, "step": 3453 }, { "epoch": 0.07273761075691128, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5912, "step": 3454 }, { "epoch": 0.072758669706175, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5777, "step": 3455 }, { "epoch": 0.07277972865543873, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5973, "step": 3456 }, { "epoch": 0.07280078760470246, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5901, "step": 3457 }, { "epoch": 0.07282184655396619, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6113, "step": 3458 }, { "epoch": 0.07284290550322992, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5924, "step": 3459 }, { "epoch": 0.07286396445249364, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6108, "step": 3460 }, { "epoch": 0.07288502340175737, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5556, "step": 3461 }, { "epoch": 0.0729060823510211, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.581, "step": 3462 }, { "epoch": 0.07292714130028483, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6052, "step": 3463 }, { "epoch": 0.07294820024954855, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6085, "step": 3464 }, { "epoch": 0.07296925919881228, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5569, "step": 3465 }, { "epoch": 0.07299031814807601, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.615, "step": 3466 }, { "epoch": 0.07301137709733972, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6093, "step": 3467 }, { "epoch": 0.07303243604660345, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.586, "step": 3468 }, { "epoch": 0.07305349499586718, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5704, "step": 3469 }, { "epoch": 0.0730745539451309, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6023, "step": 3470 }, { "epoch": 0.07309561289439463, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5889, "step": 3471 }, { "epoch": 0.07311667184365836, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5746, "step": 3472 }, { "epoch": 0.07313773079292209, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6073, "step": 3473 }, { "epoch": 0.07315878974218581, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6115, "step": 3474 }, { "epoch": 0.07317984869144954, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5391, "step": 3475 }, { "epoch": 0.07320090764071327, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5845, "step": 3476 }, { "epoch": 0.073221966589977, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6046, "step": 3477 }, { "epoch": 0.07324302553924072, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5583, "step": 3478 }, { "epoch": 0.07326408448850445, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.636, "step": 3479 }, { "epoch": 0.07328514343776817, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5836, "step": 3480 }, { "epoch": 0.0733062023870319, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.594, "step": 3481 }, { "epoch": 0.07332726133629562, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5718, "step": 3482 }, { "epoch": 0.07334832028555935, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5682, "step": 3483 }, { "epoch": 0.07336937923482308, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5765, "step": 3484 }, { "epoch": 0.0733904381840868, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6066, "step": 3485 }, { "epoch": 0.07341149713335053, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5663, "step": 3486 }, { "epoch": 0.07343255608261426, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.5665, "step": 3487 }, { "epoch": 0.07345361503187799, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6001, "step": 3488 }, { "epoch": 0.07347467398114171, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6255, "step": 3489 }, { "epoch": 0.07349573293040544, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5969, "step": 3490 }, { "epoch": 0.07351679187966917, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5601, "step": 3491 }, { "epoch": 0.0735378508289329, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5614, "step": 3492 }, { "epoch": 0.07355890977819661, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5852, "step": 3493 }, { "epoch": 0.07357996872746034, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5937, "step": 3494 }, { "epoch": 0.07360102767672407, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.596, "step": 3495 }, { "epoch": 0.07362208662598779, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6275, "step": 3496 }, { "epoch": 0.07364314557525152, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5641, "step": 3497 }, { "epoch": 0.07366420452451525, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6325, "step": 3498 }, { "epoch": 0.07368526347377898, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5639, "step": 3499 }, { "epoch": 0.0737063224230427, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5621, "step": 3500 }, { "epoch": 0.0737063224230427, "eval_loss": 2.018354892730713, "eval_runtime": 897.5659, "eval_samples_per_second": 68.853, "eval_steps_per_second": 2.152, "step": 3500 }, { "epoch": 0.07372738137230643, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6097, "step": 3501 }, { "epoch": 0.07374844032157016, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.587, "step": 3502 }, { "epoch": 0.07376949927083389, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6112, "step": 3503 }, { "epoch": 0.07379055822009761, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5799, "step": 3504 }, { "epoch": 0.07381161716936134, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6066, "step": 3505 }, { "epoch": 0.07383267611862507, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5433, "step": 3506 }, { "epoch": 0.07385373506788878, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5748, "step": 3507 }, { "epoch": 0.07387479401715251, "grad_norm": 0.19140625, "learning_rate": 0.0002, "loss": 1.6106, "step": 3508 }, { "epoch": 0.07389585296641624, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6073, "step": 3509 }, { "epoch": 0.07391691191567996, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5724, "step": 3510 }, { "epoch": 0.07393797086494369, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5596, "step": 3511 }, { "epoch": 0.07395902981420742, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5845, "step": 3512 }, { "epoch": 0.07398008876347115, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6012, "step": 3513 }, { "epoch": 0.07400114771273487, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.573, "step": 3514 }, { "epoch": 0.0740222066619986, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6054, "step": 3515 }, { "epoch": 0.07404326561126233, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5982, "step": 3516 }, { "epoch": 0.07406432456052606, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6318, "step": 3517 }, { "epoch": 0.07408538350978978, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6141, "step": 3518 }, { "epoch": 0.07410644245905351, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6044, "step": 3519 }, { "epoch": 0.07412750140831723, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5955, "step": 3520 }, { "epoch": 0.07414856035758095, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6132, "step": 3521 }, { "epoch": 0.07416961930684468, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.634, "step": 3522 }, { "epoch": 0.07419067825610841, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6095, "step": 3523 }, { "epoch": 0.07421173720537214, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6186, "step": 3524 }, { "epoch": 0.07423279615463586, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5757, "step": 3525 }, { "epoch": 0.07425385510389959, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5993, "step": 3526 }, { "epoch": 0.07427491405316332, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5753, "step": 3527 }, { "epoch": 0.07429597300242705, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6104, "step": 3528 }, { "epoch": 0.07431703195169077, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5885, "step": 3529 }, { "epoch": 0.0743380909009545, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6155, "step": 3530 }, { "epoch": 0.07435914985021823, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5737, "step": 3531 }, { "epoch": 0.07438020879948196, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5877, "step": 3532 }, { "epoch": 0.07440126774874567, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.581, "step": 3533 }, { "epoch": 0.0744223266980094, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.6153, "step": 3534 }, { "epoch": 0.07444338564727312, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5776, "step": 3535 }, { "epoch": 0.07446444459653685, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6121, "step": 3536 }, { "epoch": 0.07448550354580058, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5517, "step": 3537 }, { "epoch": 0.07450656249506431, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5706, "step": 3538 }, { "epoch": 0.07452762144432803, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5925, "step": 3539 }, { "epoch": 0.07454868039359176, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5817, "step": 3540 }, { "epoch": 0.07456973934285549, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5599, "step": 3541 }, { "epoch": 0.07459079829211922, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6037, "step": 3542 }, { "epoch": 0.07461185724138295, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6374, "step": 3543 }, { "epoch": 0.07463291619064667, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6131, "step": 3544 }, { "epoch": 0.0746539751399104, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5945, "step": 3545 }, { "epoch": 0.07467503408917411, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5538, "step": 3546 }, { "epoch": 0.07469609303843784, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6129, "step": 3547 }, { "epoch": 0.07471715198770157, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5956, "step": 3548 }, { "epoch": 0.0747382109369653, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5839, "step": 3549 }, { "epoch": 0.07475926988622902, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5767, "step": 3550 }, { "epoch": 0.07478032883549275, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5561, "step": 3551 }, { "epoch": 0.07480138778475648, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5828, "step": 3552 }, { "epoch": 0.0748224467340202, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5926, "step": 3553 }, { "epoch": 0.07484350568328393, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5643, "step": 3554 }, { "epoch": 0.07486456463254766, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5759, "step": 3555 }, { "epoch": 0.07488562358181139, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5893, "step": 3556 }, { "epoch": 0.07490668253107512, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5945, "step": 3557 }, { "epoch": 0.07492774148033884, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5925, "step": 3558 }, { "epoch": 0.07494880042960257, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5613, "step": 3559 }, { "epoch": 0.07496985937886629, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5672, "step": 3560 }, { "epoch": 0.07499091832813001, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5701, "step": 3561 }, { "epoch": 0.07501197727739374, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6199, "step": 3562 }, { "epoch": 0.07503303622665747, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6131, "step": 3563 }, { "epoch": 0.0750540951759212, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.5747, "step": 3564 }, { "epoch": 0.07507515412518492, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5847, "step": 3565 }, { "epoch": 0.07509621307444865, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6031, "step": 3566 }, { "epoch": 0.07511727202371238, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5539, "step": 3567 }, { "epoch": 0.0751383309729761, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5912, "step": 3568 }, { "epoch": 0.07515938992223983, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5981, "step": 3569 }, { "epoch": 0.07518044887150356, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5391, "step": 3570 }, { "epoch": 0.07520150782076729, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5792, "step": 3571 }, { "epoch": 0.07522256677003102, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5544, "step": 3572 }, { "epoch": 0.07524362571929473, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5911, "step": 3573 }, { "epoch": 0.07526468466855846, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6121, "step": 3574 }, { "epoch": 0.07528574361782218, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5803, "step": 3575 }, { "epoch": 0.07530680256708591, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5749, "step": 3576 }, { "epoch": 0.07532786151634964, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5404, "step": 3577 }, { "epoch": 0.07534892046561337, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.536, "step": 3578 }, { "epoch": 0.0753699794148771, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5996, "step": 3579 }, { "epoch": 0.07539103836414082, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5857, "step": 3580 }, { "epoch": 0.07541209731340455, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5584, "step": 3581 }, { "epoch": 0.07543315626266828, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5525, "step": 3582 }, { "epoch": 0.075454215211932, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5753, "step": 3583 }, { "epoch": 0.07547527416119573, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6083, "step": 3584 }, { "epoch": 0.07549633311045946, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6094, "step": 3585 }, { "epoch": 0.07551739205972317, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5954, "step": 3586 }, { "epoch": 0.0755384510089869, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5987, "step": 3587 }, { "epoch": 0.07555950995825063, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5873, "step": 3588 }, { "epoch": 0.07558056890751436, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6274, "step": 3589 }, { "epoch": 0.07560162785677808, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.551, "step": 3590 }, { "epoch": 0.07562268680604181, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6023, "step": 3591 }, { "epoch": 0.07564374575530554, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5997, "step": 3592 }, { "epoch": 0.07566480470456927, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.5825, "step": 3593 }, { "epoch": 0.075685863653833, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.5779, "step": 3594 }, { "epoch": 0.07570692260309672, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5499, "step": 3595 }, { "epoch": 0.07572798155236045, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6006, "step": 3596 }, { "epoch": 0.07574904050162418, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5951, "step": 3597 }, { "epoch": 0.0757700994508879, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5702, "step": 3598 }, { "epoch": 0.07579115840015162, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5896, "step": 3599 }, { "epoch": 0.07581221734941535, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5758, "step": 3600 }, { "epoch": 0.07583327629867907, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5685, "step": 3601 }, { "epoch": 0.0758543352479428, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.6113, "step": 3602 }, { "epoch": 0.07587539419720653, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5567, "step": 3603 }, { "epoch": 0.07589645314647026, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5755, "step": 3604 }, { "epoch": 0.07591751209573398, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5833, "step": 3605 }, { "epoch": 0.07593857104499771, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5932, "step": 3606 }, { "epoch": 0.07595962999426144, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5823, "step": 3607 }, { "epoch": 0.07598068894352517, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.576, "step": 3608 }, { "epoch": 0.07600174789278889, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5927, "step": 3609 }, { "epoch": 0.07602280684205262, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5696, "step": 3610 }, { "epoch": 0.07604386579131635, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6064, "step": 3611 }, { "epoch": 0.07606492474058008, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6075, "step": 3612 }, { "epoch": 0.07608598368984379, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5595, "step": 3613 }, { "epoch": 0.07610704263910752, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5595, "step": 3614 }, { "epoch": 0.07612810158837124, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5954, "step": 3615 }, { "epoch": 0.07614916053763497, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6064, "step": 3616 }, { "epoch": 0.0761702194868987, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5926, "step": 3617 }, { "epoch": 0.07619127843616243, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5642, "step": 3618 }, { "epoch": 0.07621233738542615, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6129, "step": 3619 }, { "epoch": 0.07623339633468988, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5795, "step": 3620 }, { "epoch": 0.07625445528395361, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5812, "step": 3621 }, { "epoch": 0.07627551423321734, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5679, "step": 3622 }, { "epoch": 0.07629657318248106, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5766, "step": 3623 }, { "epoch": 0.07631763213174479, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.608, "step": 3624 }, { "epoch": 0.07633869108100852, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.591, "step": 3625 }, { "epoch": 0.07635975003027223, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5778, "step": 3626 }, { "epoch": 0.07638080897953596, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5805, "step": 3627 }, { "epoch": 0.07640186792879969, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5558, "step": 3628 }, { "epoch": 0.07642292687806342, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.605, "step": 3629 }, { "epoch": 0.07644398582732714, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5889, "step": 3630 }, { "epoch": 0.07646504477659087, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.601, "step": 3631 }, { "epoch": 0.0764861037258546, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5688, "step": 3632 }, { "epoch": 0.07650716267511833, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5878, "step": 3633 }, { "epoch": 0.07652822162438205, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5862, "step": 3634 }, { "epoch": 0.07654928057364578, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5576, "step": 3635 }, { "epoch": 0.07657033952290951, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5689, "step": 3636 }, { "epoch": 0.07659139847217324, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5765, "step": 3637 }, { "epoch": 0.07661245742143696, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.596, "step": 3638 }, { "epoch": 0.07663351637070068, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6129, "step": 3639 }, { "epoch": 0.0766545753199644, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5989, "step": 3640 }, { "epoch": 0.07667563426922813, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6102, "step": 3641 }, { "epoch": 0.07669669321849186, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5825, "step": 3642 }, { "epoch": 0.07671775216775559, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5988, "step": 3643 }, { "epoch": 0.07673881111701931, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6051, "step": 3644 }, { "epoch": 0.07675987006628304, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5876, "step": 3645 }, { "epoch": 0.07678092901554677, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5593, "step": 3646 }, { "epoch": 0.0768019879648105, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5714, "step": 3647 }, { "epoch": 0.07682304691407423, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6031, "step": 3648 }, { "epoch": 0.07684410586333795, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5732, "step": 3649 }, { "epoch": 0.07686516481260168, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5712, "step": 3650 }, { "epoch": 0.07688622376186541, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5885, "step": 3651 }, { "epoch": 0.07690728271112912, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.633, "step": 3652 }, { "epoch": 0.07692834166039285, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5984, "step": 3653 }, { "epoch": 0.07694940060965658, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6018, "step": 3654 }, { "epoch": 0.0769704595589203, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6076, "step": 3655 }, { "epoch": 0.07699151850818403, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5681, "step": 3656 }, { "epoch": 0.07701257745744776, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.59, "step": 3657 }, { "epoch": 0.07703363640671149, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.575, "step": 3658 }, { "epoch": 0.07705469535597521, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5718, "step": 3659 }, { "epoch": 0.07707575430523894, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5835, "step": 3660 }, { "epoch": 0.07709681325450267, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5651, "step": 3661 }, { "epoch": 0.0771178722037664, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5599, "step": 3662 }, { "epoch": 0.07713893115303012, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5873, "step": 3663 }, { "epoch": 0.07715999010229385, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6038, "step": 3664 }, { "epoch": 0.07718104905155758, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5761, "step": 3665 }, { "epoch": 0.07720210800082129, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6053, "step": 3666 }, { "epoch": 0.07722316695008502, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5485, "step": 3667 }, { "epoch": 0.07724422589934875, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5739, "step": 3668 }, { "epoch": 0.07726528484861248, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5713, "step": 3669 }, { "epoch": 0.0772863437978762, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6199, "step": 3670 }, { "epoch": 0.07730740274713993, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6088, "step": 3671 }, { "epoch": 0.07732846169640366, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5761, "step": 3672 }, { "epoch": 0.07734952064566739, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6056, "step": 3673 }, { "epoch": 0.07737057959493111, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5823, "step": 3674 }, { "epoch": 0.07739163854419484, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5945, "step": 3675 }, { "epoch": 0.07741269749345857, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.6409, "step": 3676 }, { "epoch": 0.0774337564427223, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6021, "step": 3677 }, { "epoch": 0.07745481539198602, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5784, "step": 3678 }, { "epoch": 0.07747587434124974, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5877, "step": 3679 }, { "epoch": 0.07749693329051346, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5781, "step": 3680 }, { "epoch": 0.07751799223977719, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5993, "step": 3681 }, { "epoch": 0.07753905118904092, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5975, "step": 3682 }, { "epoch": 0.07756011013830465, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6273, "step": 3683 }, { "epoch": 0.07758116908756837, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5863, "step": 3684 }, { "epoch": 0.0776022280368321, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5913, "step": 3685 }, { "epoch": 0.07762328698609583, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5759, "step": 3686 }, { "epoch": 0.07764434593535956, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5976, "step": 3687 }, { "epoch": 0.07766540488462328, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.591, "step": 3688 }, { "epoch": 0.07768646383388701, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5528, "step": 3689 }, { "epoch": 0.07770752278315074, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5809, "step": 3690 }, { "epoch": 0.07772858173241447, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5863, "step": 3691 }, { "epoch": 0.07774964068167818, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5802, "step": 3692 }, { "epoch": 0.07777069963094191, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6001, "step": 3693 }, { "epoch": 0.07779175858020564, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6108, "step": 3694 }, { "epoch": 0.07781281752946936, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5979, "step": 3695 }, { "epoch": 0.07783387647873309, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6082, "step": 3696 }, { "epoch": 0.07785493542799682, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6079, "step": 3697 }, { "epoch": 0.07787599437726055, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.5785, "step": 3698 }, { "epoch": 0.07789705332652427, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5612, "step": 3699 }, { "epoch": 0.077918112275788, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5838, "step": 3700 }, { "epoch": 0.07793917122505173, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5769, "step": 3701 }, { "epoch": 0.07796023017431546, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6104, "step": 3702 }, { "epoch": 0.07798128912357918, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.576, "step": 3703 }, { "epoch": 0.07800234807284291, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6191, "step": 3704 }, { "epoch": 0.07802340702210664, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5742, "step": 3705 }, { "epoch": 0.07804446597137035, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5669, "step": 3706 }, { "epoch": 0.07806552492063408, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5614, "step": 3707 }, { "epoch": 0.07808658386989781, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6113, "step": 3708 }, { "epoch": 0.07810764281916154, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6016, "step": 3709 }, { "epoch": 0.07812870176842526, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5706, "step": 3710 }, { "epoch": 0.07814976071768899, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5678, "step": 3711 }, { "epoch": 0.07817081966695272, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5952, "step": 3712 }, { "epoch": 0.07819187861621645, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6148, "step": 3713 }, { "epoch": 0.07821293756548017, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5946, "step": 3714 }, { "epoch": 0.0782339965147439, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5859, "step": 3715 }, { "epoch": 0.07825505546400763, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.578, "step": 3716 }, { "epoch": 0.07827611441327136, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5699, "step": 3717 }, { "epoch": 0.07829717336253508, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5702, "step": 3718 }, { "epoch": 0.0783182323117988, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5995, "step": 3719 }, { "epoch": 0.07833929126106252, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5918, "step": 3720 }, { "epoch": 0.07836035021032625, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5779, "step": 3721 }, { "epoch": 0.07838140915958998, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5815, "step": 3722 }, { "epoch": 0.0784024681088537, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5629, "step": 3723 }, { "epoch": 0.07842352705811743, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.616, "step": 3724 }, { "epoch": 0.07844458600738116, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6054, "step": 3725 }, { "epoch": 0.07846564495664489, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5746, "step": 3726 }, { "epoch": 0.07848670390590862, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5866, "step": 3727 }, { "epoch": 0.07850776285517234, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5859, "step": 3728 }, { "epoch": 0.07852882180443607, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5515, "step": 3729 }, { "epoch": 0.0785498807536998, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.585, "step": 3730 }, { "epoch": 0.07857093970296353, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5717, "step": 3731 }, { "epoch": 0.07859199865222724, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5535, "step": 3732 }, { "epoch": 0.07861305760149097, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.596, "step": 3733 }, { "epoch": 0.0786341165507547, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.58, "step": 3734 }, { "epoch": 0.07865517550001842, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5906, "step": 3735 }, { "epoch": 0.07867623444928215, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6056, "step": 3736 }, { "epoch": 0.07869729339854588, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5606, "step": 3737 }, { "epoch": 0.0787183523478096, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5969, "step": 3738 }, { "epoch": 0.07873941129707333, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5736, "step": 3739 }, { "epoch": 0.07876047024633706, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5978, "step": 3740 }, { "epoch": 0.07878152919560079, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6203, "step": 3741 }, { "epoch": 0.07880258814486452, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5674, "step": 3742 }, { "epoch": 0.07882364709412824, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.576, "step": 3743 }, { "epoch": 0.07884470604339197, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5857, "step": 3744 }, { "epoch": 0.07886576499265568, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5984, "step": 3745 }, { "epoch": 0.07888682394191941, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5794, "step": 3746 }, { "epoch": 0.07890788289118314, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5576, "step": 3747 }, { "epoch": 0.07892894184044687, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5756, "step": 3748 }, { "epoch": 0.0789500007897106, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5734, "step": 3749 }, { "epoch": 0.07897105973897432, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5882, "step": 3750 }, { "epoch": 0.07899211868823805, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5962, "step": 3751 }, { "epoch": 0.07901317763750178, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5565, "step": 3752 }, { "epoch": 0.0790342365867655, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5731, "step": 3753 }, { "epoch": 0.07905529553602923, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5806, "step": 3754 }, { "epoch": 0.07907635448529296, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5807, "step": 3755 }, { "epoch": 0.07909741343455669, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5608, "step": 3756 }, { "epoch": 0.07911847238382042, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5828, "step": 3757 }, { "epoch": 0.07913953133308414, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6233, "step": 3758 }, { "epoch": 0.07916059028234786, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.57, "step": 3759 }, { "epoch": 0.07918164923161158, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.573, "step": 3760 }, { "epoch": 0.07920270818087531, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6179, "step": 3761 }, { "epoch": 0.07922376713013904, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6117, "step": 3762 }, { "epoch": 0.07924482607940277, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5659, "step": 3763 }, { "epoch": 0.0792658850286665, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5852, "step": 3764 }, { "epoch": 0.07928694397793022, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6018, "step": 3765 }, { "epoch": 0.07930800292719395, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5907, "step": 3766 }, { "epoch": 0.07932906187645768, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5704, "step": 3767 }, { "epoch": 0.0793501208257214, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6199, "step": 3768 }, { "epoch": 0.07937117977498513, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5887, "step": 3769 }, { "epoch": 0.07939223872424886, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5995, "step": 3770 }, { "epoch": 0.07941329767351259, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5907, "step": 3771 }, { "epoch": 0.0794343566227763, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5937, "step": 3772 }, { "epoch": 0.07945541557204003, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.6053, "step": 3773 }, { "epoch": 0.07947647452130376, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.5964, "step": 3774 }, { "epoch": 0.07949753347056748, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5861, "step": 3775 }, { "epoch": 0.07951859241983121, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6002, "step": 3776 }, { "epoch": 0.07953965136909494, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6099, "step": 3777 }, { "epoch": 0.07956071031835867, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6108, "step": 3778 }, { "epoch": 0.0795817692676224, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5484, "step": 3779 }, { "epoch": 0.07960282821688612, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5812, "step": 3780 }, { "epoch": 0.07962388716614985, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5864, "step": 3781 }, { "epoch": 0.07964494611541358, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5866, "step": 3782 }, { "epoch": 0.0796660050646773, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5994, "step": 3783 }, { "epoch": 0.07968706401394103, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6268, "step": 3784 }, { "epoch": 0.07970812296320474, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5903, "step": 3785 }, { "epoch": 0.07972918191246847, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5424, "step": 3786 }, { "epoch": 0.0797502408617322, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5644, "step": 3787 }, { "epoch": 0.07977129981099593, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5977, "step": 3788 }, { "epoch": 0.07979235876025965, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5727, "step": 3789 }, { "epoch": 0.07981341770952338, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.5634, "step": 3790 }, { "epoch": 0.07983447665878711, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5651, "step": 3791 }, { "epoch": 0.07985553560805084, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6226, "step": 3792 }, { "epoch": 0.07987659455731456, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5542, "step": 3793 }, { "epoch": 0.07989765350657829, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5956, "step": 3794 }, { "epoch": 0.07991871245584202, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5789, "step": 3795 }, { "epoch": 0.07993977140510575, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5623, "step": 3796 }, { "epoch": 0.07996083035436947, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6089, "step": 3797 }, { "epoch": 0.07998188930363319, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5873, "step": 3798 }, { "epoch": 0.08000294825289692, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5929, "step": 3799 }, { "epoch": 0.08002400720216064, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5595, "step": 3800 }, { "epoch": 0.08004506615142437, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.586, "step": 3801 }, { "epoch": 0.0800661251006881, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5884, "step": 3802 }, { "epoch": 0.08008718404995183, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5775, "step": 3803 }, { "epoch": 0.08010824299921555, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5825, "step": 3804 }, { "epoch": 0.08012930194847928, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.5689, "step": 3805 }, { "epoch": 0.08015036089774301, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6086, "step": 3806 }, { "epoch": 0.08017141984700674, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5863, "step": 3807 }, { "epoch": 0.08019247879627046, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5761, "step": 3808 }, { "epoch": 0.08021353774553419, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5858, "step": 3809 }, { "epoch": 0.08023459669479792, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5851, "step": 3810 }, { "epoch": 0.08025565564406165, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5996, "step": 3811 }, { "epoch": 0.08027671459332536, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5653, "step": 3812 }, { "epoch": 0.08029777354258909, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6083, "step": 3813 }, { "epoch": 0.08031883249185282, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5893, "step": 3814 }, { "epoch": 0.08033989144111654, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5803, "step": 3815 }, { "epoch": 0.08036095039038027, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.594, "step": 3816 }, { "epoch": 0.080382009339644, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5744, "step": 3817 }, { "epoch": 0.08040306828890773, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5767, "step": 3818 }, { "epoch": 0.08042412723817145, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.5805, "step": 3819 }, { "epoch": 0.08044518618743518, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5754, "step": 3820 }, { "epoch": 0.08046624513669891, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5991, "step": 3821 }, { "epoch": 0.08048730408596264, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6183, "step": 3822 }, { "epoch": 0.08050836303522636, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5916, "step": 3823 }, { "epoch": 0.08052942198449009, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6063, "step": 3824 }, { "epoch": 0.0805504809337538, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5825, "step": 3825 }, { "epoch": 0.08057153988301753, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5571, "step": 3826 }, { "epoch": 0.08059259883228126, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6108, "step": 3827 }, { "epoch": 0.08061365778154499, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5882, "step": 3828 }, { "epoch": 0.08063471673080871, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5555, "step": 3829 }, { "epoch": 0.08065577568007244, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6218, "step": 3830 }, { "epoch": 0.08067683462933617, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6215, "step": 3831 }, { "epoch": 0.0806978935785999, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6023, "step": 3832 }, { "epoch": 0.08071895252786362, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5751, "step": 3833 }, { "epoch": 0.08074001147712735, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5808, "step": 3834 }, { "epoch": 0.08076107042639108, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5703, "step": 3835 }, { "epoch": 0.08078212937565481, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5615, "step": 3836 }, { "epoch": 0.08080318832491853, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.6016, "step": 3837 }, { "epoch": 0.08082424727418225, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5814, "step": 3838 }, { "epoch": 0.08084530622344598, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5893, "step": 3839 }, { "epoch": 0.0808663651727097, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5915, "step": 3840 }, { "epoch": 0.08088742412197343, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5764, "step": 3841 }, { "epoch": 0.08090848307123716, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5761, "step": 3842 }, { "epoch": 0.08092954202050089, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5667, "step": 3843 }, { "epoch": 0.08095060096976461, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6, "step": 3844 }, { "epoch": 0.08097165991902834, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.6131, "step": 3845 }, { "epoch": 0.08099271886829207, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5961, "step": 3846 }, { "epoch": 0.0810137778175558, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5825, "step": 3847 }, { "epoch": 0.08103483676681952, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.5858, "step": 3848 }, { "epoch": 0.08105589571608325, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5835, "step": 3849 }, { "epoch": 0.08107695466534698, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5482, "step": 3850 }, { "epoch": 0.08109801361461069, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6136, "step": 3851 }, { "epoch": 0.08111907256387442, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.584, "step": 3852 }, { "epoch": 0.08114013151313815, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5347, "step": 3853 }, { "epoch": 0.08116119046240187, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5785, "step": 3854 }, { "epoch": 0.0811822494116656, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5818, "step": 3855 }, { "epoch": 0.08120330836092933, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.572, "step": 3856 }, { "epoch": 0.08122436731019306, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5732, "step": 3857 }, { "epoch": 0.08124542625945678, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5929, "step": 3858 }, { "epoch": 0.08126648520872051, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.587, "step": 3859 }, { "epoch": 0.08128754415798424, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5869, "step": 3860 }, { "epoch": 0.08130860310724797, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5925, "step": 3861 }, { "epoch": 0.0813296620565117, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5722, "step": 3862 }, { "epoch": 0.08135072100577542, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5834, "step": 3863 }, { "epoch": 0.08137177995503915, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6138, "step": 3864 }, { "epoch": 0.08139283890430286, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5931, "step": 3865 }, { "epoch": 0.08141389785356659, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5812, "step": 3866 }, { "epoch": 0.08143495680283032, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5822, "step": 3867 }, { "epoch": 0.08145601575209405, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6299, "step": 3868 }, { "epoch": 0.08147707470135777, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6222, "step": 3869 }, { "epoch": 0.0814981336506215, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.616, "step": 3870 }, { "epoch": 0.08151919259988523, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5648, "step": 3871 }, { "epoch": 0.08154025154914896, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.587, "step": 3872 }, { "epoch": 0.08156131049841268, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.591, "step": 3873 }, { "epoch": 0.08158236944767641, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.545, "step": 3874 }, { "epoch": 0.08160342839694014, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6478, "step": 3875 }, { "epoch": 0.08162448734620387, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.574, "step": 3876 }, { "epoch": 0.0816455462954676, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5823, "step": 3877 }, { "epoch": 0.08166660524473131, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5827, "step": 3878 }, { "epoch": 0.08168766419399504, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5786, "step": 3879 }, { "epoch": 0.08170872314325876, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.5983, "step": 3880 }, { "epoch": 0.08172978209252249, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5559, "step": 3881 }, { "epoch": 0.08175084104178622, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5897, "step": 3882 }, { "epoch": 0.08177189999104995, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5838, "step": 3883 }, { "epoch": 0.08179295894031367, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5518, "step": 3884 }, { "epoch": 0.0818140178895774, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5711, "step": 3885 }, { "epoch": 0.08183507683884113, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5927, "step": 3886 }, { "epoch": 0.08185613578810486, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5575, "step": 3887 }, { "epoch": 0.08187719473736858, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5799, "step": 3888 }, { "epoch": 0.08189825368663231, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.554, "step": 3889 }, { "epoch": 0.08191931263589604, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6094, "step": 3890 }, { "epoch": 0.08194037158515975, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5863, "step": 3891 }, { "epoch": 0.08196143053442348, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5628, "step": 3892 }, { "epoch": 0.08198248948368721, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6101, "step": 3893 }, { "epoch": 0.08200354843295093, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5377, "step": 3894 }, { "epoch": 0.08202460738221466, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5739, "step": 3895 }, { "epoch": 0.08204566633147839, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6073, "step": 3896 }, { "epoch": 0.08206672528074212, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.594, "step": 3897 }, { "epoch": 0.08208778423000584, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5833, "step": 3898 }, { "epoch": 0.08210884317926957, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6135, "step": 3899 }, { "epoch": 0.0821299021285333, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5964, "step": 3900 }, { "epoch": 0.08215096107779703, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5941, "step": 3901 }, { "epoch": 0.08217202002706075, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6095, "step": 3902 }, { "epoch": 0.08219307897632448, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6087, "step": 3903 }, { "epoch": 0.0822141379255882, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5702, "step": 3904 }, { "epoch": 0.08223519687485192, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5922, "step": 3905 }, { "epoch": 0.08225625582411565, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5749, "step": 3906 }, { "epoch": 0.08227731477337938, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5844, "step": 3907 }, { "epoch": 0.0822983737226431, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5683, "step": 3908 }, { "epoch": 0.08231943267190683, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5987, "step": 3909 }, { "epoch": 0.08234049162117056, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6063, "step": 3910 }, { "epoch": 0.08236155057043429, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.572, "step": 3911 }, { "epoch": 0.08238260951969802, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5661, "step": 3912 }, { "epoch": 0.08240366846896174, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6054, "step": 3913 }, { "epoch": 0.08242472741822547, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5797, "step": 3914 }, { "epoch": 0.0824457863674892, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5874, "step": 3915 }, { "epoch": 0.08246684531675293, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5916, "step": 3916 }, { "epoch": 0.08248790426601665, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6077, "step": 3917 }, { "epoch": 0.08250896321528037, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.593, "step": 3918 }, { "epoch": 0.0825300221645441, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5967, "step": 3919 }, { "epoch": 0.08255108111380782, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6174, "step": 3920 }, { "epoch": 0.08257214006307155, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5941, "step": 3921 }, { "epoch": 0.08259319901233528, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5828, "step": 3922 }, { "epoch": 0.082614257961599, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5557, "step": 3923 }, { "epoch": 0.08263531691086273, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6207, "step": 3924 }, { "epoch": 0.08265637586012646, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5856, "step": 3925 }, { "epoch": 0.08267743480939019, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5842, "step": 3926 }, { "epoch": 0.08269849375865392, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5838, "step": 3927 }, { "epoch": 0.08271955270791764, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5953, "step": 3928 }, { "epoch": 0.08274061165718137, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.593, "step": 3929 }, { "epoch": 0.0827616706064451, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5751, "step": 3930 }, { "epoch": 0.08278272955570881, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5749, "step": 3931 }, { "epoch": 0.08280378850497254, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.5896, "step": 3932 }, { "epoch": 0.08282484745423627, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6058, "step": 3933 }, { "epoch": 0.0828459064035, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6298, "step": 3934 }, { "epoch": 0.08286696535276372, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5974, "step": 3935 }, { "epoch": 0.08288802430202745, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5773, "step": 3936 }, { "epoch": 0.08290908325129118, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5909, "step": 3937 }, { "epoch": 0.0829301422005549, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5679, "step": 3938 }, { "epoch": 0.08295120114981863, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6073, "step": 3939 }, { "epoch": 0.08297226009908236, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5631, "step": 3940 }, { "epoch": 0.08299331904834609, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5829, "step": 3941 }, { "epoch": 0.08301437799760981, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5922, "step": 3942 }, { "epoch": 0.08303543694687354, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5625, "step": 3943 }, { "epoch": 0.08305649589613726, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6263, "step": 3944 }, { "epoch": 0.08307755484540098, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6159, "step": 3945 }, { "epoch": 0.08309861379466471, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5987, "step": 3946 }, { "epoch": 0.08311967274392844, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5685, "step": 3947 }, { "epoch": 0.08314073169319217, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5815, "step": 3948 }, { "epoch": 0.0831617906424559, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5767, "step": 3949 }, { "epoch": 0.08318284959171962, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6038, "step": 3950 }, { "epoch": 0.08320390854098335, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5815, "step": 3951 }, { "epoch": 0.08322496749024708, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.5727, "step": 3952 }, { "epoch": 0.0832460264395108, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5769, "step": 3953 }, { "epoch": 0.08326708538877453, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6272, "step": 3954 }, { "epoch": 0.08328814433803826, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5731, "step": 3955 }, { "epoch": 0.08330920328730199, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5644, "step": 3956 }, { "epoch": 0.0833302622365657, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5596, "step": 3957 }, { "epoch": 0.08335132118582943, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6111, "step": 3958 }, { "epoch": 0.08337238013509315, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6061, "step": 3959 }, { "epoch": 0.08339343908435688, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.581, "step": 3960 }, { "epoch": 0.08341449803362061, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6083, "step": 3961 }, { "epoch": 0.08343555698288434, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5434, "step": 3962 }, { "epoch": 0.08345661593214806, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5932, "step": 3963 }, { "epoch": 0.08347767488141179, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5912, "step": 3964 }, { "epoch": 0.08349873383067552, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5998, "step": 3965 }, { "epoch": 0.08351979277993925, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5314, "step": 3966 }, { "epoch": 0.08354085172920298, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.6056, "step": 3967 }, { "epoch": 0.0835619106784667, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5697, "step": 3968 }, { "epoch": 0.08358296962773043, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5678, "step": 3969 }, { "epoch": 0.08360402857699416, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6399, "step": 3970 }, { "epoch": 0.08362508752625787, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5924, "step": 3971 }, { "epoch": 0.0836461464755216, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6072, "step": 3972 }, { "epoch": 0.08366720542478533, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.6014, "step": 3973 }, { "epoch": 0.08368826437404905, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5933, "step": 3974 }, { "epoch": 0.08370932332331278, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5721, "step": 3975 }, { "epoch": 0.08373038227257651, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5739, "step": 3976 }, { "epoch": 0.08375144122184024, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5979, "step": 3977 }, { "epoch": 0.08377250017110396, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6014, "step": 3978 }, { "epoch": 0.08379355912036769, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5868, "step": 3979 }, { "epoch": 0.08381461806963142, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5782, "step": 3980 }, { "epoch": 0.08383567701889515, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5918, "step": 3981 }, { "epoch": 0.08385673596815887, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5816, "step": 3982 }, { "epoch": 0.0838777949174226, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5691, "step": 3983 }, { "epoch": 0.08389885386668632, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5709, "step": 3984 }, { "epoch": 0.08391991281595004, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6117, "step": 3985 }, { "epoch": 0.08394097176521377, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5888, "step": 3986 }, { "epoch": 0.0839620307144775, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5504, "step": 3987 }, { "epoch": 0.08398308966374123, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5408, "step": 3988 }, { "epoch": 0.08400414861300495, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5982, "step": 3989 }, { "epoch": 0.08402520756226868, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5785, "step": 3990 }, { "epoch": 0.08404626651153241, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5939, "step": 3991 }, { "epoch": 0.08406732546079614, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5608, "step": 3992 }, { "epoch": 0.08408838441005986, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.5855, "step": 3993 }, { "epoch": 0.08410944335932359, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5583, "step": 3994 }, { "epoch": 0.08413050230858732, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5972, "step": 3995 }, { "epoch": 0.08415156125785105, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.5906, "step": 3996 }, { "epoch": 0.08417262020711476, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5982, "step": 3997 }, { "epoch": 0.08419367915637849, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5777, "step": 3998 }, { "epoch": 0.08421473810564221, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5935, "step": 3999 }, { "epoch": 0.08423579705490594, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.56, "step": 4000 }, { "epoch": 0.08423579705490594, "eval_loss": 2.1100332736968994, "eval_runtime": 990.4764, "eval_samples_per_second": 62.394, "eval_steps_per_second": 1.951, "step": 4000 }, { "epoch": 0.08425685600416967, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5891, "step": 4001 }, { "epoch": 0.0842779149534334, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6125, "step": 4002 }, { "epoch": 0.08429897390269712, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5656, "step": 4003 }, { "epoch": 0.08432003285196085, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5996, "step": 4004 }, { "epoch": 0.08434109180122458, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5828, "step": 4005 }, { "epoch": 0.08436215075048831, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5553, "step": 4006 }, { "epoch": 0.08438320969975203, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5919, "step": 4007 }, { "epoch": 0.08440426864901576, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5849, "step": 4008 }, { "epoch": 0.08442532759827949, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6342, "step": 4009 }, { "epoch": 0.0844463865475432, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5912, "step": 4010 }, { "epoch": 0.08446744549680693, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.589, "step": 4011 }, { "epoch": 0.08448850444607066, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5954, "step": 4012 }, { "epoch": 0.08450956339533439, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.5923, "step": 4013 }, { "epoch": 0.08453062234459811, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6099, "step": 4014 }, { "epoch": 0.08455168129386184, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.6005, "step": 4015 }, { "epoch": 0.08457274024312557, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5652, "step": 4016 }, { "epoch": 0.0845937991923893, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6411, "step": 4017 }, { "epoch": 0.08461485814165302, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5379, "step": 4018 }, { "epoch": 0.08463591709091675, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5945, "step": 4019 }, { "epoch": 0.08465697604018048, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6066, "step": 4020 }, { "epoch": 0.0846780349894442, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5596, "step": 4021 }, { "epoch": 0.08469909393870793, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6042, "step": 4022 }, { "epoch": 0.08472015288797166, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6043, "step": 4023 }, { "epoch": 0.08474121183723538, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5601, "step": 4024 }, { "epoch": 0.0847622707864991, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6272, "step": 4025 }, { "epoch": 0.08478332973576283, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.582, "step": 4026 }, { "epoch": 0.08480438868502656, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5457, "step": 4027 }, { "epoch": 0.08482544763429029, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5613, "step": 4028 }, { "epoch": 0.08484650658355401, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6101, "step": 4029 }, { "epoch": 0.08486756553281774, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5534, "step": 4030 }, { "epoch": 0.08488862448208147, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5849, "step": 4031 }, { "epoch": 0.0849096834313452, "grad_norm": 0.1982421875, "learning_rate": 0.0002, "loss": 1.5762, "step": 4032 }, { "epoch": 0.08493074238060892, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5405, "step": 4033 }, { "epoch": 0.08495180132987265, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.588, "step": 4034 }, { "epoch": 0.08497286027913638, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6134, "step": 4035 }, { "epoch": 0.0849939192284001, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5778, "step": 4036 }, { "epoch": 0.08501497817766382, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5787, "step": 4037 }, { "epoch": 0.08503603712692755, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6093, "step": 4038 }, { "epoch": 0.08505709607619127, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6064, "step": 4039 }, { "epoch": 0.085078155025455, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.607, "step": 4040 }, { "epoch": 0.08509921397471873, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5795, "step": 4041 }, { "epoch": 0.08512027292398246, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6022, "step": 4042 }, { "epoch": 0.08514133187324618, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5921, "step": 4043 }, { "epoch": 0.08516239082250991, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6174, "step": 4044 }, { "epoch": 0.08518344977177364, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6026, "step": 4045 }, { "epoch": 0.08520450872103737, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5504, "step": 4046 }, { "epoch": 0.0852255676703011, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6043, "step": 4047 }, { "epoch": 0.08524662661956482, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5691, "step": 4048 }, { "epoch": 0.08526768556882855, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5868, "step": 4049 }, { "epoch": 0.08528874451809226, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.554, "step": 4050 }, { "epoch": 0.08530980346735599, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5782, "step": 4051 }, { "epoch": 0.08533086241661972, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6496, "step": 4052 }, { "epoch": 0.08535192136588345, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5904, "step": 4053 }, { "epoch": 0.08537298031514717, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5592, "step": 4054 }, { "epoch": 0.0853940392644109, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5766, "step": 4055 }, { "epoch": 0.08541509821367463, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5647, "step": 4056 }, { "epoch": 0.08543615716293836, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5597, "step": 4057 }, { "epoch": 0.08545721611220208, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5685, "step": 4058 }, { "epoch": 0.08547827506146581, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.6284, "step": 4059 }, { "epoch": 0.08549933401072954, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5666, "step": 4060 }, { "epoch": 0.08552039295999327, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5728, "step": 4061 }, { "epoch": 0.085541451909257, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5866, "step": 4062 }, { "epoch": 0.08556251085852071, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5571, "step": 4063 }, { "epoch": 0.08558356980778443, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5601, "step": 4064 }, { "epoch": 0.08560462875704816, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6075, "step": 4065 }, { "epoch": 0.08562568770631189, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5675, "step": 4066 }, { "epoch": 0.08564674665557562, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5699, "step": 4067 }, { "epoch": 0.08566780560483934, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.554, "step": 4068 }, { "epoch": 0.08568886455410307, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5507, "step": 4069 }, { "epoch": 0.0857099235033668, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5855, "step": 4070 }, { "epoch": 0.08573098245263053, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5741, "step": 4071 }, { "epoch": 0.08575204140189426, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5787, "step": 4072 }, { "epoch": 0.08577310035115798, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5488, "step": 4073 }, { "epoch": 0.08579415930042171, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5575, "step": 4074 }, { "epoch": 0.08581521824968544, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5774, "step": 4075 }, { "epoch": 0.08583627719894917, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5579, "step": 4076 }, { "epoch": 0.08585733614821288, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5807, "step": 4077 }, { "epoch": 0.0858783950974766, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5799, "step": 4078 }, { "epoch": 0.08589945404674033, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5634, "step": 4079 }, { "epoch": 0.08592051299600406, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6134, "step": 4080 }, { "epoch": 0.08594157194526779, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5569, "step": 4081 }, { "epoch": 0.08596263089453152, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5557, "step": 4082 }, { "epoch": 0.08598368984379524, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6015, "step": 4083 }, { "epoch": 0.08600474879305897, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5807, "step": 4084 }, { "epoch": 0.0860258077423227, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5863, "step": 4085 }, { "epoch": 0.08604686669158643, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5584, "step": 4086 }, { "epoch": 0.08606792564085015, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.638, "step": 4087 }, { "epoch": 0.08608898459011388, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5633, "step": 4088 }, { "epoch": 0.08611004353937761, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5691, "step": 4089 }, { "epoch": 0.08613110248864132, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.5981, "step": 4090 }, { "epoch": 0.08615216143790505, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5972, "step": 4091 }, { "epoch": 0.08617322038716878, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5719, "step": 4092 }, { "epoch": 0.0861942793364325, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5868, "step": 4093 }, { "epoch": 0.08621533828569623, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6196, "step": 4094 }, { "epoch": 0.08623639723495996, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5609, "step": 4095 }, { "epoch": 0.08625745618422369, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5594, "step": 4096 }, { "epoch": 0.08627851513348742, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5484, "step": 4097 }, { "epoch": 0.08629957408275114, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6014, "step": 4098 }, { "epoch": 0.08632063303201487, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5937, "step": 4099 }, { "epoch": 0.0863416919812786, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5857, "step": 4100 }, { "epoch": 0.08636275093054233, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5569, "step": 4101 }, { "epoch": 0.08638380987980605, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5532, "step": 4102 }, { "epoch": 0.08640486882906977, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5848, "step": 4103 }, { "epoch": 0.0864259277783335, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6018, "step": 4104 }, { "epoch": 0.08644698672759722, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5992, "step": 4105 }, { "epoch": 0.08646804567686095, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5558, "step": 4106 }, { "epoch": 0.08648910462612468, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6024, "step": 4107 }, { "epoch": 0.0865101635753884, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5963, "step": 4108 }, { "epoch": 0.08653122252465213, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6019, "step": 4109 }, { "epoch": 0.08655228147391586, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5948, "step": 4110 }, { "epoch": 0.08657334042317959, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5713, "step": 4111 }, { "epoch": 0.08659439937244331, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5913, "step": 4112 }, { "epoch": 0.08661545832170704, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5665, "step": 4113 }, { "epoch": 0.08663651727097077, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5586, "step": 4114 }, { "epoch": 0.0866575762202345, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5852, "step": 4115 }, { "epoch": 0.08667863516949821, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5899, "step": 4116 }, { "epoch": 0.08669969411876194, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6012, "step": 4117 }, { "epoch": 0.08672075306802567, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5713, "step": 4118 }, { "epoch": 0.0867418120172894, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5905, "step": 4119 }, { "epoch": 0.08676287096655312, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6097, "step": 4120 }, { "epoch": 0.08678392991581685, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6431, "step": 4121 }, { "epoch": 0.08680498886508058, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5615, "step": 4122 }, { "epoch": 0.0868260478143443, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6194, "step": 4123 }, { "epoch": 0.08684710676360803, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.587, "step": 4124 }, { "epoch": 0.08686816571287176, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5808, "step": 4125 }, { "epoch": 0.08688922466213549, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5967, "step": 4126 }, { "epoch": 0.08691028361139921, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6032, "step": 4127 }, { "epoch": 0.08693134256066294, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5887, "step": 4128 }, { "epoch": 0.08695240150992667, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6248, "step": 4129 }, { "epoch": 0.08697346045919038, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6105, "step": 4130 }, { "epoch": 0.08699451940845411, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6043, "step": 4131 }, { "epoch": 0.08701557835771784, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6094, "step": 4132 }, { "epoch": 0.08703663730698157, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6092, "step": 4133 }, { "epoch": 0.08705769625624529, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6055, "step": 4134 }, { "epoch": 0.08707875520550902, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5837, "step": 4135 }, { "epoch": 0.08709981415477275, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6092, "step": 4136 }, { "epoch": 0.08712087310403648, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5562, "step": 4137 }, { "epoch": 0.0871419320533002, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5651, "step": 4138 }, { "epoch": 0.08716299100256393, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5981, "step": 4139 }, { "epoch": 0.08718404995182766, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5675, "step": 4140 }, { "epoch": 0.08720510890109139, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5645, "step": 4141 }, { "epoch": 0.08722616785035511, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5881, "step": 4142 }, { "epoch": 0.08724722679961883, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5876, "step": 4143 }, { "epoch": 0.08726828574888255, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.591, "step": 4144 }, { "epoch": 0.08728934469814628, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.5777, "step": 4145 }, { "epoch": 0.08731040364741001, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5809, "step": 4146 }, { "epoch": 0.08733146259667374, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5889, "step": 4147 }, { "epoch": 0.08735252154593746, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5937, "step": 4148 }, { "epoch": 0.08737358049520119, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5889, "step": 4149 }, { "epoch": 0.08739463944446492, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5858, "step": 4150 }, { "epoch": 0.08741569839372865, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5533, "step": 4151 }, { "epoch": 0.08743675734299237, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5734, "step": 4152 }, { "epoch": 0.0874578162922561, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5629, "step": 4153 }, { "epoch": 0.08747887524151983, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5914, "step": 4154 }, { "epoch": 0.08749993419078356, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.594, "step": 4155 }, { "epoch": 0.08752099314004727, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5676, "step": 4156 }, { "epoch": 0.087542052089311, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5723, "step": 4157 }, { "epoch": 0.08756311103857473, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5533, "step": 4158 }, { "epoch": 0.08758416998783845, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5767, "step": 4159 }, { "epoch": 0.08760522893710218, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5894, "step": 4160 }, { "epoch": 0.08762628788636591, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.6121, "step": 4161 }, { "epoch": 0.08764734683562964, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5989, "step": 4162 }, { "epoch": 0.08766840578489336, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6408, "step": 4163 }, { "epoch": 0.08768946473415709, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6001, "step": 4164 }, { "epoch": 0.08771052368342082, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5846, "step": 4165 }, { "epoch": 0.08773158263268455, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5863, "step": 4166 }, { "epoch": 0.08775264158194827, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6249, "step": 4167 }, { "epoch": 0.087773700531212, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5793, "step": 4168 }, { "epoch": 0.08779475948047571, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.656, "step": 4169 }, { "epoch": 0.08781581842973944, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6073, "step": 4170 }, { "epoch": 0.08783687737900317, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5695, "step": 4171 }, { "epoch": 0.0878579363282669, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5621, "step": 4172 }, { "epoch": 0.08787899527753062, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5981, "step": 4173 }, { "epoch": 0.08790005422679435, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6283, "step": 4174 }, { "epoch": 0.08792111317605808, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.5937, "step": 4175 }, { "epoch": 0.08794217212532181, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5893, "step": 4176 }, { "epoch": 0.08796323107458554, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5719, "step": 4177 }, { "epoch": 0.08798429002384926, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5876, "step": 4178 }, { "epoch": 0.08800534897311299, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5818, "step": 4179 }, { "epoch": 0.08802640792237672, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6221, "step": 4180 }, { "epoch": 0.08804746687164045, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5782, "step": 4181 }, { "epoch": 0.08806852582090417, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5731, "step": 4182 }, { "epoch": 0.08808958477016789, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5683, "step": 4183 }, { "epoch": 0.08811064371943161, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5691, "step": 4184 }, { "epoch": 0.08813170266869534, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5968, "step": 4185 }, { "epoch": 0.08815276161795907, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5764, "step": 4186 }, { "epoch": 0.0881738205672228, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.6485, "step": 4187 }, { "epoch": 0.08819487951648652, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5936, "step": 4188 }, { "epoch": 0.08821593846575025, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5935, "step": 4189 }, { "epoch": 0.08823699741501398, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5761, "step": 4190 }, { "epoch": 0.0882580563642777, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5715, "step": 4191 }, { "epoch": 0.08827911531354143, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5753, "step": 4192 }, { "epoch": 0.08830017426280516, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5899, "step": 4193 }, { "epoch": 0.08832123321206889, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.592, "step": 4194 }, { "epoch": 0.08834229216133262, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5935, "step": 4195 }, { "epoch": 0.08836335111059633, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6238, "step": 4196 }, { "epoch": 0.08838441005986006, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5699, "step": 4197 }, { "epoch": 0.08840546900912379, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5642, "step": 4198 }, { "epoch": 0.08842652795838751, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5702, "step": 4199 }, { "epoch": 0.08844758690765124, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5811, "step": 4200 }, { "epoch": 0.08846864585691497, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5978, "step": 4201 }, { "epoch": 0.0884897048061787, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5812, "step": 4202 }, { "epoch": 0.08851076375544242, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.6292, "step": 4203 }, { "epoch": 0.08853182270470615, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5625, "step": 4204 }, { "epoch": 0.08855288165396988, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5882, "step": 4205 }, { "epoch": 0.0885739406032336, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5826, "step": 4206 }, { "epoch": 0.08859499955249733, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6048, "step": 4207 }, { "epoch": 0.08861605850176106, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6066, "step": 4208 }, { "epoch": 0.08863711745102477, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6181, "step": 4209 }, { "epoch": 0.0886581764002885, "grad_norm": 0.13671875, "learning_rate": 0.0002, "loss": 1.576, "step": 4210 }, { "epoch": 0.08867923534955223, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5765, "step": 4211 }, { "epoch": 0.08870029429881596, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5811, "step": 4212 }, { "epoch": 0.08872135324807968, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5938, "step": 4213 }, { "epoch": 0.08874241219734341, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6075, "step": 4214 }, { "epoch": 0.08876347114660714, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5544, "step": 4215 }, { "epoch": 0.08878453009587087, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6046, "step": 4216 }, { "epoch": 0.0888055890451346, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5895, "step": 4217 }, { "epoch": 0.08882664799439832, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5939, "step": 4218 }, { "epoch": 0.08884770694366205, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.6365, "step": 4219 }, { "epoch": 0.08886876589292578, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5596, "step": 4220 }, { "epoch": 0.0888898248421895, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5874, "step": 4221 }, { "epoch": 0.08891088379145322, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5398, "step": 4222 }, { "epoch": 0.08893194274071695, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5826, "step": 4223 }, { "epoch": 0.08895300168998067, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5677, "step": 4224 }, { "epoch": 0.0889740606392444, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5407, "step": 4225 }, { "epoch": 0.08899511958850813, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5561, "step": 4226 }, { "epoch": 0.08901617853777186, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5918, "step": 4227 }, { "epoch": 0.08903723748703558, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5475, "step": 4228 }, { "epoch": 0.08905829643629931, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5749, "step": 4229 }, { "epoch": 0.08907935538556304, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5699, "step": 4230 }, { "epoch": 0.08910041433482677, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5815, "step": 4231 }, { "epoch": 0.0891214732840905, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6148, "step": 4232 }, { "epoch": 0.08914253223335422, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6187, "step": 4233 }, { "epoch": 0.08916359118261795, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6143, "step": 4234 }, { "epoch": 0.08918465013188168, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5916, "step": 4235 }, { "epoch": 0.08920570908114539, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5795, "step": 4236 }, { "epoch": 0.08922676803040912, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6065, "step": 4237 }, { "epoch": 0.08924782697967285, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5985, "step": 4238 }, { "epoch": 0.08926888592893657, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5928, "step": 4239 }, { "epoch": 0.0892899448782003, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6073, "step": 4240 }, { "epoch": 0.08931100382746403, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.61, "step": 4241 }, { "epoch": 0.08933206277672776, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6209, "step": 4242 }, { "epoch": 0.08935312172599148, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5842, "step": 4243 }, { "epoch": 0.08937418067525521, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5733, "step": 4244 }, { "epoch": 0.08939523962451894, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5937, "step": 4245 }, { "epoch": 0.08941629857378267, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.578, "step": 4246 }, { "epoch": 0.08943735752304639, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5456, "step": 4247 }, { "epoch": 0.08945841647231012, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5927, "step": 4248 }, { "epoch": 0.08947947542157383, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6015, "step": 4249 }, { "epoch": 0.08950053437083756, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.6103, "step": 4250 }, { "epoch": 0.08952159332010129, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5965, "step": 4251 }, { "epoch": 0.08954265226936502, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5916, "step": 4252 }, { "epoch": 0.08956371121862874, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6145, "step": 4253 }, { "epoch": 0.08958477016789247, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5745, "step": 4254 }, { "epoch": 0.0896058291171562, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5524, "step": 4255 }, { "epoch": 0.08962688806641993, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5961, "step": 4256 }, { "epoch": 0.08964794701568365, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5805, "step": 4257 }, { "epoch": 0.08966900596494738, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.571, "step": 4258 }, { "epoch": 0.08969006491421111, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5863, "step": 4259 }, { "epoch": 0.08971112386347484, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6026, "step": 4260 }, { "epoch": 0.08973218281273856, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5858, "step": 4261 }, { "epoch": 0.08975324176200228, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5867, "step": 4262 }, { "epoch": 0.089774300711266, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5746, "step": 4263 }, { "epoch": 0.08979535966052973, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5785, "step": 4264 }, { "epoch": 0.08981641860979346, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5737, "step": 4265 }, { "epoch": 0.08983747755905719, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5686, "step": 4266 }, { "epoch": 0.08985853650832092, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5777, "step": 4267 }, { "epoch": 0.08987959545758464, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6159, "step": 4268 }, { "epoch": 0.08990065440684837, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6163, "step": 4269 }, { "epoch": 0.0899217133561121, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5991, "step": 4270 }, { "epoch": 0.08994277230537583, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5892, "step": 4271 }, { "epoch": 0.08996383125463955, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5886, "step": 4272 }, { "epoch": 0.08998489020390328, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5704, "step": 4273 }, { "epoch": 0.09000594915316701, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5332, "step": 4274 }, { "epoch": 0.09002700810243074, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.582, "step": 4275 }, { "epoch": 0.09004806705169445, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5712, "step": 4276 }, { "epoch": 0.09006912600095818, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5871, "step": 4277 }, { "epoch": 0.0900901849502219, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5751, "step": 4278 }, { "epoch": 0.09011124389948563, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5682, "step": 4279 }, { "epoch": 0.09013230284874936, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6082, "step": 4280 }, { "epoch": 0.09015336179801309, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5954, "step": 4281 }, { "epoch": 0.09017442074727682, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5668, "step": 4282 }, { "epoch": 0.09019547969654054, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5889, "step": 4283 }, { "epoch": 0.09021653864580427, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5681, "step": 4284 }, { "epoch": 0.090237597595068, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5682, "step": 4285 }, { "epoch": 0.09025865654433173, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5689, "step": 4286 }, { "epoch": 0.09027971549359545, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5682, "step": 4287 }, { "epoch": 0.09030077444285918, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5518, "step": 4288 }, { "epoch": 0.0903218333921229, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5745, "step": 4289 }, { "epoch": 0.09034289234138662, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5594, "step": 4290 }, { "epoch": 0.09036395129065035, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5884, "step": 4291 }, { "epoch": 0.09038501023991408, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6053, "step": 4292 }, { "epoch": 0.0904060691891778, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5704, "step": 4293 }, { "epoch": 0.09042712813844153, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5701, "step": 4294 }, { "epoch": 0.09044818708770526, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5498, "step": 4295 }, { "epoch": 0.09046924603696899, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5899, "step": 4296 }, { "epoch": 0.09049030498623271, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5929, "step": 4297 }, { "epoch": 0.09051136393549644, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5682, "step": 4298 }, { "epoch": 0.09053242288476017, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6108, "step": 4299 }, { "epoch": 0.0905534818340239, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5793, "step": 4300 }, { "epoch": 0.09057454078328762, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5885, "step": 4301 }, { "epoch": 0.09059559973255134, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5738, "step": 4302 }, { "epoch": 0.09061665868181507, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6013, "step": 4303 }, { "epoch": 0.09063771763107879, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.5859, "step": 4304 }, { "epoch": 0.09065877658034252, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5937, "step": 4305 }, { "epoch": 0.09067983552960625, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5848, "step": 4306 }, { "epoch": 0.09070089447886998, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5971, "step": 4307 }, { "epoch": 0.0907219534281337, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5663, "step": 4308 }, { "epoch": 0.09074301237739743, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6073, "step": 4309 }, { "epoch": 0.09076407132666116, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5471, "step": 4310 }, { "epoch": 0.09078513027592489, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5635, "step": 4311 }, { "epoch": 0.09080618922518861, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5767, "step": 4312 }, { "epoch": 0.09082724817445234, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5722, "step": 4313 }, { "epoch": 0.09084830712371607, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5631, "step": 4314 }, { "epoch": 0.09086936607297978, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.579, "step": 4315 }, { "epoch": 0.09089042502224351, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5923, "step": 4316 }, { "epoch": 0.09091148397150724, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5608, "step": 4317 }, { "epoch": 0.09093254292077096, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5991, "step": 4318 }, { "epoch": 0.09095360187003469, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5869, "step": 4319 }, { "epoch": 0.09097466081929842, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.579, "step": 4320 }, { "epoch": 0.09099571976856215, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5718, "step": 4321 }, { "epoch": 0.09101677871782587, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6072, "step": 4322 }, { "epoch": 0.0910378376670896, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5565, "step": 4323 }, { "epoch": 0.09105889661635333, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5873, "step": 4324 }, { "epoch": 0.09107995556561706, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.579, "step": 4325 }, { "epoch": 0.09110101451488078, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5642, "step": 4326 }, { "epoch": 0.09112207346414451, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5861, "step": 4327 }, { "epoch": 0.09114313241340824, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6046, "step": 4328 }, { "epoch": 0.09116419136267195, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5692, "step": 4329 }, { "epoch": 0.09118525031193568, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5746, "step": 4330 }, { "epoch": 0.09120630926119941, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6105, "step": 4331 }, { "epoch": 0.09122736821046314, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5596, "step": 4332 }, { "epoch": 0.09124842715972686, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5994, "step": 4333 }, { "epoch": 0.09126948610899059, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5531, "step": 4334 }, { "epoch": 0.09129054505825432, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5756, "step": 4335 }, { "epoch": 0.09131160400751805, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5302, "step": 4336 }, { "epoch": 0.09133266295678177, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5969, "step": 4337 }, { "epoch": 0.0913537219060455, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.561, "step": 4338 }, { "epoch": 0.09137478085530923, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6308, "step": 4339 }, { "epoch": 0.09139583980457296, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5895, "step": 4340 }, { "epoch": 0.09141689875383668, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5925, "step": 4341 }, { "epoch": 0.0914379577031004, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.604, "step": 4342 }, { "epoch": 0.09145901665236413, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5561, "step": 4343 }, { "epoch": 0.09148007560162785, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5743, "step": 4344 }, { "epoch": 0.09150113455089158, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5661, "step": 4345 }, { "epoch": 0.09152219350015531, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5908, "step": 4346 }, { "epoch": 0.09154325244941904, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6173, "step": 4347 }, { "epoch": 0.09156431139868276, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6097, "step": 4348 }, { "epoch": 0.09158537034794649, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5715, "step": 4349 }, { "epoch": 0.09160642929721022, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.607, "step": 4350 }, { "epoch": 0.09162748824647395, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5826, "step": 4351 }, { "epoch": 0.09164854719573767, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5455, "step": 4352 }, { "epoch": 0.0916696061450014, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6097, "step": 4353 }, { "epoch": 0.09169066509426513, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5569, "step": 4354 }, { "epoch": 0.09171172404352884, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.552, "step": 4355 }, { "epoch": 0.09173278299279257, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.621, "step": 4356 }, { "epoch": 0.0917538419420563, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5796, "step": 4357 }, { "epoch": 0.09177490089132002, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.576, "step": 4358 }, { "epoch": 0.09179595984058375, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5621, "step": 4359 }, { "epoch": 0.09181701878984748, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5843, "step": 4360 }, { "epoch": 0.0918380777391112, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5881, "step": 4361 }, { "epoch": 0.09185913668837493, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5735, "step": 4362 }, { "epoch": 0.09188019563763866, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6037, "step": 4363 }, { "epoch": 0.09190125458690239, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5835, "step": 4364 }, { "epoch": 0.09192231353616612, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.644, "step": 4365 }, { "epoch": 0.09194337248542984, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5896, "step": 4366 }, { "epoch": 0.09196443143469357, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6055, "step": 4367 }, { "epoch": 0.09198549038395729, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5924, "step": 4368 }, { "epoch": 0.09200654933322101, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5692, "step": 4369 }, { "epoch": 0.09202760828248474, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5646, "step": 4370 }, { "epoch": 0.09204866723174847, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5821, "step": 4371 }, { "epoch": 0.0920697261810122, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5779, "step": 4372 }, { "epoch": 0.09209078513027592, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5941, "step": 4373 }, { "epoch": 0.09211184407953965, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5565, "step": 4374 }, { "epoch": 0.09213290302880338, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6021, "step": 4375 }, { "epoch": 0.0921539619780671, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5647, "step": 4376 }, { "epoch": 0.09217502092733083, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5942, "step": 4377 }, { "epoch": 0.09219607987659456, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.572, "step": 4378 }, { "epoch": 0.09221713882585829, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5381, "step": 4379 }, { "epoch": 0.09223819777512202, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5875, "step": 4380 }, { "epoch": 0.09225925672438574, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6024, "step": 4381 }, { "epoch": 0.09228031567364946, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.5973, "step": 4382 }, { "epoch": 0.09230137462291318, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5902, "step": 4383 }, { "epoch": 0.09232243357217691, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6084, "step": 4384 }, { "epoch": 0.09234349252144064, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5993, "step": 4385 }, { "epoch": 0.09236455147070437, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.624, "step": 4386 }, { "epoch": 0.0923856104199681, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5944, "step": 4387 }, { "epoch": 0.09240666936923182, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5446, "step": 4388 }, { "epoch": 0.09242772831849555, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5563, "step": 4389 }, { "epoch": 0.09244878726775928, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5849, "step": 4390 }, { "epoch": 0.092469846217023, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5658, "step": 4391 }, { "epoch": 0.09249090516628673, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.592, "step": 4392 }, { "epoch": 0.09251196411555046, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5988, "step": 4393 }, { "epoch": 0.09253302306481419, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5676, "step": 4394 }, { "epoch": 0.0925540820140779, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6044, "step": 4395 }, { "epoch": 0.09257514096334163, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6029, "step": 4396 }, { "epoch": 0.09259619991260536, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5774, "step": 4397 }, { "epoch": 0.09261725886186908, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5541, "step": 4398 }, { "epoch": 0.09263831781113281, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.5892, "step": 4399 }, { "epoch": 0.09265937676039654, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5816, "step": 4400 }, { "epoch": 0.09268043570966027, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.571, "step": 4401 }, { "epoch": 0.092701494658924, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6011, "step": 4402 }, { "epoch": 0.09272255360818772, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.607, "step": 4403 }, { "epoch": 0.09274361255745145, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5817, "step": 4404 }, { "epoch": 0.09276467150671518, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6085, "step": 4405 }, { "epoch": 0.0927857304559789, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6069, "step": 4406 }, { "epoch": 0.09280678940524263, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5601, "step": 4407 }, { "epoch": 0.09282784835450635, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6021, "step": 4408 }, { "epoch": 0.09284890730377007, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.557, "step": 4409 }, { "epoch": 0.0928699662530338, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.595, "step": 4410 }, { "epoch": 0.09289102520229753, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5878, "step": 4411 }, { "epoch": 0.09291208415156126, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.577, "step": 4412 }, { "epoch": 0.09293314310082498, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6027, "step": 4413 }, { "epoch": 0.09295420205008871, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5565, "step": 4414 }, { "epoch": 0.09297526099935244, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5787, "step": 4415 }, { "epoch": 0.09299631994861617, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5773, "step": 4416 }, { "epoch": 0.0930173788978799, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6056, "step": 4417 }, { "epoch": 0.09303843784714362, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5709, "step": 4418 }, { "epoch": 0.09305949679640735, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5695, "step": 4419 }, { "epoch": 0.09308055574567108, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5839, "step": 4420 }, { "epoch": 0.09310161469493479, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5561, "step": 4421 }, { "epoch": 0.09312267364419852, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5706, "step": 4422 }, { "epoch": 0.09314373259346224, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5913, "step": 4423 }, { "epoch": 0.09316479154272597, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5888, "step": 4424 }, { "epoch": 0.0931858504919897, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5916, "step": 4425 }, { "epoch": 0.09320690944125343, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5884, "step": 4426 }, { "epoch": 0.09322796839051715, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6063, "step": 4427 }, { "epoch": 0.09324902733978088, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5809, "step": 4428 }, { "epoch": 0.09327008628904461, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.5798, "step": 4429 }, { "epoch": 0.09329114523830834, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6048, "step": 4430 }, { "epoch": 0.09331220418757206, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6299, "step": 4431 }, { "epoch": 0.09333326313683579, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5517, "step": 4432 }, { "epoch": 0.09335432208609952, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6036, "step": 4433 }, { "epoch": 0.09337538103536325, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5896, "step": 4434 }, { "epoch": 0.09339643998462696, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5919, "step": 4435 }, { "epoch": 0.09341749893389069, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6042, "step": 4436 }, { "epoch": 0.09343855788315442, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5872, "step": 4437 }, { "epoch": 0.09345961683241814, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6116, "step": 4438 }, { "epoch": 0.09348067578168187, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5594, "step": 4439 }, { "epoch": 0.0935017347309456, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5887, "step": 4440 }, { "epoch": 0.09352279368020933, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5727, "step": 4441 }, { "epoch": 0.09354385262947305, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5568, "step": 4442 }, { "epoch": 0.09356491157873678, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6175, "step": 4443 }, { "epoch": 0.09358597052800051, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.5872, "step": 4444 }, { "epoch": 0.09360702947726424, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5557, "step": 4445 }, { "epoch": 0.09362808842652796, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.596, "step": 4446 }, { "epoch": 0.09364914737579169, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5563, "step": 4447 }, { "epoch": 0.0936702063250554, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5726, "step": 4448 }, { "epoch": 0.09369126527431913, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6028, "step": 4449 }, { "epoch": 0.09371232422358286, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6025, "step": 4450 }, { "epoch": 0.09373338317284659, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6078, "step": 4451 }, { "epoch": 0.09375444212211032, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5658, "step": 4452 }, { "epoch": 0.09377550107137404, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5285, "step": 4453 }, { "epoch": 0.09379656002063777, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6155, "step": 4454 }, { "epoch": 0.0938176189699015, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5906, "step": 4455 }, { "epoch": 0.09383867791916523, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.598, "step": 4456 }, { "epoch": 0.09385973686842895, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5892, "step": 4457 }, { "epoch": 0.09388079581769268, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5531, "step": 4458 }, { "epoch": 0.09390185476695641, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5717, "step": 4459 }, { "epoch": 0.09392291371622014, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5693, "step": 4460 }, { "epoch": 0.09394397266548385, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5939, "step": 4461 }, { "epoch": 0.09396503161474758, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6006, "step": 4462 }, { "epoch": 0.0939860905640113, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5753, "step": 4463 }, { "epoch": 0.09400714951327503, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6176, "step": 4464 }, { "epoch": 0.09402820846253876, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5771, "step": 4465 }, { "epoch": 0.09404926741180249, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5715, "step": 4466 }, { "epoch": 0.09407032636106621, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5499, "step": 4467 }, { "epoch": 0.09409138531032994, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6075, "step": 4468 }, { "epoch": 0.09411244425959367, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5719, "step": 4469 }, { "epoch": 0.0941335032088574, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5692, "step": 4470 }, { "epoch": 0.09415456215812112, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5776, "step": 4471 }, { "epoch": 0.09417562110738485, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5693, "step": 4472 }, { "epoch": 0.09419668005664858, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5748, "step": 4473 }, { "epoch": 0.0942177390059123, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5788, "step": 4474 }, { "epoch": 0.09423879795517602, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6038, "step": 4475 }, { "epoch": 0.09425985690443975, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5678, "step": 4476 }, { "epoch": 0.09428091585370348, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5706, "step": 4477 }, { "epoch": 0.0943019748029672, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.593, "step": 4478 }, { "epoch": 0.09432303375223093, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6094, "step": 4479 }, { "epoch": 0.09434409270149466, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5652, "step": 4480 }, { "epoch": 0.09436515165075839, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5767, "step": 4481 }, { "epoch": 0.09438621060002211, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.6065, "step": 4482 }, { "epoch": 0.09440726954928584, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6009, "step": 4483 }, { "epoch": 0.09442832849854957, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5912, "step": 4484 }, { "epoch": 0.0944493874478133, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5867, "step": 4485 }, { "epoch": 0.09447044639707702, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6074, "step": 4486 }, { "epoch": 0.09449150534634075, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5674, "step": 4487 }, { "epoch": 0.09451256429560446, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5793, "step": 4488 }, { "epoch": 0.09453362324486819, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5848, "step": 4489 }, { "epoch": 0.09455468219413192, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6006, "step": 4490 }, { "epoch": 0.09457574114339565, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.597, "step": 4491 }, { "epoch": 0.09459680009265937, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5679, "step": 4492 }, { "epoch": 0.0946178590419231, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5814, "step": 4493 }, { "epoch": 0.09463891799118683, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5845, "step": 4494 }, { "epoch": 0.09465997694045056, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5778, "step": 4495 }, { "epoch": 0.09468103588971429, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5665, "step": 4496 }, { "epoch": 0.09470209483897801, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.5804, "step": 4497 }, { "epoch": 0.09472315378824174, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.596, "step": 4498 }, { "epoch": 0.09474421273750547, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6045, "step": 4499 }, { "epoch": 0.0947652716867692, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6209, "step": 4500 }, { "epoch": 0.0947652716867692, "eval_loss": 2.20310378074646, "eval_runtime": 897.7013, "eval_samples_per_second": 68.842, "eval_steps_per_second": 2.152, "step": 4500 }, { "epoch": 0.09478633063603291, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5777, "step": 4501 }, { "epoch": 0.09480738958529664, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5885, "step": 4502 }, { "epoch": 0.09482844853456036, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5977, "step": 4503 }, { "epoch": 0.09484950748382409, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6173, "step": 4504 }, { "epoch": 0.09487056643308782, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.568, "step": 4505 }, { "epoch": 0.09489162538235155, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5889, "step": 4506 }, { "epoch": 0.09491268433161527, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5593, "step": 4507 }, { "epoch": 0.094933743280879, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6217, "step": 4508 }, { "epoch": 0.09495480223014273, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5943, "step": 4509 }, { "epoch": 0.09497586117940646, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5811, "step": 4510 }, { "epoch": 0.09499692012867018, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5914, "step": 4511 }, { "epoch": 0.09501797907793391, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5892, "step": 4512 }, { "epoch": 0.09503903802719764, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5772, "step": 4513 }, { "epoch": 0.09506009697646135, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5777, "step": 4514 }, { "epoch": 0.09508115592572508, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5935, "step": 4515 }, { "epoch": 0.09510221487498881, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6134, "step": 4516 }, { "epoch": 0.09512327382425254, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.6021, "step": 4517 }, { "epoch": 0.09514433277351626, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5804, "step": 4518 }, { "epoch": 0.09516539172277999, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5752, "step": 4519 }, { "epoch": 0.09518645067204372, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.5396, "step": 4520 }, { "epoch": 0.09520750962130745, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5962, "step": 4521 }, { "epoch": 0.09522856857057117, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5922, "step": 4522 }, { "epoch": 0.0952496275198349, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5322, "step": 4523 }, { "epoch": 0.09527068646909863, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5903, "step": 4524 }, { "epoch": 0.09529174541836236, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6082, "step": 4525 }, { "epoch": 0.09531280436762608, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5842, "step": 4526 }, { "epoch": 0.0953338633168898, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5684, "step": 4527 }, { "epoch": 0.09535492226615352, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6035, "step": 4528 }, { "epoch": 0.09537598121541725, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5392, "step": 4529 }, { "epoch": 0.09539704016468098, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6136, "step": 4530 }, { "epoch": 0.09541809911394471, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.548, "step": 4531 }, { "epoch": 0.09543915806320843, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5683, "step": 4532 }, { "epoch": 0.09546021701247216, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5754, "step": 4533 }, { "epoch": 0.09548127596173589, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6265, "step": 4534 }, { "epoch": 0.09550233491099962, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6023, "step": 4535 }, { "epoch": 0.09552339386026334, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5859, "step": 4536 }, { "epoch": 0.09554445280952707, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.564, "step": 4537 }, { "epoch": 0.0955655117587908, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5721, "step": 4538 }, { "epoch": 0.09558657070805453, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5735, "step": 4539 }, { "epoch": 0.09560762965731825, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5704, "step": 4540 }, { "epoch": 0.09562868860658197, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5604, "step": 4541 }, { "epoch": 0.0956497475558457, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.6378, "step": 4542 }, { "epoch": 0.09567080650510942, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5357, "step": 4543 }, { "epoch": 0.09569186545437315, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5763, "step": 4544 }, { "epoch": 0.09571292440363688, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5945, "step": 4545 }, { "epoch": 0.0957339833529006, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5659, "step": 4546 }, { "epoch": 0.09575504230216433, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6143, "step": 4547 }, { "epoch": 0.09577610125142806, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5555, "step": 4548 }, { "epoch": 0.09579716020069179, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5852, "step": 4549 }, { "epoch": 0.09581821914995552, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6237, "step": 4550 }, { "epoch": 0.09583927809921924, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.58, "step": 4551 }, { "epoch": 0.09586033704848297, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5872, "step": 4552 }, { "epoch": 0.0958813959977467, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5679, "step": 4553 }, { "epoch": 0.09590245494701041, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5715, "step": 4554 }, { "epoch": 0.09592351389627414, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5677, "step": 4555 }, { "epoch": 0.09594457284553787, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5921, "step": 4556 }, { "epoch": 0.0959656317948016, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5783, "step": 4557 }, { "epoch": 0.09598669074406532, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5865, "step": 4558 }, { "epoch": 0.09600774969332905, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5759, "step": 4559 }, { "epoch": 0.09602880864259278, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5873, "step": 4560 }, { "epoch": 0.0960498675918565, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.576, "step": 4561 }, { "epoch": 0.09607092654112023, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.582, "step": 4562 }, { "epoch": 0.09609198549038396, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5849, "step": 4563 }, { "epoch": 0.09611304443964769, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.5598, "step": 4564 }, { "epoch": 0.09613410338891142, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5787, "step": 4565 }, { "epoch": 0.09615516233817514, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.577, "step": 4566 }, { "epoch": 0.09617622128743886, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5941, "step": 4567 }, { "epoch": 0.09619728023670258, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5258, "step": 4568 }, { "epoch": 0.09621833918596631, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5818, "step": 4569 }, { "epoch": 0.09623939813523004, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5556, "step": 4570 }, { "epoch": 0.09626045708449377, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5896, "step": 4571 }, { "epoch": 0.0962815160337575, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5773, "step": 4572 }, { "epoch": 0.09630257498302122, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5954, "step": 4573 }, { "epoch": 0.09632363393228495, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5748, "step": 4574 }, { "epoch": 0.09634469288154868, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6214, "step": 4575 }, { "epoch": 0.0963657518308124, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5914, "step": 4576 }, { "epoch": 0.09638681078007613, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5951, "step": 4577 }, { "epoch": 0.09640786972933986, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5502, "step": 4578 }, { "epoch": 0.09642892867860359, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5459, "step": 4579 }, { "epoch": 0.0964499876278673, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5479, "step": 4580 }, { "epoch": 0.09647104657713103, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5754, "step": 4581 }, { "epoch": 0.09649210552639476, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6126, "step": 4582 }, { "epoch": 0.09651316447565848, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5932, "step": 4583 }, { "epoch": 0.09653422342492221, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5848, "step": 4584 }, { "epoch": 0.09655528237418594, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6002, "step": 4585 }, { "epoch": 0.09657634132344967, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5953, "step": 4586 }, { "epoch": 0.0965974002727134, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6005, "step": 4587 }, { "epoch": 0.09661845922197712, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5955, "step": 4588 }, { "epoch": 0.09663951817124085, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6052, "step": 4589 }, { "epoch": 0.09666057712050458, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6168, "step": 4590 }, { "epoch": 0.0966816360697683, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5765, "step": 4591 }, { "epoch": 0.09670269501903203, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5675, "step": 4592 }, { "epoch": 0.09672375396829576, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5601, "step": 4593 }, { "epoch": 0.09674481291755947, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6295, "step": 4594 }, { "epoch": 0.0967658718668232, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5569, "step": 4595 }, { "epoch": 0.09678693081608693, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5529, "step": 4596 }, { "epoch": 0.09680798976535065, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5818, "step": 4597 }, { "epoch": 0.09682904871461438, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.594, "step": 4598 }, { "epoch": 0.09685010766387811, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6204, "step": 4599 }, { "epoch": 0.09687116661314184, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5707, "step": 4600 }, { "epoch": 0.09689222556240557, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5689, "step": 4601 }, { "epoch": 0.09691328451166929, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6196, "step": 4602 }, { "epoch": 0.09693434346093302, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5776, "step": 4603 }, { "epoch": 0.09695540241019675, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5882, "step": 4604 }, { "epoch": 0.09697646135946048, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.5801, "step": 4605 }, { "epoch": 0.0969975203087242, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6349, "step": 4606 }, { "epoch": 0.09701857925798792, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.616, "step": 4607 }, { "epoch": 0.09703963820725164, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5849, "step": 4608 }, { "epoch": 0.09706069715651537, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6066, "step": 4609 }, { "epoch": 0.0970817561057791, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5787, "step": 4610 }, { "epoch": 0.09710281505504283, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5864, "step": 4611 }, { "epoch": 0.09712387400430655, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5608, "step": 4612 }, { "epoch": 0.09714493295357028, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5856, "step": 4613 }, { "epoch": 0.09716599190283401, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5886, "step": 4614 }, { "epoch": 0.09718705085209774, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6034, "step": 4615 }, { "epoch": 0.09720810980136146, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5875, "step": 4616 }, { "epoch": 0.09722916875062519, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.582, "step": 4617 }, { "epoch": 0.09725022769988892, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5759, "step": 4618 }, { "epoch": 0.09727128664915265, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5954, "step": 4619 }, { "epoch": 0.09729234559841636, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6123, "step": 4620 }, { "epoch": 0.09731340454768009, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5896, "step": 4621 }, { "epoch": 0.09733446349694382, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5558, "step": 4622 }, { "epoch": 0.09735552244620754, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5939, "step": 4623 }, { "epoch": 0.09737658139547127, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5532, "step": 4624 }, { "epoch": 0.097397640344735, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.573, "step": 4625 }, { "epoch": 0.09741869929399873, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.6013, "step": 4626 }, { "epoch": 0.09743975824326245, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5944, "step": 4627 }, { "epoch": 0.09746081719252618, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.6027, "step": 4628 }, { "epoch": 0.09748187614178991, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5818, "step": 4629 }, { "epoch": 0.09750293509105364, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5649, "step": 4630 }, { "epoch": 0.09752399404031736, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5585, "step": 4631 }, { "epoch": 0.09754505298958109, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6072, "step": 4632 }, { "epoch": 0.0975661119388448, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5901, "step": 4633 }, { "epoch": 0.09758717088810853, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6071, "step": 4634 }, { "epoch": 0.09760822983737226, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5717, "step": 4635 }, { "epoch": 0.09762928878663599, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5918, "step": 4636 }, { "epoch": 0.09765034773589971, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.609, "step": 4637 }, { "epoch": 0.09767140668516344, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5811, "step": 4638 }, { "epoch": 0.09769246563442717, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5918, "step": 4639 }, { "epoch": 0.0977135245836909, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.5991, "step": 4640 }, { "epoch": 0.09773458353295462, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5984, "step": 4641 }, { "epoch": 0.09775564248221835, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6066, "step": 4642 }, { "epoch": 0.09777670143148208, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5762, "step": 4643 }, { "epoch": 0.09779776038074581, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5615, "step": 4644 }, { "epoch": 0.09781881933000953, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6052, "step": 4645 }, { "epoch": 0.09783987827927326, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5764, "step": 4646 }, { "epoch": 0.09786093722853698, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5461, "step": 4647 }, { "epoch": 0.0978819961778007, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5702, "step": 4648 }, { "epoch": 0.09790305512706443, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5613, "step": 4649 }, { "epoch": 0.09792411407632816, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5715, "step": 4650 }, { "epoch": 0.09794517302559189, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5733, "step": 4651 }, { "epoch": 0.09796623197485561, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5928, "step": 4652 }, { "epoch": 0.09798729092411934, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5814, "step": 4653 }, { "epoch": 0.09800834987338307, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6052, "step": 4654 }, { "epoch": 0.0980294088226468, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5894, "step": 4655 }, { "epoch": 0.09805046777191052, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5726, "step": 4656 }, { "epoch": 0.09807152672117425, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5926, "step": 4657 }, { "epoch": 0.09809258567043798, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5716, "step": 4658 }, { "epoch": 0.0981136446197017, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5963, "step": 4659 }, { "epoch": 0.09813470356896542, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.585, "step": 4660 }, { "epoch": 0.09815576251822915, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5869, "step": 4661 }, { "epoch": 0.09817682146749288, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5761, "step": 4662 }, { "epoch": 0.0981978804167566, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5731, "step": 4663 }, { "epoch": 0.09821893936602033, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.57, "step": 4664 }, { "epoch": 0.09823999831528406, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6285, "step": 4665 }, { "epoch": 0.09826105726454779, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5885, "step": 4666 }, { "epoch": 0.09828211621381151, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.571, "step": 4667 }, { "epoch": 0.09830317516307524, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6143, "step": 4668 }, { "epoch": 0.09832423411233897, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5926, "step": 4669 }, { "epoch": 0.0983452930616027, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5924, "step": 4670 }, { "epoch": 0.09836635201086642, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5833, "step": 4671 }, { "epoch": 0.09838741096013015, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5759, "step": 4672 }, { "epoch": 0.09840846990939386, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6126, "step": 4673 }, { "epoch": 0.09842952885865759, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5952, "step": 4674 }, { "epoch": 0.09845058780792132, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6463, "step": 4675 }, { "epoch": 0.09847164675718505, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5535, "step": 4676 }, { "epoch": 0.09849270570644877, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5926, "step": 4677 }, { "epoch": 0.0985137646557125, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6102, "step": 4678 }, { "epoch": 0.09853482360497623, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5682, "step": 4679 }, { "epoch": 0.09855588255423996, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5942, "step": 4680 }, { "epoch": 0.09857694150350368, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5953, "step": 4681 }, { "epoch": 0.09859800045276741, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5776, "step": 4682 }, { "epoch": 0.09861905940203114, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5425, "step": 4683 }, { "epoch": 0.09864011835129487, "grad_norm": 0.2265625, "learning_rate": 0.0002, "loss": 1.57, "step": 4684 }, { "epoch": 0.0986611773005586, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5671, "step": 4685 }, { "epoch": 0.09868223624982231, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5968, "step": 4686 }, { "epoch": 0.09870329519908604, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.5621, "step": 4687 }, { "epoch": 0.09872435414834976, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5489, "step": 4688 }, { "epoch": 0.09874541309761349, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6083, "step": 4689 }, { "epoch": 0.09876647204687722, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6168, "step": 4690 }, { "epoch": 0.09878753099614095, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5597, "step": 4691 }, { "epoch": 0.09880858994540467, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5965, "step": 4692 }, { "epoch": 0.0988296488946684, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5732, "step": 4693 }, { "epoch": 0.09885070784393213, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6013, "step": 4694 }, { "epoch": 0.09887176679319586, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5932, "step": 4695 }, { "epoch": 0.09889282574245958, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5682, "step": 4696 }, { "epoch": 0.09891388469172331, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5736, "step": 4697 }, { "epoch": 0.09893494364098704, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5798, "step": 4698 }, { "epoch": 0.09895600259025077, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5542, "step": 4699 }, { "epoch": 0.09897706153951448, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5595, "step": 4700 }, { "epoch": 0.09899812048877821, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5806, "step": 4701 }, { "epoch": 0.09901917943804193, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5723, "step": 4702 }, { "epoch": 0.09904023838730566, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5936, "step": 4703 }, { "epoch": 0.09906129733656939, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5798, "step": 4704 }, { "epoch": 0.09908235628583312, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5809, "step": 4705 }, { "epoch": 0.09910341523509685, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6059, "step": 4706 }, { "epoch": 0.09912447418436057, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5966, "step": 4707 }, { "epoch": 0.0991455331336243, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6326, "step": 4708 }, { "epoch": 0.09916659208288803, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6041, "step": 4709 }, { "epoch": 0.09918765103215176, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6137, "step": 4710 }, { "epoch": 0.09920870998141548, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6007, "step": 4711 }, { "epoch": 0.09922976893067921, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5889, "step": 4712 }, { "epoch": 0.09925082787994292, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5812, "step": 4713 }, { "epoch": 0.09927188682920665, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5933, "step": 4714 }, { "epoch": 0.09929294577847038, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.571, "step": 4715 }, { "epoch": 0.0993140047277341, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6076, "step": 4716 }, { "epoch": 0.09933506367699783, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6282, "step": 4717 }, { "epoch": 0.09935612262626156, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5573, "step": 4718 }, { "epoch": 0.09937718157552529, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.543, "step": 4719 }, { "epoch": 0.09939824052478902, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6088, "step": 4720 }, { "epoch": 0.09941929947405274, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5742, "step": 4721 }, { "epoch": 0.09944035842331647, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5938, "step": 4722 }, { "epoch": 0.0994614173725802, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5726, "step": 4723 }, { "epoch": 0.09948247632184393, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6096, "step": 4724 }, { "epoch": 0.09950353527110765, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5797, "step": 4725 }, { "epoch": 0.09952459422037137, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5553, "step": 4726 }, { "epoch": 0.0995456531696351, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.575, "step": 4727 }, { "epoch": 0.09956671211889882, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5962, "step": 4728 }, { "epoch": 0.09958777106816255, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5888, "step": 4729 }, { "epoch": 0.09960883001742628, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5727, "step": 4730 }, { "epoch": 0.09962988896669, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5721, "step": 4731 }, { "epoch": 0.09965094791595373, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5733, "step": 4732 }, { "epoch": 0.09967200686521746, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5696, "step": 4733 }, { "epoch": 0.09969306581448119, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5922, "step": 4734 }, { "epoch": 0.09971412476374492, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5753, "step": 4735 }, { "epoch": 0.09973518371300864, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5872, "step": 4736 }, { "epoch": 0.09975624266227237, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6019, "step": 4737 }, { "epoch": 0.0997773016115361, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.582, "step": 4738 }, { "epoch": 0.09979836056079981, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5436, "step": 4739 }, { "epoch": 0.09981941951006354, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6001, "step": 4740 }, { "epoch": 0.09984047845932727, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5918, "step": 4741 }, { "epoch": 0.099861537408591, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6088, "step": 4742 }, { "epoch": 0.09988259635785472, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5957, "step": 4743 }, { "epoch": 0.09990365530711845, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5778, "step": 4744 }, { "epoch": 0.09992471425638218, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5532, "step": 4745 }, { "epoch": 0.0999457732056459, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5815, "step": 4746 }, { "epoch": 0.09996683215490963, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5765, "step": 4747 }, { "epoch": 0.09998789110417336, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5895, "step": 4748 }, { "epoch": 0.10000895005343709, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6022, "step": 4749 }, { "epoch": 0.10003000900270081, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5659, "step": 4750 }, { "epoch": 0.10005106795196454, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5666, "step": 4751 }, { "epoch": 0.10007212690122827, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5604, "step": 4752 }, { "epoch": 0.10009318585049198, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5659, "step": 4753 }, { "epoch": 0.10011424479975571, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.564, "step": 4754 }, { "epoch": 0.10013530374901944, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6102, "step": 4755 }, { "epoch": 0.10015636269828317, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.591, "step": 4756 }, { "epoch": 0.1001774216475469, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5757, "step": 4757 }, { "epoch": 0.10019848059681062, "grad_norm": 0.1865234375, "learning_rate": 0.0002, "loss": 1.5806, "step": 4758 }, { "epoch": 0.10021953954607435, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5774, "step": 4759 }, { "epoch": 0.10024059849533808, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5845, "step": 4760 }, { "epoch": 0.1002616574446018, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5893, "step": 4761 }, { "epoch": 0.10028271639386553, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5514, "step": 4762 }, { "epoch": 0.10030377534312926, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5684, "step": 4763 }, { "epoch": 0.10032483429239299, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5916, "step": 4764 }, { "epoch": 0.10034589324165671, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5913, "step": 4765 }, { "epoch": 0.10036695219092043, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5664, "step": 4766 }, { "epoch": 0.10038801114018416, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5855, "step": 4767 }, { "epoch": 0.10040907008944788, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6303, "step": 4768 }, { "epoch": 0.10043012903871161, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5444, "step": 4769 }, { "epoch": 0.10045118798797534, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5704, "step": 4770 }, { "epoch": 0.10047224693723907, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5639, "step": 4771 }, { "epoch": 0.10049330588650279, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5744, "step": 4772 }, { "epoch": 0.10051436483576652, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.5783, "step": 4773 }, { "epoch": 0.10053542378503025, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5711, "step": 4774 }, { "epoch": 0.10055648273429398, "grad_norm": 0.1767578125, "learning_rate": 0.0002, "loss": 1.6102, "step": 4775 }, { "epoch": 0.1005775416835577, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5853, "step": 4776 }, { "epoch": 0.10059860063282143, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5531, "step": 4777 }, { "epoch": 0.10061965958208516, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5953, "step": 4778 }, { "epoch": 0.10064071853134887, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.6002, "step": 4779 }, { "epoch": 0.1006617774806126, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5454, "step": 4780 }, { "epoch": 0.10068283642987633, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5686, "step": 4781 }, { "epoch": 0.10070389537914005, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5857, "step": 4782 }, { "epoch": 0.10072495432840378, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5818, "step": 4783 }, { "epoch": 0.10074601327766751, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6072, "step": 4784 }, { "epoch": 0.10076707222693124, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5546, "step": 4785 }, { "epoch": 0.10078813117619496, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5741, "step": 4786 }, { "epoch": 0.10080919012545869, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6379, "step": 4787 }, { "epoch": 0.10083024907472242, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6138, "step": 4788 }, { "epoch": 0.10085130802398615, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5782, "step": 4789 }, { "epoch": 0.10087236697324987, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5807, "step": 4790 }, { "epoch": 0.1008934259225136, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5546, "step": 4791 }, { "epoch": 0.10091448487177732, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.5877, "step": 4792 }, { "epoch": 0.10093554382104104, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5676, "step": 4793 }, { "epoch": 0.10095660277030477, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5973, "step": 4794 }, { "epoch": 0.1009776617195685, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5917, "step": 4795 }, { "epoch": 0.10099872066883223, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5671, "step": 4796 }, { "epoch": 0.10101977961809595, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.6046, "step": 4797 }, { "epoch": 0.10104083856735968, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5591, "step": 4798 }, { "epoch": 0.10106189751662341, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5986, "step": 4799 }, { "epoch": 0.10108295646588714, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6103, "step": 4800 }, { "epoch": 0.10110401541515086, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5506, "step": 4801 }, { "epoch": 0.10112507436441459, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5817, "step": 4802 }, { "epoch": 0.10114613331367832, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5941, "step": 4803 }, { "epoch": 0.10116719226294205, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5832, "step": 4804 }, { "epoch": 0.10118825121220577, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.58, "step": 4805 }, { "epoch": 0.10120931016146949, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5767, "step": 4806 }, { "epoch": 0.10123036911073321, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5936, "step": 4807 }, { "epoch": 0.10125142805999694, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.606, "step": 4808 }, { "epoch": 0.10127248700926067, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5909, "step": 4809 }, { "epoch": 0.1012935459585244, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5674, "step": 4810 }, { "epoch": 0.10131460490778812, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6087, "step": 4811 }, { "epoch": 0.10133566385705185, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.622, "step": 4812 }, { "epoch": 0.10135672280631558, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.575, "step": 4813 }, { "epoch": 0.10137778175557931, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5611, "step": 4814 }, { "epoch": 0.10139884070484304, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6018, "step": 4815 }, { "epoch": 0.10141989965410676, "grad_norm": 0.17578125, "learning_rate": 0.0002, "loss": 1.5743, "step": 4816 }, { "epoch": 0.10144095860337049, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5478, "step": 4817 }, { "epoch": 0.10146201755263422, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5565, "step": 4818 }, { "epoch": 0.10148307650189793, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5672, "step": 4819 }, { "epoch": 0.10150413545116166, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5675, "step": 4820 }, { "epoch": 0.10152519440042539, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5764, "step": 4821 }, { "epoch": 0.10154625334968911, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5769, "step": 4822 }, { "epoch": 0.10156731229895284, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5642, "step": 4823 }, { "epoch": 0.10158837124821657, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5429, "step": 4824 }, { "epoch": 0.1016094301974803, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.583, "step": 4825 }, { "epoch": 0.10163048914674402, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5974, "step": 4826 }, { "epoch": 0.10165154809600775, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.594, "step": 4827 }, { "epoch": 0.10167260704527148, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5982, "step": 4828 }, { "epoch": 0.1016936659945352, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5589, "step": 4829 }, { "epoch": 0.10171472494379893, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5943, "step": 4830 }, { "epoch": 0.10173578389306266, "grad_norm": 0.1943359375, "learning_rate": 0.0002, "loss": 1.5532, "step": 4831 }, { "epoch": 0.10175684284232638, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6086, "step": 4832 }, { "epoch": 0.1017779017915901, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5628, "step": 4833 }, { "epoch": 0.10179896074085383, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.592, "step": 4834 }, { "epoch": 0.10182001969011756, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5747, "step": 4835 }, { "epoch": 0.10184107863938129, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5881, "step": 4836 }, { "epoch": 0.10186213758864501, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6124, "step": 4837 }, { "epoch": 0.10188319653790874, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5608, "step": 4838 }, { "epoch": 0.10190425548717247, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5702, "step": 4839 }, { "epoch": 0.1019253144364362, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5581, "step": 4840 }, { "epoch": 0.10194637338569992, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5612, "step": 4841 }, { "epoch": 0.10196743233496365, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5977, "step": 4842 }, { "epoch": 0.10198849128422738, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5574, "step": 4843 }, { "epoch": 0.1020095502334911, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.603, "step": 4844 }, { "epoch": 0.10203060918275483, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5731, "step": 4845 }, { "epoch": 0.10205166813201855, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6153, "step": 4846 }, { "epoch": 0.10207272708128227, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5863, "step": 4847 }, { "epoch": 0.102093786030546, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5806, "step": 4848 }, { "epoch": 0.10211484497980973, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5815, "step": 4849 }, { "epoch": 0.10213590392907346, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5704, "step": 4850 }, { "epoch": 0.10215696287833718, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5774, "step": 4851 }, { "epoch": 0.10217802182760091, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5596, "step": 4852 }, { "epoch": 0.10219908077686464, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5873, "step": 4853 }, { "epoch": 0.10222013972612837, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5946, "step": 4854 }, { "epoch": 0.1022411986753921, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5588, "step": 4855 }, { "epoch": 0.10226225762465582, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5951, "step": 4856 }, { "epoch": 0.10228331657391955, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.597, "step": 4857 }, { "epoch": 0.10230437552318328, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.578, "step": 4858 }, { "epoch": 0.10232543447244699, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.567, "step": 4859 }, { "epoch": 0.10234649342171072, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5895, "step": 4860 }, { "epoch": 0.10236755237097445, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5981, "step": 4861 }, { "epoch": 0.10238861132023817, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.609, "step": 4862 }, { "epoch": 0.1024096702695019, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5454, "step": 4863 }, { "epoch": 0.10243072921876563, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5611, "step": 4864 }, { "epoch": 0.10245178816802936, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6253, "step": 4865 }, { "epoch": 0.10247284711729308, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5831, "step": 4866 }, { "epoch": 0.10249390606655681, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5754, "step": 4867 }, { "epoch": 0.10251496501582054, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5888, "step": 4868 }, { "epoch": 0.10253602396508427, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5743, "step": 4869 }, { "epoch": 0.102557082914348, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5945, "step": 4870 }, { "epoch": 0.10257814186361172, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.5677, "step": 4871 }, { "epoch": 0.10259920081287544, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6344, "step": 4872 }, { "epoch": 0.10262025976213916, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6317, "step": 4873 }, { "epoch": 0.10264131871140289, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5644, "step": 4874 }, { "epoch": 0.10266237766066662, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6169, "step": 4875 }, { "epoch": 0.10268343660993035, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5953, "step": 4876 }, { "epoch": 0.10270449555919407, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5414, "step": 4877 }, { "epoch": 0.1027255545084578, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5563, "step": 4878 }, { "epoch": 0.10274661345772153, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5852, "step": 4879 }, { "epoch": 0.10276767240698526, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5693, "step": 4880 }, { "epoch": 0.10278873135624898, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5896, "step": 4881 }, { "epoch": 0.10280979030551271, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6075, "step": 4882 }, { "epoch": 0.10283084925477644, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5907, "step": 4883 }, { "epoch": 0.10285190820404017, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5803, "step": 4884 }, { "epoch": 0.10287296715330388, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5848, "step": 4885 }, { "epoch": 0.1028940261025676, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5559, "step": 4886 }, { "epoch": 0.10291508505183133, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5638, "step": 4887 }, { "epoch": 0.10293614400109506, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5466, "step": 4888 }, { "epoch": 0.10295720295035879, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.6031, "step": 4889 }, { "epoch": 0.10297826189962252, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.5845, "step": 4890 }, { "epoch": 0.10299932084888624, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.602, "step": 4891 }, { "epoch": 0.10302037979814997, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5532, "step": 4892 }, { "epoch": 0.1030414387474137, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5861, "step": 4893 }, { "epoch": 0.10306249769667743, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.6094, "step": 4894 }, { "epoch": 0.10308355664594115, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5706, "step": 4895 }, { "epoch": 0.10310461559520488, "grad_norm": 0.185546875, "learning_rate": 0.0002, "loss": 1.6062, "step": 4896 }, { "epoch": 0.10312567454446861, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.558, "step": 4897 }, { "epoch": 0.10314673349373234, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6091, "step": 4898 }, { "epoch": 0.10316779244299605, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.5385, "step": 4899 }, { "epoch": 0.10318885139225978, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5819, "step": 4900 }, { "epoch": 0.1032099103415235, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5955, "step": 4901 }, { "epoch": 0.10323096929078723, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5784, "step": 4902 }, { "epoch": 0.10325202824005096, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5531, "step": 4903 }, { "epoch": 0.10327308718931469, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5678, "step": 4904 }, { "epoch": 0.10329414613857842, "grad_norm": 0.1396484375, "learning_rate": 0.0002, "loss": 1.585, "step": 4905 }, { "epoch": 0.10331520508784214, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5931, "step": 4906 }, { "epoch": 0.10333626403710587, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5806, "step": 4907 }, { "epoch": 0.1033573229863696, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6001, "step": 4908 }, { "epoch": 0.10337838193563333, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5692, "step": 4909 }, { "epoch": 0.10339944088489705, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5904, "step": 4910 }, { "epoch": 0.10342049983416078, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.594, "step": 4911 }, { "epoch": 0.1034415587834245, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5801, "step": 4912 }, { "epoch": 0.10346261773268822, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5826, "step": 4913 }, { "epoch": 0.10348367668195195, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.569, "step": 4914 }, { "epoch": 0.10350473563121568, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5578, "step": 4915 }, { "epoch": 0.1035257945804794, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5837, "step": 4916 }, { "epoch": 0.10354685352974313, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5951, "step": 4917 }, { "epoch": 0.10356791247900686, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5623, "step": 4918 }, { "epoch": 0.10358897142827059, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5646, "step": 4919 }, { "epoch": 0.10361003037753432, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5646, "step": 4920 }, { "epoch": 0.10363108932679804, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.596, "step": 4921 }, { "epoch": 0.10365214827606177, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5915, "step": 4922 }, { "epoch": 0.1036732072253255, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5872, "step": 4923 }, { "epoch": 0.10369426617458923, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5703, "step": 4924 }, { "epoch": 0.10371532512385294, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5795, "step": 4925 }, { "epoch": 0.10373638407311667, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5715, "step": 4926 }, { "epoch": 0.1037574430223804, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5886, "step": 4927 }, { "epoch": 0.10377850197164412, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6063, "step": 4928 }, { "epoch": 0.10379956092090785, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5746, "step": 4929 }, { "epoch": 0.10382061987017158, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5695, "step": 4930 }, { "epoch": 0.1038416788194353, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5428, "step": 4931 }, { "epoch": 0.10386273776869903, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5612, "step": 4932 }, { "epoch": 0.10388379671796276, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5604, "step": 4933 }, { "epoch": 0.10390485566722649, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5352, "step": 4934 }, { "epoch": 0.10392591461649021, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5679, "step": 4935 }, { "epoch": 0.10394697356575394, "grad_norm": 0.1953125, "learning_rate": 0.0002, "loss": 1.5609, "step": 4936 }, { "epoch": 0.10396803251501767, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5896, "step": 4937 }, { "epoch": 0.10398909146428138, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.589, "step": 4938 }, { "epoch": 0.10401015041354511, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5784, "step": 4939 }, { "epoch": 0.10403120936280884, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6108, "step": 4940 }, { "epoch": 0.10405226831207257, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5645, "step": 4941 }, { "epoch": 0.1040733272613363, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5961, "step": 4942 }, { "epoch": 0.10409438621060002, "grad_norm": 0.138671875, "learning_rate": 0.0002, "loss": 1.6042, "step": 4943 }, { "epoch": 0.10411544515986375, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5785, "step": 4944 }, { "epoch": 0.10413650410912748, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5824, "step": 4945 }, { "epoch": 0.1041575630583912, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5756, "step": 4946 }, { "epoch": 0.10417862200765493, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.592, "step": 4947 }, { "epoch": 0.10419968095691866, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6292, "step": 4948 }, { "epoch": 0.10422073990618239, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6209, "step": 4949 }, { "epoch": 0.10424179885544611, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5519, "step": 4950 }, { "epoch": 0.10426285780470984, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5642, "step": 4951 }, { "epoch": 0.10428391675397355, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.593, "step": 4952 }, { "epoch": 0.10430497570323728, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5932, "step": 4953 }, { "epoch": 0.10432603465250101, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5937, "step": 4954 }, { "epoch": 0.10434709360176474, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5626, "step": 4955 }, { "epoch": 0.10436815255102846, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5514, "step": 4956 }, { "epoch": 0.10438921150029219, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5662, "step": 4957 }, { "epoch": 0.10441027044955592, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6007, "step": 4958 }, { "epoch": 0.10443132939881965, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5734, "step": 4959 }, { "epoch": 0.10445238834808337, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5737, "step": 4960 }, { "epoch": 0.1044734472973471, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5901, "step": 4961 }, { "epoch": 0.10449450624661083, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5684, "step": 4962 }, { "epoch": 0.10451556519587456, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5872, "step": 4963 }, { "epoch": 0.10453662414513828, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.607, "step": 4964 }, { "epoch": 0.104557683094402, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5951, "step": 4965 }, { "epoch": 0.10457874204366573, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6145, "step": 4966 }, { "epoch": 0.10459980099292945, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6245, "step": 4967 }, { "epoch": 0.10462085994219318, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.5643, "step": 4968 }, { "epoch": 0.10464191889145691, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5588, "step": 4969 }, { "epoch": 0.10466297784072064, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6144, "step": 4970 }, { "epoch": 0.10468403678998436, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5883, "step": 4971 }, { "epoch": 0.10470509573924809, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6145, "step": 4972 }, { "epoch": 0.10472615468851182, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.577, "step": 4973 }, { "epoch": 0.10474721363777555, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5911, "step": 4974 }, { "epoch": 0.10476827258703927, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6318, "step": 4975 }, { "epoch": 0.104789331536303, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5619, "step": 4976 }, { "epoch": 0.10481039048556673, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5573, "step": 4977 }, { "epoch": 0.10483144943483044, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5739, "step": 4978 }, { "epoch": 0.10485250838409417, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5615, "step": 4979 }, { "epoch": 0.1048735673333579, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5528, "step": 4980 }, { "epoch": 0.10489462628262163, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5809, "step": 4981 }, { "epoch": 0.10491568523188535, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.563, "step": 4982 }, { "epoch": 0.10493674418114908, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5721, "step": 4983 }, { "epoch": 0.10495780313041281, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6004, "step": 4984 }, { "epoch": 0.10497886207967654, "grad_norm": 0.1357421875, "learning_rate": 0.0002, "loss": 1.619, "step": 4985 }, { "epoch": 0.10499992102894026, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5889, "step": 4986 }, { "epoch": 0.10502097997820399, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5847, "step": 4987 }, { "epoch": 0.10504203892746772, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5467, "step": 4988 }, { "epoch": 0.10506309787673145, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.566, "step": 4989 }, { "epoch": 0.10508415682599517, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.547, "step": 4990 }, { "epoch": 0.10510521577525889, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5939, "step": 4991 }, { "epoch": 0.10512627472452261, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5902, "step": 4992 }, { "epoch": 0.10514733367378634, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6007, "step": 4993 }, { "epoch": 0.10516839262305007, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5982, "step": 4994 }, { "epoch": 0.1051894515723138, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.5839, "step": 4995 }, { "epoch": 0.10521051052157752, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.589, "step": 4996 }, { "epoch": 0.10523156947084125, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5983, "step": 4997 }, { "epoch": 0.10525262842010498, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5936, "step": 4998 }, { "epoch": 0.10527368736936871, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5952, "step": 4999 }, { "epoch": 0.10529474631863243, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5795, "step": 5000 }, { "epoch": 0.10529474631863243, "eval_loss": 2.2983527183532715, "eval_runtime": 908.9901, "eval_samples_per_second": 67.988, "eval_steps_per_second": 2.125, "step": 5000 }, { "epoch": 0.10531580526789616, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5877, "step": 5001 }, { "epoch": 0.10533686421715989, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5572, "step": 5002 }, { "epoch": 0.10535792316642362, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5884, "step": 5003 }, { "epoch": 0.10537898211568734, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5688, "step": 5004 }, { "epoch": 0.10540004106495106, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.552, "step": 5005 }, { "epoch": 0.10542110001421479, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5849, "step": 5006 }, { "epoch": 0.10544215896347851, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6067, "step": 5007 }, { "epoch": 0.10546321791274224, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5991, "step": 5008 }, { "epoch": 0.10548427686200597, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5874, "step": 5009 }, { "epoch": 0.1055053358112697, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6083, "step": 5010 }, { "epoch": 0.10552639476053342, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.6053, "step": 5011 }, { "epoch": 0.10554745370979715, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5833, "step": 5012 }, { "epoch": 0.10556851265906088, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5442, "step": 5013 }, { "epoch": 0.1055895716083246, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5651, "step": 5014 }, { "epoch": 0.10561063055758833, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5951, "step": 5015 }, { "epoch": 0.10563168950685206, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5547, "step": 5016 }, { "epoch": 0.10565274845611579, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5629, "step": 5017 }, { "epoch": 0.1056738074053795, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5629, "step": 5018 }, { "epoch": 0.10569486635464323, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.584, "step": 5019 }, { "epoch": 0.10571592530390696, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5783, "step": 5020 }, { "epoch": 0.10573698425317068, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5965, "step": 5021 }, { "epoch": 0.10575804320243441, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5732, "step": 5022 }, { "epoch": 0.10577910215169814, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6001, "step": 5023 }, { "epoch": 0.10580016110096187, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6072, "step": 5024 }, { "epoch": 0.1058212200502256, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5861, "step": 5025 }, { "epoch": 0.10584227899948932, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.574, "step": 5026 }, { "epoch": 0.10586333794875305, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5816, "step": 5027 }, { "epoch": 0.10588439689801678, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.597, "step": 5028 }, { "epoch": 0.1059054558472805, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5803, "step": 5029 }, { "epoch": 0.10592651479654423, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6053, "step": 5030 }, { "epoch": 0.10594757374580795, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5483, "step": 5031 }, { "epoch": 0.10596863269507167, "grad_norm": 0.1875, "learning_rate": 0.0002, "loss": 1.5434, "step": 5032 }, { "epoch": 0.1059896916443354, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5883, "step": 5033 }, { "epoch": 0.10601075059359913, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.592, "step": 5034 }, { "epoch": 0.10603180954286286, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5701, "step": 5035 }, { "epoch": 0.10605286849212658, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5742, "step": 5036 }, { "epoch": 0.10607392744139031, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5961, "step": 5037 }, { "epoch": 0.10609498639065404, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.5982, "step": 5038 }, { "epoch": 0.10611604533991777, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5774, "step": 5039 }, { "epoch": 0.1061371042891815, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.556, "step": 5040 }, { "epoch": 0.10615816323844522, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.57, "step": 5041 }, { "epoch": 0.10617922218770895, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.599, "step": 5042 }, { "epoch": 0.10620028113697268, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5853, "step": 5043 }, { "epoch": 0.10622134008623639, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5436, "step": 5044 }, { "epoch": 0.10624239903550012, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5619, "step": 5045 }, { "epoch": 0.10626345798476385, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.584, "step": 5046 }, { "epoch": 0.10628451693402757, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6059, "step": 5047 }, { "epoch": 0.1063055758832913, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.6124, "step": 5048 }, { "epoch": 0.10632663483255503, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5814, "step": 5049 }, { "epoch": 0.10634769378181876, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5583, "step": 5050 }, { "epoch": 0.10636875273108248, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5633, "step": 5051 }, { "epoch": 0.10638981168034621, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5957, "step": 5052 }, { "epoch": 0.10641087062960994, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5357, "step": 5053 }, { "epoch": 0.10643192957887367, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5512, "step": 5054 }, { "epoch": 0.1064529885281374, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5994, "step": 5055 }, { "epoch": 0.10647404747740112, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5752, "step": 5056 }, { "epoch": 0.10649510642666485, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6052, "step": 5057 }, { "epoch": 0.10651616537592856, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5898, "step": 5058 }, { "epoch": 0.10653722432519229, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5859, "step": 5059 }, { "epoch": 0.10655828327445602, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.568, "step": 5060 }, { "epoch": 0.10657934222371974, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.594, "step": 5061 }, { "epoch": 0.10660040117298347, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5896, "step": 5062 }, { "epoch": 0.1066214601222472, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5987, "step": 5063 }, { "epoch": 0.10664251907151093, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5697, "step": 5064 }, { "epoch": 0.10666357802077465, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.568, "step": 5065 }, { "epoch": 0.10668463697003838, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5795, "step": 5066 }, { "epoch": 0.10670569591930211, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5785, "step": 5067 }, { "epoch": 0.10672675486856584, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.543, "step": 5068 }, { "epoch": 0.10674781381782956, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5794, "step": 5069 }, { "epoch": 0.10676887276709329, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5456, "step": 5070 }, { "epoch": 0.106789931716357, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.572, "step": 5071 }, { "epoch": 0.10681099066562073, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5465, "step": 5072 }, { "epoch": 0.10683204961488446, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5574, "step": 5073 }, { "epoch": 0.10685310856414819, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5703, "step": 5074 }, { "epoch": 0.10687416751341192, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5909, "step": 5075 }, { "epoch": 0.10689522646267564, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5884, "step": 5076 }, { "epoch": 0.10691628541193937, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5698, "step": 5077 }, { "epoch": 0.1069373443612031, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5978, "step": 5078 }, { "epoch": 0.10695840331046683, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5992, "step": 5079 }, { "epoch": 0.10697946225973055, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5883, "step": 5080 }, { "epoch": 0.10700052120899428, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5926, "step": 5081 }, { "epoch": 0.10702158015825801, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5791, "step": 5082 }, { "epoch": 0.10704263910752174, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5717, "step": 5083 }, { "epoch": 0.10706369805678545, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5728, "step": 5084 }, { "epoch": 0.10708475700604918, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5631, "step": 5085 }, { "epoch": 0.1071058159553129, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6017, "step": 5086 }, { "epoch": 0.10712687490457663, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5889, "step": 5087 }, { "epoch": 0.10714793385384036, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5647, "step": 5088 }, { "epoch": 0.10716899280310409, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5824, "step": 5089 }, { "epoch": 0.10719005175236782, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5736, "step": 5090 }, { "epoch": 0.10721111070163154, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6236, "step": 5091 }, { "epoch": 0.10723216965089527, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5461, "step": 5092 }, { "epoch": 0.107253228600159, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5999, "step": 5093 }, { "epoch": 0.10727428754942273, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.576, "step": 5094 }, { "epoch": 0.10729534649868645, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5385, "step": 5095 }, { "epoch": 0.10731640544795018, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5842, "step": 5096 }, { "epoch": 0.1073374643972139, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5614, "step": 5097 }, { "epoch": 0.10735852334647762, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6028, "step": 5098 }, { "epoch": 0.10737958229574135, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5764, "step": 5099 }, { "epoch": 0.10740064124500508, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5836, "step": 5100 }, { "epoch": 0.1074217001942688, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5847, "step": 5101 }, { "epoch": 0.10744275914353253, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5996, "step": 5102 }, { "epoch": 0.10746381809279626, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6194, "step": 5103 }, { "epoch": 0.10748487704205999, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5639, "step": 5104 }, { "epoch": 0.10750593599132371, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5961, "step": 5105 }, { "epoch": 0.10752699494058744, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5944, "step": 5106 }, { "epoch": 0.10754805388985117, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5796, "step": 5107 }, { "epoch": 0.1075691128391149, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5355, "step": 5108 }, { "epoch": 0.10759017178837862, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.5677, "step": 5109 }, { "epoch": 0.10761123073764235, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5873, "step": 5110 }, { "epoch": 0.10763228968690607, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5757, "step": 5111 }, { "epoch": 0.1076533486361698, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5909, "step": 5112 }, { "epoch": 0.10767440758543352, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5969, "step": 5113 }, { "epoch": 0.10769546653469725, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.6183, "step": 5114 }, { "epoch": 0.10771652548396098, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5635, "step": 5115 }, { "epoch": 0.1077375844332247, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5932, "step": 5116 }, { "epoch": 0.10775864338248843, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5818, "step": 5117 }, { "epoch": 0.10777970233175216, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5889, "step": 5118 }, { "epoch": 0.10780076128101589, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5724, "step": 5119 }, { "epoch": 0.10782182023027961, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5715, "step": 5120 }, { "epoch": 0.10784287917954334, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5324, "step": 5121 }, { "epoch": 0.10786393812880707, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5722, "step": 5122 }, { "epoch": 0.1078849970780708, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5473, "step": 5123 }, { "epoch": 0.10790605602733451, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5598, "step": 5124 }, { "epoch": 0.10792711497659824, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5657, "step": 5125 }, { "epoch": 0.10794817392586196, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5816, "step": 5126 }, { "epoch": 0.10796923287512569, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.577, "step": 5127 }, { "epoch": 0.10799029182438942, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5868, "step": 5128 }, { "epoch": 0.10801135077365315, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5805, "step": 5129 }, { "epoch": 0.10803240972291688, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.587, "step": 5130 }, { "epoch": 0.1080534686721806, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5661, "step": 5131 }, { "epoch": 0.10807452762144433, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6286, "step": 5132 }, { "epoch": 0.10809558657070806, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.599, "step": 5133 }, { "epoch": 0.10811664551997179, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5453, "step": 5134 }, { "epoch": 0.10813770446923551, "grad_norm": 0.1416015625, "learning_rate": 0.0002, "loss": 1.5705, "step": 5135 }, { "epoch": 0.10815876341849924, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5694, "step": 5136 }, { "epoch": 0.10817982236776295, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.6127, "step": 5137 }, { "epoch": 0.10820088131702668, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.553, "step": 5138 }, { "epoch": 0.10822194026629041, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5719, "step": 5139 }, { "epoch": 0.10824299921555414, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5743, "step": 5140 }, { "epoch": 0.10826405816481786, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5776, "step": 5141 }, { "epoch": 0.10828511711408159, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5593, "step": 5142 }, { "epoch": 0.10830617606334532, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5555, "step": 5143 }, { "epoch": 0.10832723501260905, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5755, "step": 5144 }, { "epoch": 0.10834829396187277, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5793, "step": 5145 }, { "epoch": 0.1083693529111365, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.548, "step": 5146 }, { "epoch": 0.10839041186040023, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5994, "step": 5147 }, { "epoch": 0.10841147080966396, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5818, "step": 5148 }, { "epoch": 0.10843252975892768, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5822, "step": 5149 }, { "epoch": 0.1084535887081914, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.5955, "step": 5150 }, { "epoch": 0.10847464765745513, "grad_norm": 0.181640625, "learning_rate": 0.0002, "loss": 1.5635, "step": 5151 }, { "epoch": 0.10849570660671885, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5854, "step": 5152 }, { "epoch": 0.10851676555598258, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5592, "step": 5153 }, { "epoch": 0.10853782450524631, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5739, "step": 5154 }, { "epoch": 0.10855888345451004, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.578, "step": 5155 }, { "epoch": 0.10857994240377376, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5922, "step": 5156 }, { "epoch": 0.10860100135303749, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6204, "step": 5157 }, { "epoch": 0.10862206030230122, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5832, "step": 5158 }, { "epoch": 0.10864311925156495, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5465, "step": 5159 }, { "epoch": 0.10866417820082867, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5998, "step": 5160 }, { "epoch": 0.1086852371500924, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5541, "step": 5161 }, { "epoch": 0.10870629609935613, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5642, "step": 5162 }, { "epoch": 0.10872735504861986, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5657, "step": 5163 }, { "epoch": 0.10874841399788357, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5815, "step": 5164 }, { "epoch": 0.1087694729471473, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.59, "step": 5165 }, { "epoch": 0.10879053189641102, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5766, "step": 5166 }, { "epoch": 0.10881159084567475, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5954, "step": 5167 }, { "epoch": 0.10883264979493848, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5988, "step": 5168 }, { "epoch": 0.10885370874420221, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5494, "step": 5169 }, { "epoch": 0.10887476769346593, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5965, "step": 5170 }, { "epoch": 0.10889582664272966, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6017, "step": 5171 }, { "epoch": 0.10891688559199339, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5797, "step": 5172 }, { "epoch": 0.10893794454125712, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5661, "step": 5173 }, { "epoch": 0.10895900349052084, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.611, "step": 5174 }, { "epoch": 0.10898006243978457, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6259, "step": 5175 }, { "epoch": 0.1090011213890483, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5817, "step": 5176 }, { "epoch": 0.10902218033831201, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5861, "step": 5177 }, { "epoch": 0.10904323928757574, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.552, "step": 5178 }, { "epoch": 0.10906429823683947, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5803, "step": 5179 }, { "epoch": 0.1090853571861032, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5778, "step": 5180 }, { "epoch": 0.10910641613536692, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5706, "step": 5181 }, { "epoch": 0.10912747508463065, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5718, "step": 5182 }, { "epoch": 0.10914853403389438, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5881, "step": 5183 }, { "epoch": 0.1091695929831581, "grad_norm": 0.1376953125, "learning_rate": 0.0002, "loss": 1.5666, "step": 5184 }, { "epoch": 0.10919065193242183, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5943, "step": 5185 }, { "epoch": 0.10921171088168556, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6, "step": 5186 }, { "epoch": 0.10923276983094929, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5662, "step": 5187 }, { "epoch": 0.10925382878021302, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5766, "step": 5188 }, { "epoch": 0.10927488772947674, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5836, "step": 5189 }, { "epoch": 0.10929594667874046, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5714, "step": 5190 }, { "epoch": 0.10931700562800419, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5867, "step": 5191 }, { "epoch": 0.10933806457726791, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5771, "step": 5192 }, { "epoch": 0.10935912352653164, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5627, "step": 5193 }, { "epoch": 0.10938018247579537, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5891, "step": 5194 }, { "epoch": 0.1094012414250591, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5917, "step": 5195 }, { "epoch": 0.10942230037432282, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5678, "step": 5196 }, { "epoch": 0.10944335932358655, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5946, "step": 5197 }, { "epoch": 0.10946441827285028, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6273, "step": 5198 }, { "epoch": 0.109485477222114, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.572, "step": 5199 }, { "epoch": 0.10950653617137773, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5797, "step": 5200 }, { "epoch": 0.10952759512064146, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5816, "step": 5201 }, { "epoch": 0.10954865406990519, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5849, "step": 5202 }, { "epoch": 0.1095697130191689, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5933, "step": 5203 }, { "epoch": 0.10959077196843263, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5954, "step": 5204 }, { "epoch": 0.10961183091769636, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5984, "step": 5205 }, { "epoch": 0.10963288986696008, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5773, "step": 5206 }, { "epoch": 0.10965394881622381, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5774, "step": 5207 }, { "epoch": 0.10967500776548754, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5877, "step": 5208 }, { "epoch": 0.10969606671475127, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5948, "step": 5209 }, { "epoch": 0.109717125664015, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.63, "step": 5210 }, { "epoch": 0.10973818461327872, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.6111, "step": 5211 }, { "epoch": 0.10975924356254245, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5785, "step": 5212 }, { "epoch": 0.10978030251180618, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.6114, "step": 5213 }, { "epoch": 0.1098013614610699, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.5499, "step": 5214 }, { "epoch": 0.10982242041033363, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5681, "step": 5215 }, { "epoch": 0.10984347935959736, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5947, "step": 5216 }, { "epoch": 0.10986453830886107, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.581, "step": 5217 }, { "epoch": 0.1098855972581248, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5667, "step": 5218 }, { "epoch": 0.10990665620738853, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5796, "step": 5219 }, { "epoch": 0.10992771515665226, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5794, "step": 5220 }, { "epoch": 0.10994877410591598, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5737, "step": 5221 }, { "epoch": 0.10996983305517971, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6114, "step": 5222 }, { "epoch": 0.10999089200444344, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5648, "step": 5223 }, { "epoch": 0.11001195095370717, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5917, "step": 5224 }, { "epoch": 0.1100330099029709, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5727, "step": 5225 }, { "epoch": 0.11005406885223462, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5657, "step": 5226 }, { "epoch": 0.11007512780149835, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5701, "step": 5227 }, { "epoch": 0.11009618675076208, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.589, "step": 5228 }, { "epoch": 0.1101172457000258, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5689, "step": 5229 }, { "epoch": 0.11013830464928952, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5683, "step": 5230 }, { "epoch": 0.11015936359855324, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6193, "step": 5231 }, { "epoch": 0.11018042254781697, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5946, "step": 5232 }, { "epoch": 0.1102014814970807, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5773, "step": 5233 }, { "epoch": 0.11022254044634443, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5677, "step": 5234 }, { "epoch": 0.11024359939560816, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5716, "step": 5235 }, { "epoch": 0.11026465834487188, "grad_norm": 0.1787109375, "learning_rate": 0.0002, "loss": 1.5938, "step": 5236 }, { "epoch": 0.11028571729413561, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5791, "step": 5237 }, { "epoch": 0.11030677624339934, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5624, "step": 5238 }, { "epoch": 0.11032783519266307, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6113, "step": 5239 }, { "epoch": 0.11034889414192679, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6147, "step": 5240 }, { "epoch": 0.11036995309119052, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6148, "step": 5241 }, { "epoch": 0.11039101204045425, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5825, "step": 5242 }, { "epoch": 0.11041207098971796, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5944, "step": 5243 }, { "epoch": 0.11043312993898169, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5752, "step": 5244 }, { "epoch": 0.11045418888824542, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5438, "step": 5245 }, { "epoch": 0.11047524783750914, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5494, "step": 5246 }, { "epoch": 0.11049630678677287, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5801, "step": 5247 }, { "epoch": 0.1105173657360366, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.559, "step": 5248 }, { "epoch": 0.11053842468530033, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5978, "step": 5249 }, { "epoch": 0.11055948363456405, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.55, "step": 5250 }, { "epoch": 0.11058054258382778, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5341, "step": 5251 }, { "epoch": 0.11060160153309151, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5734, "step": 5252 }, { "epoch": 0.11062266048235524, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5993, "step": 5253 }, { "epoch": 0.11064371943161896, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5763, "step": 5254 }, { "epoch": 0.11066477838088269, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5968, "step": 5255 }, { "epoch": 0.1106858373301464, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5602, "step": 5256 }, { "epoch": 0.11070689627941013, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5588, "step": 5257 }, { "epoch": 0.11072795522867386, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5588, "step": 5258 }, { "epoch": 0.11074901417793759, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6229, "step": 5259 }, { "epoch": 0.11077007312720132, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5807, "step": 5260 }, { "epoch": 0.11079113207646504, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5508, "step": 5261 }, { "epoch": 0.11081219102572877, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5502, "step": 5262 }, { "epoch": 0.1108332499749925, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5654, "step": 5263 }, { "epoch": 0.11085430892425623, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5988, "step": 5264 }, { "epoch": 0.11087536787351995, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5903, "step": 5265 }, { "epoch": 0.11089642682278368, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5992, "step": 5266 }, { "epoch": 0.11091748577204741, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.61, "step": 5267 }, { "epoch": 0.11093854472131114, "grad_norm": 0.1748046875, "learning_rate": 0.0002, "loss": 1.598, "step": 5268 }, { "epoch": 0.11095960367057486, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.6084, "step": 5269 }, { "epoch": 0.11098066261983858, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6052, "step": 5270 }, { "epoch": 0.1110017215691023, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5933, "step": 5271 }, { "epoch": 0.11102278051836603, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5867, "step": 5272 }, { "epoch": 0.11104383946762976, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5912, "step": 5273 }, { "epoch": 0.11106489841689349, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5827, "step": 5274 }, { "epoch": 0.11108595736615721, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.5917, "step": 5275 }, { "epoch": 0.11110701631542094, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5671, "step": 5276 }, { "epoch": 0.11112807526468467, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.5716, "step": 5277 }, { "epoch": 0.1111491342139484, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5992, "step": 5278 }, { "epoch": 0.11117019316321212, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5805, "step": 5279 }, { "epoch": 0.11119125211247585, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5933, "step": 5280 }, { "epoch": 0.11121231106173958, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5723, "step": 5281 }, { "epoch": 0.11123337001100331, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6037, "step": 5282 }, { "epoch": 0.11125442896026702, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5609, "step": 5283 }, { "epoch": 0.11127548790953075, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.563, "step": 5284 }, { "epoch": 0.11129654685879448, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5795, "step": 5285 }, { "epoch": 0.1113176058080582, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5911, "step": 5286 }, { "epoch": 0.11133866475732193, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5958, "step": 5287 }, { "epoch": 0.11135972370658566, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5768, "step": 5288 }, { "epoch": 0.11138078265584939, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5656, "step": 5289 }, { "epoch": 0.11140184160511311, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5752, "step": 5290 }, { "epoch": 0.11142290055437684, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.5812, "step": 5291 }, { "epoch": 0.11144395950364057, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.5645, "step": 5292 }, { "epoch": 0.1114650184529043, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.572, "step": 5293 }, { "epoch": 0.11148607740216802, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5825, "step": 5294 }, { "epoch": 0.11150713635143175, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6001, "step": 5295 }, { "epoch": 0.11152819530069547, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5634, "step": 5296 }, { "epoch": 0.11154925424995919, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5846, "step": 5297 }, { "epoch": 0.11157031319922292, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5638, "step": 5298 }, { "epoch": 0.11159137214848665, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5392, "step": 5299 }, { "epoch": 0.11161243109775038, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5668, "step": 5300 }, { "epoch": 0.1116334900470141, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5751, "step": 5301 }, { "epoch": 0.11165454899627783, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6205, "step": 5302 }, { "epoch": 0.11167560794554156, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.569, "step": 5303 }, { "epoch": 0.11169666689480529, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5713, "step": 5304 }, { "epoch": 0.11171772584406901, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6086, "step": 5305 }, { "epoch": 0.11173878479333274, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6032, "step": 5306 }, { "epoch": 0.11175984374259647, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5887, "step": 5307 }, { "epoch": 0.1117809026918602, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5904, "step": 5308 }, { "epoch": 0.11180196164112391, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.6022, "step": 5309 }, { "epoch": 0.11182302059038764, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6052, "step": 5310 }, { "epoch": 0.11184407953965136, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5822, "step": 5311 }, { "epoch": 0.11186513848891509, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5844, "step": 5312 }, { "epoch": 0.11188619743817882, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.6124, "step": 5313 }, { "epoch": 0.11190725638744255, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5607, "step": 5314 }, { "epoch": 0.11192831533670627, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5705, "step": 5315 }, { "epoch": 0.11194937428597, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5617, "step": 5316 }, { "epoch": 0.11197043323523373, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6148, "step": 5317 }, { "epoch": 0.11199149218449746, "grad_norm": 0.140625, "learning_rate": 0.0002, "loss": 1.5705, "step": 5318 }, { "epoch": 0.11201255113376118, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5304, "step": 5319 }, { "epoch": 0.11203361008302491, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5561, "step": 5320 }, { "epoch": 0.11205466903228864, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5424, "step": 5321 }, { "epoch": 0.11207572798155237, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5863, "step": 5322 }, { "epoch": 0.11209678693081608, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6091, "step": 5323 }, { "epoch": 0.11211784588007981, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5968, "step": 5324 }, { "epoch": 0.11213890482934354, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5568, "step": 5325 }, { "epoch": 0.11215996377860726, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5975, "step": 5326 }, { "epoch": 0.11218102272787099, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5702, "step": 5327 }, { "epoch": 0.11220208167713472, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.588, "step": 5328 }, { "epoch": 0.11222314062639845, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5807, "step": 5329 }, { "epoch": 0.11224419957566217, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5695, "step": 5330 }, { "epoch": 0.1122652585249259, "grad_norm": 0.169921875, "learning_rate": 0.0002, "loss": 1.5798, "step": 5331 }, { "epoch": 0.11228631747418963, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.595, "step": 5332 }, { "epoch": 0.11230737642345336, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.595, "step": 5333 }, { "epoch": 0.11232843537271708, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5856, "step": 5334 }, { "epoch": 0.11234949432198081, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.603, "step": 5335 }, { "epoch": 0.11237055327124452, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6207, "step": 5336 }, { "epoch": 0.11239161222050825, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5916, "step": 5337 }, { "epoch": 0.11241267116977198, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5937, "step": 5338 }, { "epoch": 0.11243373011903571, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5823, "step": 5339 }, { "epoch": 0.11245478906829943, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5692, "step": 5340 }, { "epoch": 0.11247584801756316, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.6058, "step": 5341 }, { "epoch": 0.11249690696682689, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5844, "step": 5342 }, { "epoch": 0.11251796591609062, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5735, "step": 5343 }, { "epoch": 0.11253902486535435, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6018, "step": 5344 }, { "epoch": 0.11256008381461807, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5645, "step": 5345 }, { "epoch": 0.1125811427638818, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5649, "step": 5346 }, { "epoch": 0.11260220171314553, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.6068, "step": 5347 }, { "epoch": 0.11262326066240926, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5468, "step": 5348 }, { "epoch": 0.11264431961167297, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5454, "step": 5349 }, { "epoch": 0.1126653785609367, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5777, "step": 5350 }, { "epoch": 0.11268643751020042, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5436, "step": 5351 }, { "epoch": 0.11270749645946415, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5846, "step": 5352 }, { "epoch": 0.11272855540872788, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5699, "step": 5353 }, { "epoch": 0.1127496143579916, "grad_norm": 0.177734375, "learning_rate": 0.0002, "loss": 1.6036, "step": 5354 }, { "epoch": 0.11277067330725533, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5748, "step": 5355 }, { "epoch": 0.11279173225651906, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5741, "step": 5356 }, { "epoch": 0.11281279120578279, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6056, "step": 5357 }, { "epoch": 0.11283385015504652, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5673, "step": 5358 }, { "epoch": 0.11285490910431024, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6194, "step": 5359 }, { "epoch": 0.11287596805357397, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5633, "step": 5360 }, { "epoch": 0.1128970270028377, "grad_norm": 0.173828125, "learning_rate": 0.0002, "loss": 1.5882, "step": 5361 }, { "epoch": 0.11291808595210141, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5791, "step": 5362 }, { "epoch": 0.11293914490136514, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5739, "step": 5363 }, { "epoch": 0.11296020385062887, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5563, "step": 5364 }, { "epoch": 0.1129812627998926, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5849, "step": 5365 }, { "epoch": 0.11300232174915632, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5887, "step": 5366 }, { "epoch": 0.11302338069842005, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.551, "step": 5367 }, { "epoch": 0.11304443964768378, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5716, "step": 5368 }, { "epoch": 0.1130654985969475, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5929, "step": 5369 }, { "epoch": 0.11308655754621123, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5775, "step": 5370 }, { "epoch": 0.11310761649547496, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.596, "step": 5371 }, { "epoch": 0.11312867544473869, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.6188, "step": 5372 }, { "epoch": 0.11314973439400242, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6115, "step": 5373 }, { "epoch": 0.11317079334326614, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6004, "step": 5374 }, { "epoch": 0.11319185229252987, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5701, "step": 5375 }, { "epoch": 0.11321291124179358, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6098, "step": 5376 }, { "epoch": 0.11323397019105731, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5986, "step": 5377 }, { "epoch": 0.11325502914032104, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.6023, "step": 5378 }, { "epoch": 0.11327608808958477, "grad_norm": 0.1806640625, "learning_rate": 0.0002, "loss": 1.5505, "step": 5379 }, { "epoch": 0.1132971470388485, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5682, "step": 5380 }, { "epoch": 0.11331820598811222, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.6159, "step": 5381 }, { "epoch": 0.11333926493737595, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5783, "step": 5382 }, { "epoch": 0.11336032388663968, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5449, "step": 5383 }, { "epoch": 0.1133813828359034, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5492, "step": 5384 }, { "epoch": 0.11340244178516713, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5663, "step": 5385 }, { "epoch": 0.11342350073443086, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5579, "step": 5386 }, { "epoch": 0.11344455968369459, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5808, "step": 5387 }, { "epoch": 0.11346561863295831, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.589, "step": 5388 }, { "epoch": 0.11348667758222203, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5899, "step": 5389 }, { "epoch": 0.11350773653148576, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5711, "step": 5390 }, { "epoch": 0.11352879548074948, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5641, "step": 5391 }, { "epoch": 0.11354985443001321, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5746, "step": 5392 }, { "epoch": 0.11357091337927694, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5703, "step": 5393 }, { "epoch": 0.11359197232854067, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5952, "step": 5394 }, { "epoch": 0.1136130312778044, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5547, "step": 5395 }, { "epoch": 0.11363409022706812, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.5511, "step": 5396 }, { "epoch": 0.11365514917633185, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.5833, "step": 5397 }, { "epoch": 0.11367620812559558, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5566, "step": 5398 }, { "epoch": 0.1136972670748593, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5658, "step": 5399 }, { "epoch": 0.11371832602412303, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5584, "step": 5400 }, { "epoch": 0.11373938497338676, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.5737, "step": 5401 }, { "epoch": 0.11376044392265047, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6198, "step": 5402 }, { "epoch": 0.1137815028719142, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5585, "step": 5403 }, { "epoch": 0.11380256182117793, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5867, "step": 5404 }, { "epoch": 0.11382362077044166, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.572, "step": 5405 }, { "epoch": 0.11384467971970538, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5758, "step": 5406 }, { "epoch": 0.11386573866896911, "grad_norm": 0.1435546875, "learning_rate": 0.0002, "loss": 1.5436, "step": 5407 }, { "epoch": 0.11388679761823284, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5863, "step": 5408 }, { "epoch": 0.11390785656749657, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5911, "step": 5409 }, { "epoch": 0.11392891551676029, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5873, "step": 5410 }, { "epoch": 0.11394997446602402, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5915, "step": 5411 }, { "epoch": 0.11397103341528775, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5908, "step": 5412 }, { "epoch": 0.11399209236455148, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5507, "step": 5413 }, { "epoch": 0.1140131513138152, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5821, "step": 5414 }, { "epoch": 0.11403421026307893, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.6239, "step": 5415 }, { "epoch": 0.11405526921234264, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6023, "step": 5416 }, { "epoch": 0.11407632816160637, "grad_norm": 0.18359375, "learning_rate": 0.0002, "loss": 1.5531, "step": 5417 }, { "epoch": 0.1140973871108701, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.573, "step": 5418 }, { "epoch": 0.11411844606013383, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5759, "step": 5419 }, { "epoch": 0.11413950500939755, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5825, "step": 5420 }, { "epoch": 0.11416056395866128, "grad_norm": 0.1533203125, "learning_rate": 0.0002, "loss": 1.5533, "step": 5421 }, { "epoch": 0.11418162290792501, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.589, "step": 5422 }, { "epoch": 0.11420268185718874, "grad_norm": 0.1708984375, "learning_rate": 0.0002, "loss": 1.5615, "step": 5423 }, { "epoch": 0.11422374080645246, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.6026, "step": 5424 }, { "epoch": 0.11424479975571619, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5761, "step": 5425 }, { "epoch": 0.11426585870497992, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5704, "step": 5426 }, { "epoch": 0.11428691765424365, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5956, "step": 5427 }, { "epoch": 0.11430797660350737, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.548, "step": 5428 }, { "epoch": 0.11432903555277109, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6072, "step": 5429 }, { "epoch": 0.11435009450203482, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5926, "step": 5430 }, { "epoch": 0.11437115345129854, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5584, "step": 5431 }, { "epoch": 0.11439221240056227, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5815, "step": 5432 }, { "epoch": 0.114413271349826, "grad_norm": 0.171875, "learning_rate": 0.0002, "loss": 1.5428, "step": 5433 }, { "epoch": 0.11443433029908973, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.5716, "step": 5434 }, { "epoch": 0.11445538924835345, "grad_norm": 0.1669921875, "learning_rate": 0.0002, "loss": 1.5553, "step": 5435 }, { "epoch": 0.11447644819761718, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5829, "step": 5436 }, { "epoch": 0.11449750714688091, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5924, "step": 5437 }, { "epoch": 0.11451856609614464, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5867, "step": 5438 }, { "epoch": 0.11453962504540836, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5859, "step": 5439 }, { "epoch": 0.11456068399467209, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5278, "step": 5440 }, { "epoch": 0.11458174294393582, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.6249, "step": 5441 }, { "epoch": 0.11460280189319953, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5847, "step": 5442 }, { "epoch": 0.11462386084246326, "grad_norm": 0.1591796875, "learning_rate": 0.0002, "loss": 1.6403, "step": 5443 }, { "epoch": 0.11464491979172699, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5911, "step": 5444 }, { "epoch": 0.11466597874099071, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6062, "step": 5445 }, { "epoch": 0.11468703769025444, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5879, "step": 5446 }, { "epoch": 0.11470809663951817, "grad_norm": 0.166015625, "learning_rate": 0.0002, "loss": 1.5824, "step": 5447 }, { "epoch": 0.1147291555887819, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.594, "step": 5448 }, { "epoch": 0.11475021453804563, "grad_norm": 0.1728515625, "learning_rate": 0.0002, "loss": 1.5621, "step": 5449 }, { "epoch": 0.11477127348730935, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.551, "step": 5450 }, { "epoch": 0.11479233243657308, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.6232, "step": 5451 }, { "epoch": 0.11481339138583681, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5959, "step": 5452 }, { "epoch": 0.11483445033510054, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6236, "step": 5453 }, { "epoch": 0.11485550928436426, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5791, "step": 5454 }, { "epoch": 0.11487656823362798, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5871, "step": 5455 }, { "epoch": 0.1148976271828917, "grad_norm": 0.1484375, "learning_rate": 0.0002, "loss": 1.5993, "step": 5456 }, { "epoch": 0.11491868613215543, "grad_norm": 0.1689453125, "learning_rate": 0.0002, "loss": 1.5406, "step": 5457 }, { "epoch": 0.11493974508141916, "grad_norm": 0.1640625, "learning_rate": 0.0002, "loss": 1.5492, "step": 5458 }, { "epoch": 0.11496080403068289, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.5629, "step": 5459 }, { "epoch": 0.11498186297994661, "grad_norm": 0.1494140625, "learning_rate": 0.0002, "loss": 1.6103, "step": 5460 }, { "epoch": 0.11500292192921034, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5556, "step": 5461 }, { "epoch": 0.11502398087847407, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.5877, "step": 5462 }, { "epoch": 0.1150450398277378, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.5568, "step": 5463 }, { "epoch": 0.11506609877700152, "grad_norm": 0.1630859375, "learning_rate": 0.0002, "loss": 1.6126, "step": 5464 }, { "epoch": 0.11508715772626525, "grad_norm": 0.1611328125, "learning_rate": 0.0002, "loss": 1.5748, "step": 5465 }, { "epoch": 0.11510821667552898, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.6023, "step": 5466 }, { "epoch": 0.1151292756247927, "grad_norm": 0.142578125, "learning_rate": 0.0002, "loss": 1.6044, "step": 5467 }, { "epoch": 0.11515033457405643, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.5485, "step": 5468 }, { "epoch": 0.11517139352332015, "grad_norm": 0.1796875, "learning_rate": 0.0002, "loss": 1.5993, "step": 5469 }, { "epoch": 0.11519245247258388, "grad_norm": 0.193359375, "learning_rate": 0.0002, "loss": 1.5913, "step": 5470 }, { "epoch": 0.1152135114218476, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5698, "step": 5471 }, { "epoch": 0.11523457037111133, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5892, "step": 5472 }, { "epoch": 0.11525562932037506, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.592, "step": 5473 }, { "epoch": 0.11527668826963879, "grad_norm": 0.1650390625, "learning_rate": 0.0002, "loss": 1.578, "step": 5474 }, { "epoch": 0.11529774721890251, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5853, "step": 5475 }, { "epoch": 0.11531880616816624, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.6153, "step": 5476 }, { "epoch": 0.11533986511742997, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.56, "step": 5477 }, { "epoch": 0.1153609240666937, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5879, "step": 5478 }, { "epoch": 0.11538198301595742, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.5511, "step": 5479 }, { "epoch": 0.11540304196522115, "grad_norm": 0.1455078125, "learning_rate": 0.0002, "loss": 1.5615, "step": 5480 }, { "epoch": 0.11542410091448488, "grad_norm": 0.16015625, "learning_rate": 0.0002, "loss": 1.5635, "step": 5481 }, { "epoch": 0.11544515986374859, "grad_norm": 0.1552734375, "learning_rate": 0.0002, "loss": 1.5894, "step": 5482 }, { "epoch": 0.11546621881301232, "grad_norm": 0.1513671875, "learning_rate": 0.0002, "loss": 1.5455, "step": 5483 }, { "epoch": 0.11548727776227605, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6208, "step": 5484 }, { "epoch": 0.11550833671153977, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5363, "step": 5485 }, { "epoch": 0.1155293956608035, "grad_norm": 0.150390625, "learning_rate": 0.0002, "loss": 1.6207, "step": 5486 }, { "epoch": 0.11555045461006723, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.6014, "step": 5487 }, { "epoch": 0.11557151355933096, "grad_norm": 0.16796875, "learning_rate": 0.0002, "loss": 1.6124, "step": 5488 }, { "epoch": 0.11559257250859468, "grad_norm": 0.15234375, "learning_rate": 0.0002, "loss": 1.61, "step": 5489 }, { "epoch": 0.11561363145785841, "grad_norm": 0.15625, "learning_rate": 0.0002, "loss": 1.5642, "step": 5490 }, { "epoch": 0.11563469040712214, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5958, "step": 5491 }, { "epoch": 0.11565574935638587, "grad_norm": 0.158203125, "learning_rate": 0.0002, "loss": 1.5623, "step": 5492 }, { "epoch": 0.1156768083056496, "grad_norm": 0.162109375, "learning_rate": 0.0002, "loss": 1.6008, "step": 5493 }, { "epoch": 0.11569786725491332, "grad_norm": 0.154296875, "learning_rate": 0.0002, "loss": 1.5397, "step": 5494 }, { "epoch": 0.11571892620417704, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.592, "step": 5495 }, { "epoch": 0.11573998515344076, "grad_norm": 0.146484375, "learning_rate": 0.0002, "loss": 1.5626, "step": 5496 }, { "epoch": 0.11576104410270449, "grad_norm": 0.1572265625, "learning_rate": 0.0002, "loss": 1.5662, "step": 5497 }, { "epoch": 0.11578210305196822, "grad_norm": 0.1845703125, "learning_rate": 0.0002, "loss": 1.5576, "step": 5498 }, { "epoch": 0.11580316200123195, "grad_norm": 0.1474609375, "learning_rate": 0.0002, "loss": 1.5579, "step": 5499 }, { "epoch": 0.11582422095049567, "grad_norm": 0.14453125, "learning_rate": 0.0002, "loss": 1.5457, "step": 5500 }, { "epoch": 0.11582422095049567, "eval_loss": 2.3854761123657227, "eval_runtime": 1003.3844, "eval_samples_per_second": 61.592, "eval_steps_per_second": 1.925, "step": 5500 } ], "logging_steps": 1.0, "max_steps": 47485, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.7218310856704e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }